In [None]:
# Mount Google Drive at /content/drive; the project paths used later in this notebook live under it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Mount the drive again at the same path, passing force_remount=True
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Import necessary modules
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
import nltk
import spacy
!spacy download en_core_web_sm
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import SyllableTokenizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from spacy.lang.en.stop_words import STOP_WORDS

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## Data Cleaning and Analysis Functions

In [None]:
# Scrape the data
def scrape_data(url):
  """Download an article page and extract its headline and body text.

  Args:
    url: Address of the article page to fetch.

  Returns:
    A tuple ``(article, title_name)``. ``title_name`` is the text of the
    headline ``h1`` tag and ``article`` is the headline followed by the text
    of the article's ``p`` tags. When the page has no matching headline the
    result is ``(None, None)``, so callers can always unpack two values.

  Raises:
    requests.RequestException: If the request fails or times out.
  """
  # A timeout stops a single unresponsive page from hanging the whole run
  html_text = requests.get(url, timeout=30).text
  soup = BeautifulSoup(html_text, 'lxml')

  # The headline is an 'h1' tag carrying one of these classes
  title_tag = soup.find('h1', class_ = ['post-title', 'tdb-title-text'])
  if title_tag is None:
    return None, None

  # Take the text of the whole headline; looping over the tag's children kept
  # only the last child and left the name undefined for an empty headline
  title_name = title_tag.get_text()

  content_divs = soup.find_all('div', class_ = ['post-content', 'tdb-block-inner td-fix-index'])

  # The article body is stored in 'p' tags. 'div' tags without any 'p' tag are
  # skipped; when several qualify, the last one wins.
  paragraphs = []
  for div in content_divs:
    found = div.find_all('p')
    if found:
      paragraphs = found

  # The article is spread over several 'p' tags: keep only their text and
  # join the pieces into a single string
  page = ' '.join(paragraph.text for paragraph in paragraphs)

  # Join the title and the article body together
  article = title_name + " " + page

  return article, title_name

In [None]:
# Function to remove stopwords
def remove_stopwords(some_text):
  """Clean raw article text and return it as a space-separated string of lemmas.

  Hyperlinks, HTML markup and digits are stripped first. The remaining text is
  tokenized and lemmatized, and only purely alphabetic lemmas that are not in
  STOP_WORDS are kept.

  Args:
    some_text: Raw text to clean.

  Returns:
    The surviving lemmas joined by single spaces.
  """
  # Remove hyperlinks
  some_text = re.sub(r'http\S+|www\S+|https\S+', '', some_text)

  # Remove HTML tags
  some_text = BeautifulSoup(some_text, 'html.parser').get_text()

  # Remove numbers from the text. The previous pattern '[\d+]' was a character
  # class, so it also deleted literal '+' characters.
  some_text = re.sub(r'\d+', '', some_text)

  # Tokenize and lemmatize
  lemmatizer = WordNetLemmatizer()
  lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(some_text)]

  # Remove stopwords and non-alphabetic tokens.
  # NOTE(review): the stopword check is case-sensitive, so capitalised
  # stopwords such as "The" survive if STOP_WORDS is all lowercase — confirm
  # whether that is intended.
  a_lemmas = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in STOP_WORDS]

  return ' '.join(a_lemmas)

In [None]:
# Function to calculate positivity score
def calculate_positivity_score(some_text):
  """Count the tokens of ``some_text`` that appear in the positive-word lexicon.

  Args:
    some_text: Text to score.

  Returns:
    The number of tokens found in the lexicon; repeated tokens count each time.
  """
  lexicon_path = '/content/drive/MyDrive/Punch Project/Dictionary/positive-words.txt'
  with open(lexicon_path, 'r', encoding='latin1') as text_file:
    # Skip ';'-prefixed comment lines (the lexicon's header uses them) so that
    # words from the header are not scored as positive words
    lexicon_text = ''.join(line for line in text_file if not line.lstrip().startswith(';'))

  # Drop lexicon words that are also stopwords; a set keeps each lookup cheap
  pos_words = {word for word in word_tokenize(lexicon_text) if word not in STOP_WORDS}

  # Calculate the positivity score: one point per token found in the lexicon
  return sum(1 for word in word_tokenize(some_text) if word in pos_words)

In [None]:
# Function to calculate negativity score
def calculate_negativity_score(some_text):
  """Count the tokens of ``some_text`` that appear in the negative-word lexicon.

  Args:
    some_text: Text to score.

  Returns:
    The number of tokens found in the lexicon; repeated tokens count each time.
  """
  lexicon_path = '/content/drive/MyDrive/Punch Project/Dictionary/negative-words.txt'
  with open(lexicon_path, 'r', encoding='latin1') as text_file:
    # Skip ';'-prefixed comment lines (the lexicon's header uses them) so that
    # words from the header are not scored as negative words
    lexicon_text = ''.join(line for line in text_file if not line.lstrip().startswith(';'))

  # Drop lexicon words that are also stopwords; a set keeps each lookup cheap
  neg_words = {word for word in word_tokenize(lexicon_text) if word not in STOP_WORDS}

  # Calculate the negativity score: one point per token found in the lexicon
  return sum(1 for word in word_tokenize(some_text) if word in neg_words)

In [None]:
# Function to calculate polarity score
def calculate_polarity_score(positive_score, negative_score):
  """Return the balance between positive and negative counts.

  The difference of the two scores is divided by their sum; a small constant
  is added to the sum so that two zero scores do not divide by zero.

  Args:
    positive_score: Number of positive words found.
    negative_score: Number of negative words found.

  Returns:
    The polarity score as a float.
  """
  difference = positive_score - negative_score
  total = (positive_score + negative_score) + 0.000001

  return difference/total

In [None]:
# Function to calculate word count
def calculate_word_count(some_text):
  """Return the number of tokens ``word_tokenize`` produces for ``some_text``."""
  return len(word_tokenize(some_text))

In [None]:
# Function to calculate subjectivity score
def calculate_subjectivity_score(some_text, positive_score, negative_score):
  """Return the share of tokens in ``some_text`` that carry sentiment.

  Args:
    some_text: Text whose token count is the denominator.
    positive_score: Number of positive words found.
    negative_score: Number of negative words found.

  Returns:
    The sum of both scores divided by the token count; a small constant is
    added to the count so that empty text does not divide by zero.
  """
  sentiment_total = positive_score + negative_score
  denominator = calculate_word_count(some_text) + 0.000001

  return sentiment_total/denominator

In [None]:
# Function to calculate average number of words per sentence
def calculate_avg_sentence_len(some_text, article):
  """Return the average number of words per sentence.

  Args:
    some_text: Text whose tokens are counted as the words.
    article: Text that is split into sentences.

  Returns:
    The token count of ``some_text`` divided by the number of sentences in
    ``article``, or 0.0 when ``article`` contains no sentence.
  """
  # Divide the article into a list of sentences
  sentences = sent_tokenize(article)

  # Without any sentence there is nothing to average over
  if not sentences:
    return 0.0

  word_count = calculate_word_count(some_text)

  return word_count/len(sentences)

In [None]:
# Function to calculate complex word count
def calculate_complex_word(some_text):
  """Count the tokens of ``some_text`` that have more than two syllables.

  Syllables are obtained by splitting each token with NLTK's
  ``SyllableTokenizer``.
  """
  syllabifier = SyllableTokenizer()

  no_complex_words = 0
  for token in word_tokenize(some_text):
    # A word counts as complex when it splits into more than 2 syllables
    if len(syllabifier.tokenize(token)) > 2:
      no_complex_words += 1

  return no_complex_words

In [None]:
# Function to calculate percentage of complex words
def calculate_per_complex_words(no_complex_words, no_words):
  """Return the proportion of complex words.

  Args:
    no_complex_words: Number of complex words.
    no_words: Total number of words.

  Returns:
    ``no_complex_words / no_words`` as a fraction (it is not multiplied by
    100), or 0.0 when ``no_words`` is zero.
  """
  # Text with no words has no complex words either; avoid dividing by zero
  if not no_words:
    return 0.0

  return no_complex_words/no_words

In [None]:
# Function to calculate Fog Index
def calculate_fog_index(some_text, avg_sentence_len, no_complex_words):
  """Return the fog index of ``some_text``.

  Args:
    some_text: Text whose token count is used as the total number of words.
    avg_sentence_len: Average number of words per sentence.
    no_complex_words: Number of complex words in the text.

  Returns:
    0.4 times the sum of the average sentence length and the proportion of
    complex words.
  """
  total_words = calculate_word_count(some_text)

  # NOTE(review): the complex-word term is a ratio here, not a 0-100
  # percentage — confirm this matches the fog index definition being followed.
  complex_share = calculate_per_complex_words(no_complex_words, total_words)

  return 0.4 * (avg_sentence_len + complex_share)

In [None]:
# Function to calculate syllable count per word
_SPACY_MODEL = None


def _load_spacy_model():
  """Return the shared spaCy pipeline, loading it on first use only."""
  global _SPACY_MODEL
  if _SPACY_MODEL is None:
    _SPACY_MODEL = spacy.load("en_core_web_sm")
  return _SPACY_MODEL


def calculate_no_syllable_per_word(some_text):
  """Return the average number of syllables per token of ``some_text``.

  Each token is replaced by its spaCy lemma before being split into syllables
  with NLTK's ``SyllableTokenizer``.

  Args:
    some_text: Text to analyse.

  Returns:
    Total syllable count divided by the number of tokens, or 0.0 when the
    text has no tokens.
  """
  # Instantiate our syllable tokenizer
  syllabifier = SyllableTokenizer()

  # Reuse one loaded pipeline; loading the model again on every call is slow
  doc = _load_spacy_model()(some_text)

  # Nothing to average for empty text; avoid dividing by zero
  if len(doc) == 0:
    return 0.0

  # Lemmatize first so that endings such as "es" and "ed" are not counted
  no_syllable = sum(len(syllabifier.tokenize(token.lemma_)) for token in doc)

  return no_syllable/len(doc)

In [None]:
# Function to extract personal pronouns
def calculate_personal_pron(some_text):
  """Count the personal pronouns I, we, my, ours and us in ``some_text``.

  Only whole words are counted. Matching ignores case, except for "us",
  which must be lowercase so that the abbreviation "US" is not counted.

  Args:
    some_text: Text to search.

  Returns:
    The number of pronoun occurrences.
  """
  # \b anchors restrict matches to whole words; without them every "i", "we",
  # "my" or "us" inside a longer word was counted as a pronoun.
  # re.I makes the search case-insensitive while (?-i:us) keeps "us" case-sensitive.
  pronounRegex = re.compile(r'\b(?:I|we|my|ours|(?-i:us))\b', re.I)

  return len(pronounRegex.findall(some_text))

In [None]:
# Function to calculate average word length
def calculate_avg_word_len(some_text, word_count):
  """Return the average number of characters per word.

  Args:
    some_text: Text whose non-whitespace characters are counted.
    word_count: Number of words to divide by.

  Returns:
    Non-whitespace character count divided by ``word_count``, or 0.0 when
    ``word_count`` is zero.
  """
  # No words means no length to average; avoid dividing by zero
  if not word_count:
    return 0.0

  # Raw string so that \s is read as a regex class, not a string escape
  total_char = len(re.sub(r'\s+', "", some_text))

  return total_char/word_count

## Putting it all together

In [None]:
input_file = '/content/drive/MyDrive/Punch Project/The Punch.xlsx'
output_file = '/content/drive/MyDrive/Punch Project/The Punch Output File.xlsx'
output_file_1 = '/content/drive/MyDrive/Punch Project/The Punch Cleaned File.xlsx'

# NOTE(review): input_file is never read; the rows to process come from the
# two existing output workbooks — confirm this is intended.

# The existing workbooks provide both the column layout and the rows to process
df = pd.read_excel(output_file)
df_1 = pd.read_excel(output_file_1)
columns = list(df.columns)
columns_1 = list(df_1.columns)

# Create empty DataFrames for the output
output_df = pd.DataFrame(columns=columns)
output_df_cleaned = pd.DataFrame(columns = columns_1)


# Iterate through each row of the input DataFrame
for index, row in df.iterrows():
  url = row['URL']

  print(url)

  # Scrape data from the URL. The scraper can signal "no article" with a bare
  # None, which cannot be unpacked into two names directly.
  scraped = scrape_data(url)
  article, title_name = scraped if scraped else (None, None)

  if article:
    data = remove_stopwords(article) # Remove stopwords from the article

    # A dataframe containing the URL, title, cleaned data and the tag
    output_df_cleaned.loc[index] = [df_1.loc[index, 'URL'], title_name, data, df_1.loc[index, 'TAGS']]

    positive_score = calculate_positivity_score(data) # Calculate the positive score
    negative_score = calculate_negativity_score(data) # Calculate the negative score
    polarity_score = calculate_polarity_score(positive_score, negative_score) # Calculate the polarity score
    word_count = calculate_word_count(data) # Calculate the word count
    subjectivity_score = calculate_subjectivity_score(data, positive_score, negative_score) # Calculate the subjectivity score
    avg_sentence_len = calculate_avg_sentence_len(data, article) # Calculate the average number of words per sentence
    no_complex_words = calculate_complex_word(data) # Calculate the number of complex words
    per_complex_words = calculate_per_complex_words(no_complex_words, word_count) # Calculate the percentage of complex words
    fog_index = calculate_fog_index(data, avg_sentence_len, no_complex_words) # Calculate the fog index
    syllable_per_word = calculate_no_syllable_per_word(data) # Calculate the syllable per word
    # Count pronouns on the raw article: stopword removal has already stripped
    # words such as "we", "my" and "us" from the cleaned text
    personal_pron_count = calculate_personal_pron(article)
    avg_word_len = calculate_avg_word_len(data, word_count) # Calculate the average word length

    output_df.loc[index] = [row['DATE PUBLISHED'], row['URL'], title_name, row['TAGS'], positive_score, negative_score, polarity_score, subjectivity_score,
                            avg_sentence_len, per_complex_words, fog_index, no_complex_words, word_count,
                            syllable_per_word, personal_pron_count, avg_word_len]
  else:
    # Nothing could be scraped: keep the row but leave all twelve metrics empty
    output_df.loc[index] = [row['DATE PUBLISHED'], row['URL'], title_name, row['TAGS']] + [None] * 12

    # Same four fields as the success branch, with no title and no cleaned text
    output_df_cleaned.loc[index] = [df_1.loc[index, 'URL'], title_name, None, df_1.loc[index, 'TAGS']]

# Save the output DataFrames to Excel files
output_df.to_excel(output_file, index = False)
output_df_cleaned.to_excel(output_file_1, index = False)

https://punchng.com/presidential-inauguration-obi-didnt-call-for-boycott-postponement-lp/
https://punchng.com/lets-reclaim-pdp-lost-glory-atiku-tasks-party-members/
https://punchng.com/tinubull-be-fair-to-all-ex-lawmaker/
https://punchng.com/just-in-atiku-obaseki-attend-pdps-reception-for-new-returning-governors/
https://punchng.com/im-the-best-candidate-for-senate-president-osita-izunaso-insists/
https://punchng.com/lp-faction-wants-tinubu-sworn-in-says-obi-asking-for-impossible/
https://punchng.com/why-fg-approved-contracts-18-days-to-handover-minister/
https://punchng.com/tinubu-celebrates-with-fasoranti-at-97/
https://punchng.com/10th-assembly-wase-betara-other-aspirants-protest-zoning/
https://punchng.com/oyo-pdp-mourns-adagunodo/
https://punchng.com/obi-lp-back-atiku-on-request-for-live-broadcast/
https://punchng.com/akeredolu-knocks-apc-over-nass-leadership-zoning-formula/
https://punchng.com/adagunodo-nigeria-lost-a-patriot-atiku-mourns/
https://punchng.com/wase-betara-other-as



https://punchng.com/adeleke-four-other-governors-court-handed-victories/
https://punchng.com/house-speakership-apc-picked-me-based-on-competence-says-abbas/
https://punchng.com/ekiti-senators-others-behind-akpabio-oyebanji/
https://punchng.com/breaking-app-withdraws-petition-against-tinubu/
https://punchng.com/senate-presidency-north-central-lawmakers-reject-apc-consensus-list/
https://punchng.com/accept-adelekes-victory-oyetola-tells-osun-apc/
https://punchng.com/presidential-poll-tribunal-to-consider-pdp-lps-live-broadcast-request/
https://punchng.com/speaker-five-aggrieved-aspirants-team-up-against-abbas/
https://punchng.com/final-victory-buhari-tinubu-congratulate-adeleke-supporters-jubilate/
https://punchng.com/bvas-will-dash-hope-of-dishonest-politicians-atiku/
https://punchng.com/osun-verdict-buhari-urges-support-for-adeleke-led-govt/
https://punchng.com/abia-labour-party-lawmaker-backs-abass-as-hor-speaker/
https://punchng.com/presidential-poll-tribunal-shouldnt-be-intimidated-



https://punchng.com/obi-visits-soyinka-one-month-after-clash-with-supporters/
https://punchng.com/sylva-sole-financier-of-bayelsa-apc-party-chair/
https://punchng.com/senate-presidency-south-east-senators-reject-zoning-to-south-south-caution-tinubu/




https://punchng.com/national-assembly-leadership-uncertainty-as-aspirants-continue-campaigns-lobby/
https://punchng.com/i-will-no-longer-contest-elections-oni-ekiti-ex-governor/
https://punchng.com/post-election-crises-and-rising-instability-in-political-parties/
https://punchng.com/nigeria-must-prevent-infiltration-of-armed-sudanese-fighters-ex-immigration-acg/
https://punchng.com/pdp-should-concentrate-on-playing-opposition-well-aibom-rep-luke/
https://punchng.com/part-time-legislature-not-ideal-for-nigerias-presidential-system-rep-elect-ogene/
https://punchng.com/adeleke-denies-payment-of-half-salary-demotion-of-workers/
https://punchng.com/tinubus-cabinet-el-rufai-may-decline-chief-of-staff-role-eyes-private-sector/
https://punchng.com/apc-tinubu-favour-akpabio-as-senate-president-abbas-speaker/
https://punchng.com/tinubu-salutes-yaraduas-memory-on-13th-anniversary/
https://punchng.com/umahi-dumps-senate-presidency-ambition-backs-akpabio/
https://punchng.com/court-to-decide-on-righ



https://punchng.com/buhari-years-despite-much-rhetoric-insecurity-triumphs/
https://punchng.com/ninth-nass-an-indolent-weak-rubber-stamp/
https://punchng.com/stop-the-outrageous-governors-pensions/
https://punchng.com/10th-nass-leadership-merit-integrity-must-count/
https://punchng.com/sell-ajaokuta-outright-not-concession/
https://punchng.com/fg-must-recover-n4tn-amcon-debt/
https://punchng.com/baba-tambuwal-where-are-deborahs-killers/
https://punchng.com/ensuring-the-safety-of-foods-and-medicines/
https://punchng.com/tackling-widespread-hunger-in-nigeria/
https://punchng.com/too-many-buildings-are-collapsing/
https://punchng.com/senates-dangerous-dabbling-in-religion/
https://punchng.com/n23tn-loan-approval-caps-nass-fiscal-errors/
https://punchng.com/nigerians-be-warned-avoid-bleaching-creams/
https://punchng.com/the-buhari-years-how-the-anti-corruption-war-floundered/
https://punchng.com/aviation-sector-remains-a-messy-field/
https://punchng.com/swest-govs-drop-the-ball-on-herders-



https://punchng.com/buhari-lays-booby-traps-for-incoming-government/
https://punchng.com/rescheduled-census-should-be-better-planned/
https://punchng.com/psc-ig-should-stamp-out-resurgent-police-atrocities/
https://punchng.com/the-buhari-years-an-era-of-gross-economic-fiasco/
https://punchng.com/election-petitions-justice-please-not-technicalities/
https://punchng.com/rigorous-efforts-needed-to-eliminate-malaria/
https://punchng.com/el-rufais-auspicious-alert-on-bandits-terrorists/
https://punchng.com/national-stadium-mess-shows-fgs-poor-sport-acumen/




https://punchng.com/the-buhari-years-failed-promises-dashed-hopes/
https://punchng.com/adamawa-dramatises-nigerias-rowdy-elections/
https://punchng.com/reps-draconian-bill-on-doctors-dead-on-arrival/
https://punchng.com/soyinkas-reminder-on-restructuring-timely/
https://punchng.com/foreign-investment-inflow-is-not-by-chance/
https://punchng.com/nigeria-should-stop-tolerating-mob-justice/
https://punchng.com/transition-shun-last-minute-contracts-employment/
https://punchng.com/botched-n70tn-case-degrades-anti-corruption-war/




https://punchng.com/nass-halt-further-loan-approvals-for-buhari/
https://punchng.com/states-should-reverse-their-dreadful-economic-condition/
https://punchng.com/2023-census-overcoming-the-pitfalls-hurdles/
https://punchng.com/trumps-prosecution-a-check-on-impunity/
https://punchng.com/human-rights-under-attack-worldwide/
https://punchng.com/stop-the-ethnic-brickbats-now/
https://punchng.com/revisiting-the-faulty-repentant-terrorists-narrative/
https://punchng.com/emefieles-cbn-superintendence-so-damaging/
https://punchng.com/unravel-citizen-chizobas-death-in-ethiopian-prison/
https://punchng.com/fg-states-should-reverse-water-poverty/
https://punchng.com/womens-representation-in-politics-still-too-low/
https://punchng.com/nigeria-africa-should-eradicate-energy-poverty/
https://punchng.com/security-agencies-should-make-elections-safer/
https://punchng.com/federalism-constitution-amendment-bills-fall-short/
https://punchng.com/fg-should-tread-carefully-on-subsidy-policy/
https://punchng



https://punchng.com/2023-polls-stop-intimidation-of-opponents-in-lagos/
https://punchng.com/too-many-nigerians-are-missing/
https://punchng.com/naira-crisis-attacks-on-banks-barbaric-misplaced/
https://punchng.com/too-many-nigerian-women-children-are-dying/
https://punchng.com/justice-for-victims-of-anambra-police-horror-den/
https://punchng.com/agenda-2050-should-not-be-another-pipe-dream/
https://punchng.com/elections-again-buhari-inec-police-disappoint-nigerians/
https://punchng.com/making-nigeria-open-defecation-free/
https://punchng.com/nigerians-be-bold-go-out-and-vote/
https://punchng.com/2023-elections-for-inec-buhari-history-beckons/
https://punchng.com/fgs-social-schemes-have-faltered-review-them/
https://punchng.com/bandits-kill-toddler-seven-others-in-fresh-plateau-attack/
https://punchng.com/pdp-demands-transparency-as-akeredolu-extends-sick-leave/
https://punchng.com/dollar-video-submit-yourself-for-trial-group-tells-ganduje/
https://punchng.com/fayose-will-pay-for-workin



https://punchng.com/dont-tamper-with-nmdpra-seals-ipman-warns-kogi-marketers/
https://punchng.com/nurtw-south-west-zone-ready-to-reconcile-union-leader/
https://punchng.com/naccima-hails-tinubus-emergence-as-ecowas-chair/
https://punchng.com/naat-kicks-against-200-fee-hike-by-federal-universities/
https://punchng.com/community-pharmacists-advocate-ethical-drug-distribution-2/
https://punchng.com/adamu-nwc-members-on-collision-course-over-apc-account-audit/
https://punchng.com/yari-not-invited-for-ignoring-tinubus-phone-call-dss/
https://punchng.com/apc-shifts-caucus-nec-meetings-to-honour-tinubu/
https://punchng.com/presidency-i-worked-against-atiku-says-fayose/
https://punchng.com/court-orders-fg-to-account-for-5bn-recovered-abachas-loot/
https://punchng.com/uae-orders-arrest-of-man-filmed-buying-luxury-cars/
https://punchng.com/six-killed-in-china-kindergarten-attack/
https://punchng.com/psc-demotes-nine-orders-acp-to-refund-entitlements/
https://punchng.com/osinbajo-promoted-good-go



https://punchng.com/christian-group-urges-dss-to-quiz-politicians-making-inciting-comments/
https://punchng.com/anambra-community-bans-youth-activities-collection-of-levies/
https://punchng.com/missing-ex-asuu-president-nigerian-community-activates-contacts-combs-turkey/
https://punchng.com/ex-edo-pdp-gov-aspirant-ikhine-defects-to-apc/
https://punchng.com/yari-released-from-dss-custody/
https://punchng.com/breaking-killings-mutfwang-re-imposes-24-hour-curfew-in-plateau-lg/
https://punchng.com/dss-denies-arresting-yari-for-ignoring-tinubus-call/
https://punchng.com/hoodlums-vandalise-sdp-office-in-kogi/
https://punchng.com/gov-alia-condemns-killings-in-benue-communities/
https://punchng.com/18-bus-passengers-die-in-lagos-accident/
https://punchng.com/dessers-ready-to-replace-morelos/
https://punchng.com/2026-wcup-qualifiers-eagles-may-draw-ghana-safrica-angola/
https://punchng.com/only-psg-can-afford-osimhen-de-laurentiis/
https://punchng.com/afrocan-dtigers-eye-semi-final-spot/
https:



https://punchng.com/awoniyi-dreams-afcon-glory/
https://punchng.com/besiktas-fans-demand-okereke-signing/




https://punchng.com/naija-super-8-akwa-reach-semis-after-rivers-win/
https://punchng.com/bfn-set-for-lagos-classics/
https://punchng.com/ilechukwu-set-for-rangers-challenge/
https://punchng.com/mbaoma-snubs-remo-for-rwandan-club/




https://punchng.com/nigeria-wins-u-21-womens-volleyball-championship/
https://punchng.com/chukwuezes-milan-move-in-jeopardy/
https://punchng.com/rangers-legend-hails-coach-for-signing-dessers/
https://punchng.com/naija-super-8-noble-deny-sporting-win-remo-held/
https://punchng.com/dtigers-beat-mali-62-56-in-afrocan-opener/




https://punchng.com/ardova-handball-correction-boys-shock-safety-shooters/
https://punchng.com/onyema-honours-boycotted-montreal-olympics-athletes-afcon-heroes/
https://punchng.com/procurement-for-odi-erosion-control-begins-soon-says-nddc/
https://punchng.com/psg-keen-on-osimhen-as-mbappe-replacement/




https://punchng.com/dessers-perfect-striker-rohr/
https://punchng.com/napoli-grant-osimhen-extended-holiday/
https://punchng.com/augsburg-stuttgart-track-onuachu/
https://punchng.com/adeleye-relishes-onana-comparision/
https://punchng.com/forest-open-talks-with-iheanacho/
https://punchng.com/naija-super8-sochima-hands-lobi-first-win/
https://punchng.com/daga-confirms-euro-offers-prefers-liverpool-move/
https://punchng.com/speaker-lauds-katsina-united-fc-for-npfl-promotion/
https://punchng.com/england-defeat-spain-to-win-under-21-euro/
https://punchng.com/david-de-gea-announces-exit-from-man-utd/
https://punchng.com/match-bonus-falcons-have-no-plan-to-boycott-world-cup-game-says-onome-ebi/
https://punchng.com/asisat-oshoala-shortlisted-for-2023-ballon-dor-award/
https://punchng.com/account-for-falcons-960k-from-fifa-waldrum-tells-nff/
https://punchng.com/osimhen-eagles-only-world-class-player-alex-bio/
https://punchng.com/joshua-risks-n45bn-against-whyte/
https://punchng.com/balogun-lur



https://punchng.com/whos-moved-and-whos-not/




https://punchng.com/remo-beat-enyimba-in-naija-super-8-opener/
https://punchng.com/ex-man-united-goalie-edwin-van-der-sar-suffers-brain-hemorrhage/
https://punchng.com/uefa-clears-milan-aston-villa-brighton-to-play-in-europe/
https://punchng.com/osimhen-demands-e7m-to-sign-new-napoli-deal/
https://punchng.com/falcons-plan-protest-world-cup-boycott-as-nff-cancels-bonuses/
https://punchng.com/family-announces-burial-rites-of-late-nigerian-boxer-okorodudu/
https://punchng.com/doctor-told-me-ill-die-if-i-played-football-ogunmodede/
https://punchng.com/joshua-whyte-rematch-for-august-12/
https://punchng.com/villarreal-doubt-milan-interest-in-chukwueze/




https://punchng.com/iwobi-set-for-everton-stay/
https://punchng.com/nwaneri-snubs-city-chelsea-for-arsenal-stay/
https://punchng.com/falcons-nest-at-mercure-hotel-in-australia/
https://punchng.com/enyimba-remo-clash-as-naija-super-8-begins/
https://punchng.com/mbaoma-fires-katsina-to-npfl/
https://punchng.com/handball-tojemarine-triumph/
https://punchng.com/quadri-drops-mati-omotayo-move-up-in-ittf-rankings/
https://punchng.com/womens-wcup-21-falcons-begin-camping-in-australia/
https://punchng.com/akpom-missing-as-boro-begin-pre-season/
https://punchng.com/national-trials-amusan-wins-again-itsekiri-100m-king/
https://punchng.com/npfl-lawal-applauds-eunisell-boot-initiative/
https://punchng.com/mbappe-must-sign-new-contract-to-stay-at-psg-club-president/
https://punchng.com/psg-appoint-luis-enrique-as-new-coach/
https://punchng.com/barcelona-sign-martinez-from-bilbao/
https://punchng.com/ancelotti-to-coach-brazil-from-2024-report/
https://punchng.com/brighton-keen-on-bassey-kudus/
https



https://punchng.com/ptad-pays-n754bn-to-pensioners-in-seven-years-says-official/
https://punchng.com/uae-invests-225-1m-in-nigeria-as-visa-ban-persists/
https://punchng.com/customers-worry-over-deposit-safety-despite-bank-fraud-decline/
https://punchng.com/access-bank-completes-acquisition-of-angolans-finibanco/
https://punchng.com/concerns-mount-over-rising-clashes-at-airports/
https://punchng.com/ibedc-to-upgrade-metering-system/
https://punchng.com/leadway-assurance-to-implement-new-reporting-policy/
https://punchng.com/power-sector-gas-debt-hits-1bn-says-nga/
https://punchng.com/57-96-million-bvn-issued-as-cbn-threatens-accounts-closure/
https://punchng.com/court-nullifies-standard-alliances-liquidation/
https://punchng.com/senate-probes-development-bank-over-n483bn-loan/
https://punchng.com/otudeko-writes-fbn-over-ecobanks-petition/
https://punchng.com/more-engineers-will-leave-nigeria-don-warns/
https://punchng.com/new-icsan-president-to-promote-corporate-governance/
https://punc



https://healthwise.punchng.com/bedwetting-may-be-sign-of-depression-in-children-physicians-say/
https://healthwise.punchng.com/stronger-political-will-clear-commitment-needed-in-fight-against-malaria-says-researcher/
https://healthwise.punchng.com/nafdac-partners-pre-shipment-inspection-agents-to-strengthen-food-exports/
https://healthwise.punchng.com/community-pharmacists-advocate-ethical-drug-distribution-to-enhance-medicine-security/
https://healthwise.punchng.com/female-genital-mutilation-grave-violation-of-human-rights-unfpa/
https://healthwise.punchng.com/obese-children-risk-type-2-diabetes-may-suffer-same-complications-as-adults-endocrinologist/
https://healthwise.punchng.com/mercury-exposure-hampers-foetus-children-development-experts/




https://healthwise.punchng.com/insufficient-oncologists-equipment-affecting-cancer-treatment-says-ncs/
https://healthwise.punchng.com/political-will-required-to-immunise-67-million-children-missed-during-covid-19-pandemic-unicef/
https://healthwise.punchng.com/my-perseverance-helped-me-to-overcome-all-odds-physically-challenged-graduate/
https://healthwise.punchng.com/mdcn-wrong-to-downgrade-doctors-certificates-nard-president/




https://healthwise.punchng.com/nurses-lament-bayelsas-refusal-to-pay-new-hazard-allowance/
https://healthwise.punchng.com/deadly-anthrax-disease-outbreak-looms-over-poor-awareness/
https://healthwise.punchng.com/why-nigeria-was-excluded-from-malaria-vaccine-allocation-says-gavi/
https://healthwise.punchng.com/inhaling-nitrous-oxide-to-get-high-may-cause-brain-damage-mental-illnesses-psychiatrists/
https://healthwise.punchng.com/rising-antenatal-care-at-tbas-fuels-child-hiv-cases-gynaecologists/
https://healthwise.punchng.com/why-sweating-excessively-at-night-should-raise-serious-health-concerns-physicians/
https://healthwise.punchng.com/world-not-on-track-to-achieve-universal-access-to-water-sanitation-un-agencies/




https://healthwise.punchng.com/exercising-without-sufficient-sleep-may-cause-cognitive-decline-study/
https://healthwise.punchng.com/call-off-strike-to-save-lives-nasarawa-govt-begs-striking-doctors/
https://healthwise.punchng.com/sanwo-olu-restates-commitment-to-safe-water/
https://healthwise.punchng.com/public-health-expert-tells-fg-to-allocate-70-healthcare-budget-to-phcs/
https://healthwise.punchng.com/working-beyond-40-hours-weekly-may-cause-occupational-hazards-public-health-expert-warns/
https://healthwise.punchng.com/excessive-alcohol-intake-overburdens-liver-reduces-libido-nutritionist-warns/
https://healthwise.punchng.com/nigeria-records-798-diphtheria-cases-80-deaths-in-33-lgas/
https://healthwise.punchng.com/lagos-needs-efficient-regulatory-framework-for-hygiene-promotion-sanwo-olu/
https://healthwise.punchng.com/lessons-from-covid-19-should-stimulate-fg-stakeholders-to-fund-niprd-dg/
https://healthwise.punchng.com/nutrition-experts-task-govt-on-food-fortification-enforceme

### DATA ANALYSIS

In [None]:
# Load the metrics workbook (the same path the processing loop saves to) and preview the first rows
data = pd.read_excel('/content/drive/MyDrive/Punch Project/The Punch Output File.xlsx')
data.head()

Unnamed: 0,DATE PUBLISHED,URL,TITLE NAME,TAGS,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,2023-05-11,https://punchng.com/presidential-inauguration-...,Presidential inauguration: Obi didn’t call fo...,Politics,7,22,-0.517241,0.087613,17.421053,0.371601,7.117062,123,331,2.154079,208,6.151057
1,2023-05-11,https://punchng.com/lets-reclaim-pdp-lost-glor...,"Let’s reclaim PDP lost glory, Atiku tasks par...",Politics,21,14,0.2,0.087719,14.25,0.393484,5.857393,157,399,2.265664,204,6.463659
2,2023-05-11,https://punchng.com/tinubull-be-fair-to-all-ex...,Tinubu’ll be fair to all – Ex-lawmaker,Politics,9,10,-0.052632,0.085202,9.291667,0.430493,3.888864,96,223,2.457399,126,6.443946
3,2023-05-11,https://punchng.com/just-in-atiku-obaseki-atte...,"JUST IN: Atiku, Obaseki attend PDP’s receptio...",Politics,1,0,0.999999,0.01087,23.0,0.369565,9.347826,34,92,2.217391,43,6.445652
4,2023-05-11,https://punchng.com/im-the-best-candidate-for-...,"I’m the best candidate for Senate President, ...",Politics,52,9,0.704918,0.123984,21.391304,0.46748,8.743514,230,492,2.361789,293,6.678862


AVERAGE FOG INDEX

In [None]:
# Mean of the FOG INDEX column across all rows
average_fog_index = data['FOG INDEX'].mean()
print(average_fog_index)

6.325771756892892


AVERAGE SUBJECTIVITY OF THE ARTICLES

In [None]:
# Mean of the SUBJECTIVITY SCORE column across all rows
average_subjectivity = data['SUBJECTIVITY SCORE'].mean()
print(average_subjectivity)

0.08911143353352624


WHICH CATEGORY HAS THE MOST POSITIVE EMOTION?

In [None]:
# Average every numeric metric per tag. numeric_only=True leaves out text
# columns such as URL and TITLE NAME, which cannot be averaged.
tags_df = data.groupby('TAGS').mean(numeric_only=True)

# Tags ordered from the most positive to the most negative average polarity
tags_df.sort_values(by='POLARITY SCORE', ascending= False)['POLARITY SCORE']

  tags_df = data.groupby('TAGS').mean()


TAGS
Sport             0.369845
Business          0.365503
Politics          0.157121
News             -0.094879
General Health   -0.121839
Editorial        -0.363592
Metro Plus       -0.485386
Name: POLARITY SCORE, dtype: float64

WHICH CATEGORY HAS THE LOWEST NUMBER OF WORDS?

In [None]:
# Average word count per tag, smallest first
tags_df.sort_values(by='WORD COUNT')['WORD COUNT']

TAGS
Metro Plus        204.180723
Sport             206.812500
News              209.575000
Business          230.762500
Politics          348.620000
General Health    379.187500
Editorial         666.268293
Name: WORD COUNT, dtype: float64

## REFERENCES
The negative and positive text files were sourced from:
https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon

Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews."
Proceedings of the ACM SIGKDD International Conference on Knowledge
Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle,
Washington, USA.
