# Data Extraction

In [179]:
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import requests
from pathlib import Path

In [180]:
print(f"pandas: {pd.__version__}")
print(f"bs4: {bs4.__version__}")
print(f"requests: {requests.__version__}")

pandas: 1.5.3
bs4: 4.11.2
requests: 2.31.0


In [182]:
# define path
input_file_path = Path("/content/Input.xlsx")
output_file_path = Path("/content/Output Data Structure.xlsx")

# define pandas dataframe
input_df = pd.read_excel(io=input_file_path)
output_df = pd.read_excel(io=output_file_path)

In [183]:
input_df

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...
...,...,...
109,50921.0,https://insights.blackcoffer.com/coronavirus-i...
110,51382.8,https://insights.blackcoffer.com/coronavirus-i...
111,51844.6,https://insights.blackcoffer.com/what-are-the-...
112,52306.4,https://insights.blackcoffer.com/marketing-dri...


In [184]:
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,,,,,,,,,,,,,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,,,,,,,,,,,,,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,,,,,,,,,,,,,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,,,,,,,,,,,,,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,50921.0,https://insights.blackcoffer.com/coronavirus-i...,,,,,,,,,,,,,
110,51382.8,https://insights.blackcoffer.com/coronavirus-i...,,,,,,,,,,,,,
111,51844.6,https://insights.blackcoffer.com/what-are-the-...,,,,,,,,,,,,,
112,52306.4,https://insights.blackcoffer.com/marketing-dri...,,,,,,,,,,,,,


In [185]:
# Create a directory to save the extracted articles
output_dir = Path("extracted_articles")
output_dir.mkdir(parents=True, exist_ok=True)

In [186]:
for index, row in output_df.iterrows():
  url = row['URL']
  url_id = row['URL_ID']
  print(url_id, url)
  break

123.0 https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-3-2/


In [187]:
response = requests.get("https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030-2/")
soup = BeautifulSoup(response.text, 'html.parser')
title = soup.find("h1","tdb-title-text").text
print(title)

article_text = "\n".join([p.text for p in soup.find_all("div",{"class": "tdb-block-inner td-fix-index"})])
print(article_text)

Rise of e-health and its impact on humans by the year 2030



Monday, October 16, 2023
Sign in / Join

Search
HomeWhat We ThinkRise of e-health and its impact on humans by the year 2030
Rise of e-health and its impact on humans by the year 2030
By Ajay Bidyarthy
January 2, 2023
0
4146





2020 was the year the world was ravaged by the SarsCov2 virus. This notorious virus brought about a pandemic that would go on to change the course of humanity.  From that point forth daily lives of everyone across the world changed. With widespread stringent lockdowns, the entire world came to a sharp halt. Not only was the general populace affected, but the pandemic also affected all industries. The pandemic did not even spare critical industries, like healthcare and security. While these industries were required to function for the benefit of society, their daily operations changed drastically. But just as human nature prevails, we rose from this adversity. Post pandemic era saw the rise of new tec

In [191]:
# create a file_name column, will help to populate the output_df
output_df['file_name'] = output_df['URL_ID'].apply(lambda x: str(float(x)) + '.txt')
output_df[['URL_ID','file_name']]

Unnamed: 0,URL_ID,file_name
0,123.0,123.0.txt
1,321.0,321.0.txt
2,2345.0,2345.0.txt
3,4321.0,4321.0.txt
4,432.0,432.0.txt
...,...,...
109,50921.0,50921.0.txt
110,51382.8,51382.8.txt
111,51844.6,51844.6.txt
112,52306.4,52306.4.txt


In [192]:
def extract_and_save_article(df:pd.DataFrame, title_tag:str, text_tag:str, output_dir: Path):
  processed_url_ids = []
  # Iterate through each row in the input dataframe
  for index, row in df.iterrows():
      # Get the URL and URL_ID from the dataframe
      url = row['URL']
      url_id = row['URL_ID']

      # Check if the file with the same URL_ID already exists
      file_name = f"{url_id}.txt"
      output_file = output_dir / file_name
      if output_file.exists():
          print(f"Skipped as the file: '{file_name}' already exists at {output_file}")
          continue

      # Send an HTTP GET request to fetch the webpage content
      response = requests.get(url)

      if response.status_code == 200:
          # Parse the HTML content using BeautifulSoup
          soup = BeautifulSoup(response.text, 'html.parser')

          # Extract the article title and text
          title_element = soup.find('h1', title_tag)
          if title_element:
            title = title_element.text
          else:
            title = None
            print("Title not found, hence skipped", url_id, url)
            continue

          text_elements = soup.find_all(class_= text_tag)
          if text_elements:
            article_text = "\n".join([p.text for p in text_elements])
          else:
            article_text = None
            print("Text not found, hence skipped", url_id, url)

          if (title is not None) and (article_text is not None):
            # Save the extracted article to a text file with URL_ID as its name
            with output_file.open('w', encoding='utf-8') as file:
                file.write(title + "\n\n" + article_text)
                # To track which url_ids have been used to create the file
                processed_url_ids.append(url)


            print(f"Article from URL_ID {url_id} extracted and saved to {output_file}")
      else:
        # Save the extracted article to a text file with URL_ID as its name
        with output_file.open('w', encoding='utf-8') as file:
            file.write("NA" + "\n\n" + "NA")
            # To track which url_ids have been used to create the file
            processed_url_ids.append(url)
        print(f"Failed to fetch the URL for URL_ID {url_id}")

  print("Extraction completed.")
  return processed_url_ids

In [None]:
processed_url_ids1 = extract_and_save_article(
    df=input_df,
    title_tag="entry-title",
    text_tag="td-post-content tagdiv-type",
    output_dir = output_dir
    )

Article from URL_ID 123.0 extracted and saved to extracted_articles/123.0.txt
Article from URL_ID 321.0 extracted and saved to extracted_articles/321.0.txt
Title not found, hence skipped 2345.0 https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030-2/
Article from URL_ID 4321.0 extracted and saved to extracted_articles/4321.0.txt
Article from URL_ID 432.0 extracted and saved to extracted_articles/432.0.txt
Article from URL_ID 2893.8 extracted and saved to extracted_articles/2893.8.txt
Article from URL_ID 3355.6 extracted and saved to extracted_articles/3355.6.txt
Article from URL_ID 3817.4 extracted and saved to extracted_articles/3817.4.txt
Title not found, hence skipped 4279.2 https://insights.blackcoffer.com/how-advertisement-increase-your-market-value/
Article from URL_ID 4741.0 extracted and saved to extracted_articles/4741.0.txt
Article from URL_ID 5202.8 extracted and saved to extracted_articles/5202.8.txt
Article from URL_ID 5664.6 extracted

**NOTE:** The message "Title or Text not found", is most probably due to the changes in the structure of the HTML Code. So, in that case we have to manually visit those site and get the new tags.

In [None]:
processed_url_ids2 = extract_and_save_article(
    df=input_df,
    title_tag="tdb-title-text",
    text_tag="tdb-block-inner td-fix-index",
    output_dir = output_dir
    )

Skipped as the file: '123.0.txt' already exists at extracted_articles/123.0.txt
Skipped as the file: '321.0.txt' already exists at extracted_articles/321.0.txt
Article from URL_ID 2345.0 extracted and saved to extracted_articles/2345.0.txt
Skipped as the file: '4321.0.txt' already exists at extracted_articles/4321.0.txt
Skipped as the file: '432.0.txt' already exists at extracted_articles/432.0.txt
Skipped as the file: '2893.8.txt' already exists at extracted_articles/2893.8.txt
Skipped as the file: '3355.6.txt' already exists at extracted_articles/3355.6.txt
Skipped as the file: '3817.4.txt' already exists at extracted_articles/3817.4.txt
Article from URL_ID 4279.2 extracted and saved to extracted_articles/4279.2.txt
Skipped as the file: '4741.0.txt' already exists at extracted_articles/4741.0.txt
Skipped as the file: '5202.8.txt' already exists at extracted_articles/5202.8.txt
Skipped as the file: '5664.6.txt' already exists at extracted_articles/5664.6.txt
Skipped as the file: '6126

In [214]:
extracted_articles = list(output_dir.glob("*"))
print(len(extracted_articles))

114


In [215]:
extracted_articles[:5]

[PosixPath('extracted_articles/40761.4.txt'),
 PosixPath('extracted_articles/43994.0.txt'),
 PosixPath('extracted_articles/21827.6.txt'),
 PosixPath('extracted_articles/35681.6.txt'),
 PosixPath('extracted_articles/23213.0.txt')]

In [211]:
output_df['URL_ID'].nunique()

114

**NOTE:** Although we have 114 url_ids yet we are getting 112, so 2 url_ids have failed to fetch, let's look at them.

In [212]:
net_processed_url_ids = set(processed_url_ids1 + processed_url_ids2)
len(net_processed_url_ids)

114

In [213]:
for index, row in output_df.iterrows():
  url = row["URL"]
  if url not in net_processed_url_ids:
    print(url)

**NOTE:** After manually checking those file it's evident the page doesn't exist, so I will impute them with "NA".

In [216]:
for article in extracted_articles:
  with open(file = article, mode = 'r') as f:
      # Read the entire file content
      article = f.read()

      # Do something with the file content
      print(article)
      break

How COVID-19 is impacting payment preferences?




Sunday, October 15, 2023
Sign in / Join

Search
HomeData ScienceBig DataHow COVID-19 is impacting payment preferences?
How COVID-19 is impacting payment preferences?
By Ajay Bidyarthy
June 22, 2020
0
11943





I would rather pay cash – Before COVID-19.
I would rather make online payment – After lockdown.
During this lockdown, one can observe a number of small positive changes in our surroundings. One such positive change is using online mode of payment even if they are small in amount as it counts as a big step towards DIGITAL INDIA and self-development as well.
According to Economic Times, 42% of Indians say that they have started using online mode of payment. Some small tasks like mobile phone recharge, bill payments, buying groceries, etc., are some essential tasks that cannot be ignored, and making an online payment is way too convenient for them. Also, multiple schemes have been initiated by the government to promote online payme

# Text Analysis

In [217]:
master_dictionary_path = Path("/content/MasterDictionary")
stopwords_path = Path("/content/StopWords")

master_dictionary_files = list(master_dictionary_path.glob("*"))
stopwords_files = list(stopwords_path.glob("*"))

In [218]:
print(f"{len(master_dictionary_files)}: {master_dictionary_files}")
print(f"{len(stopwords_files)}, {stopwords_files}")

2: [PosixPath('/content/MasterDictionary/negative-words.txt'), PosixPath('/content/MasterDictionary/positive-words.txt')]
7, [PosixPath('/content/StopWords/StopWords_GenericLong.txt'), PosixPath('/content/StopWords/StopWords_Auditor.txt'), PosixPath('/content/StopWords/StopWords_Names.txt'), PosixPath('/content/StopWords/StopWords_Geographic.txt'), PosixPath('/content/StopWords/StopWords_Generic.txt'), PosixPath('/content/StopWords/StopWords_Currencies.txt'), PosixPath('/content/StopWords/StopWords_DatesandNumbers.txt')]


## Load Positive and Negative Words

In [219]:
positive_words_file = master_dictionary_files[1]
negative_words_file = master_dictionary_files[0]

In [220]:
print(f"positive_words_file: {positive_words_file}")
print(f"negative_words_file: {negative_words_file}")

positive_words_file: /content/MasterDictionary/positive-words.txt
negative_words_file: /content/MasterDictionary/negative-words.txt


In [221]:
# Load the custom stop words files
custom_stopwords = set()
for file in stopwords_files:
    with open(file = file, mode = "r", encoding = 'utf-8', errors = 'ignore') as f:
        custom_stop_words.update(f.read().splitlines())

custom_stopwords = set( word.lower() for word in custom_stopwords)

In [222]:
print(f"{len(custom_stopwords)}: {custom_stopwords}")

0: set()


In [223]:
# Initialize sets for positive and negative words
positive_words = set()
negative_words = set()

# Load positive words
with open(positive_words_file, "r") as f:
  positive_words.update(f.read().splitlines())

# Load negative words
with open(negative_words_file, "r", encoding='utf-8', errors='ignore') as f:
  negative_words.update(f.read().splitlines())

positive_words = set(word.lower() for word in positive_words)
negative_words = set(word.lower() for word in negative_words)

In [224]:
print(f"positive_words: {len(positive_words)} {positive_words}")
print(f"negative_words: {len(negative_words)} {negative_words}")

positive_words: 2006 {'solicitously', 'inviolate', 'flashy', 'luminous', 'flexibility', 'permissible', 'succes', 'abundant', 'credence', 'mercifully', 'realizable', 'blockbuster', 'convenience', 'unwavering', 'revive', 'patient', 'creative', 'unrivaled', 'idol', 'prominent', 'simplest', 'improvement', 'excitedly', 'futurestic', 'effusive', 'perfection', 'diligence', 'impartial', 'easiest', 'compatible', 'stylish', 'best-performing', 'beckon', 'dynamic', 'rapture', 'resounding', 'acclaim', 'comprehensive', 'exceptional', 'encouragement', 'renown', 'brainy', 'ambitious', 'innovation', 'solicitous', 'sustainable', 'clean', 'unbound', 'proactive', 'unequivocally', 'spotless', 'vouch', 'beautify', 'good', 'beneficiary', 'boundless', 'preeminent', 'envious', 'trusting', 'enjoyable', 'rejuvenated', 'reasonably', 'guiltless', 'responsibly', 'amiability', 'maturity', 'promise', 'spontaneous', 'cleaner', 'improved', 'overtaken', 'peerless', 'rockstar', 'feature-rich', 'fearless', 'magnanimous', 

## Perform Sentiment Analysis

In [225]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [229]:
# Tokenize the text
tokens = word_tokenize(article)
tokens = [token.lower() for token in tokens]

In [227]:
print(f"{len(tokens)}: {tokens}")

756: ['how', 'covid-19', 'is', 'impacting', 'payment', 'preferences', '?', 'sunday', ',', 'october', '15', ',', '2023', 'sign', 'in', '/', 'join', 'search', 'homedata', 'sciencebig', 'datahow', 'covid-19', 'is', 'impacting', 'payment', 'preferences', '?', 'how', 'covid-19', 'is', 'impacting', 'payment', 'preferences', '?', 'by', 'ajay', 'bidyarthy', 'june', '22', ',', '2020', '0', '11943', 'i', 'would', 'rather', 'pay', 'cash', '–', 'before', 'covid-19', '.', 'i', 'would', 'rather', 'make', 'online', 'payment', '–', 'after', 'lockdown', '.', 'during', 'this', 'lockdown', ',', 'one', 'can', 'observe', 'a', 'number', 'of', 'small', 'positive', 'changes', 'in', 'our', 'surroundings', '.', 'one', 'such', 'positive', 'change', 'is', 'using', 'online', 'mode', 'of', 'payment', 'even', 'if', 'they', 'are', 'small', 'in', 'amount', 'as', 'it', 'counts', 'as', 'a', 'big', 'step', 'towards', 'digital', 'india', 'and', 'self-development', 'as', 'well', '.', 'according', 'to', 'economic', 'times',

In [228]:
english_stop_words = set(stopwords.words('english'))
english_stop_words = set(word.lower() for word in english_stop_words)

In [None]:
print(f"{len(english_stop_words)}: {english_stop_words}")

179: {'for', 'am', 'doesn', "needn't", 'can', 'by', 'there', "aren't", 'some', 'isn', 'because', 'aren', 'is', "haven't", "couldn't", 'any', 'doing', 'more', "you've", 'her', 'further', "that'll", 'him', 'such', "should've", "hasn't", 'those', 'at', "you're", 'nor', 'too', 'be', 'did', 'then', 'hasn', 'has', 'during', 'we', 't', 'was', 'being', 'so', 'won', 's', 'do', 'i', 'out', "weren't", "wouldn't", 'after', 'you', 'once', 'yourselves', 'why', "didn't", 'that', 'when', 'not', 'above', 're', 'haven', 'weren', 'theirs', 'itself', 'over', 'herself', 'now', 'hadn', 'shouldn', 'as', 'in', 'them', "hadn't", 'your', 'an', 'all', 'few', 'these', 'y', 'a', 'down', 'below', 'have', 'been', 'hers', 'how', 'this', 'between', 'shan', 'against', 'o', 'until', 'both', 'into', 'she', 'ours', 'where', 'don', "doesn't", 'off', 'with', 'on', "she's", 've', 'ain', 'should', 'ourselves', 'just', "shan't", 'through', 'who', "won't", 'their', 'm', 'very', 'does', "mustn't", 'me', 'here', 'its', 'mightn', 

In [230]:
import string

# Define a set of punctuation characters to remove
punctuation = set(string.punctuation)
punctuation = set(word.lower() for word in punctuation)

In [231]:
print(f"{len(punctuation)}: {punctuation}")

32: {'~', '&', '@', '>', '?', '\\', ':', '!', '<', '|', ')', '^', '_', '/', '$', ']', ',', '=', '+', '(', '{', '`', '#', '"', '-', "'", '}', '.', '*', '[', '%', ';'}


In [232]:
# Clean the tokens by removing stop words
filtered_tokens = list()
for word in tokens:
  if (word not in english_stop_words) and (word not in custom_stopwords) and (word not in punctuation):
    filtered_tokens.append(word)

In [233]:
print(f"{len(filtered_tokens)}: {filtered_tokens}")

408: ['covid-19', 'impacting', 'payment', 'preferences', 'sunday', 'october', '15', '2023', 'sign', 'join', 'search', 'homedata', 'sciencebig', 'datahow', 'covid-19', 'impacting', 'payment', 'preferences', 'covid-19', 'impacting', 'payment', 'preferences', 'ajay', 'bidyarthy', 'june', '22', '2020', '0', '11943', 'would', 'rather', 'pay', 'cash', '–', 'covid-19', 'would', 'rather', 'make', 'online', 'payment', '–', 'lockdown', 'lockdown', 'one', 'observe', 'number', 'small', 'positive', 'changes', 'surroundings', 'one', 'positive', 'change', 'using', 'online', 'mode', 'payment', 'even', 'small', 'amount', 'counts', 'big', 'step', 'towards', 'digital', 'india', 'self-development', 'well', 'according', 'economic', 'times', '42', 'indians', 'say', 'started', 'using', 'online', 'mode', 'payment', 'small', 'tasks', 'like', 'mobile', 'phone', 'recharge', 'bill', 'payments', 'buying', 'groceries', 'etc.', 'essential', 'tasks', 'ignored', 'making', 'online', 'payment', 'way', 'convenient', 'als

### Positive and Negative Score

In [235]:
### Positive and Negative Score
# Initialize sentiment scores
positive_score = 0
negative_score = 0

# Calculate the positive and negative scores
for word in filtered_tokens:
    if word in positive_words:
        positive_score += 1
    if word in negative_words:
        negative_score += 1

In [234]:
print(f"positive_score: {positive_score}")
print(f"negative_score: {negative_score}")

positive_score: 12
negative_score: 3


### Sentiment, Polarity and Subjectivity score

In [238]:
# Sentiment, Polarity and Subjectivity score
polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
subjectivity_score = (positive_score + negative_score) / (len(filtered_tokens) + 0.000001)

# Determine the sentiment
if polarity_score > 0:
    sentiment = "Positive"
elif polarity_score < 0:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

In [239]:
# Print the results
print("Sentiment: ", sentiment)
print("Polarity Score: ", round(polarity_score,4))
print("Subjectivity Score: ", round(subjectivity_score,4))

Sentiment:  Positive
Polarity Score:  0.8
Subjectivity Score:  0.0735


## Analysis of Readability

### Average Sentence Length

In [247]:
### Average Sentence Length
sentences = nltk.sent_tokenize(article)
words = filtered_tokens
average_sentence_length = len(words) / len(sentences)

In [248]:
print(f"average_sentence_length: {average_sentence_length}")

average_sentence_length: 12.0


In [243]:
sentences

### Percentage of Complex Words

In [249]:
### Percentage of Complex Words
from nltk.corpus import cmudict

# Download the CMU Pronouncing Dictionary
nltk.download('cmudict')
d = cmudict.dict()

def syllable_count(word):
    if word.lower() in d:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]])
    else:
        # Handle words not found in the CMU Pronouncing Dictionary
        return 0

# Calculate percentage of complex words
complex_word_count = sum(1 for word in words if syllable_count(word) > 2)
percentage_complex_words = (complex_word_count / len(words)) * 100

[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [250]:
print(f"percentage_complex_words: {percentage_complex_words}")

percentage_complex_words: 20.34313725490196


### Fog Index

In [251]:
### Fog Index
fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

In [252]:
print(f"fog_index: {fog_index}")

fog_index: 12.937254901960785


## Average Number of Words Per Sentence

In [None]:
### Average Number of Words Per Sentence
average_words_per_sentence = len(words) / len(sentences)

In [None]:
print(f"average_words_per_sentence: {average_words_per_sentence}")

average_words_per_sentence: 22.235294117647058


## Complex Word Count

In [None]:
### Complex Word Count
complex_word_count = sum(1 for word in words if syllable_count(word) > 2)

In [None]:
print(f"complex_word_count: {complex_word_count}")

complex_word_count: 83


## Total Word Count

In [None]:
print(f"{len(words)}, {words}")

756, ['How', 'COVID-19', 'is', 'impacting', 'payment', 'preferences', '?', 'Sunday', ',', 'October', '15', ',', '2023', 'Sign', 'in', '/', 'Join', 'Search', 'HomeData', 'ScienceBig', 'DataHow', 'COVID-19', 'is', 'impacting', 'payment', 'preferences', '?', 'How', 'COVID-19', 'is', 'impacting', 'payment', 'preferences', '?', 'By', 'Ajay', 'Bidyarthy', 'June', '22', ',', '2020', '0', '11943', 'I', 'would', 'rather', 'pay', 'cash', '–', 'Before', 'COVID-19', '.', 'I', 'would', 'rather', 'make', 'online', 'payment', '–', 'After', 'lockdown', '.', 'During', 'this', 'lockdown', ',', 'one', 'can', 'observe', 'a', 'number', 'of', 'small', 'positive', 'changes', 'in', 'our', 'surroundings', '.', 'One', 'such', 'positive', 'change', 'is', 'using', 'online', 'mode', 'of', 'payment', 'even', 'if', 'they', 'are', 'small', 'in', 'amount', 'as', 'it', 'counts', 'as', 'a', 'big', 'step', 'towards', 'DIGITAL', 'INDIA', 'and', 'self-development', 'as', 'well', '.', 'According', 'to', 'Economic', 'Times',

In [256]:
### Total Word Count
total_word_count = len([
    word for word in words if word not in custom_stop_words and word not in english_stop_words and not all(char in string.punctuation for char in word)
    ])

In [257]:
print(f"total_word_count: {total_word_count}")

total_word_count: 314


## Syllable Count Per Word

We count the number of Syllables in each word of the text by counting the vowels present in each word. We also handle some exceptions like words ending with `"es","ed"` by not counting them as a syllable.

In [261]:
### Syllable Count Per Word
def count_syllables(word):
  exceptions = ["es", "ed"]
  for ending in exceptions:
    if word.endswith(ending):
      word = word[:-len(ending)]

  vowels = "aeiouAEIOU"
  syllable_count = sum(1 for char in word if char in vowels)
  return max(1, syllable_count)

syllables_per_word = [count_syllables(word) for word in words]
average_syllables_per_word = sum(syllables_per_word) / len(words)

In [262]:
print(f"{len(syllables_per_word)}, {syllables_per_word}")
print("Average Syllables Per Word:", average_syllables_per_word)

408, [2, 3, 2, 3, 2, 3, 1, 1, 1, 2, 2, 4, 4, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 3, 2, 1, 4, 1, 4, 2, 4, 2, 2, 3, 2, 2, 2, 1, 3, 2, 1, 1, 2, 3, 3, 5, 1, 3, 4, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 1, 2, 3, 2, 3, 1, 2, 2, 3, 1, 4, 1, 2, 2, 3, 2, 1, 4, 2, 3, 1, 4, 3, 3, 3, 2, 2, 1, 2, 3, 2, 1, 3, 3, 3, 2, 1, 2, 3, 2, 2, 1, 1, 2, 1, 1, 3, 3, 1, 4, 2, 3, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 2, 2, 1, 3, 1, 2, 3, 4, 1, 1, 1, 1, 1, 2, 2, 3, 1, 1, 3, 2, 2, 1, 2, 1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 3, 2, 3, 4, 2, 3, 2, 1, 3, 1, 2, 3, 2, 3, 1, 4, 1, 2, 2, 2, 4, 2, 4, 2, 3, 3, 1, 4, 2, 2, 1, 3, 2, 2, 2, 4, 2, 2, 2, 2, 1, 1, 3, 4, 2, 4, 2, 1, 3, 1, 1, 4, 2, 2, 2, 4, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 3, 3, 3, 2, 1, 1, 2, 3, 2, 3, 1, 4, 1, 2, 3, 2, 1, 2, 3, 2, 4, 2, 3, 2, 3, 4, 1, 5, 2, 3, 2, 1, 3, 3, 2, 2, 1, 2, 2, 4, 2, 3, 1, 3, 2, 1, 5, 2, 6, 4, 2, 3, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 1, 4, 3, 2, 2, 1, 1, 3, 2

## Personal Pronouns

To calculate Personal Pronouns mentioned in the text, we use regex to find the counts of the words - `“I,” “we,” “my,” “ours,” and “us”`. Special care is taken so that the country name `US` is not included in the list.

In [None]:
import re
pattern = r'\b(?!(US\b))([Ii]|[Ww]e|[Mm]y|[Oo]urs|[Uu]s)\b'

In [263]:
### Calculate personal pronoun count
personal_pronouns = re.findall(
    pattern = pattern,
    string = article
    )
personal_pronoun_count = len(personal_pronouns)

In [264]:
print(f"personal_pronouns length: {personal_pronoun_count}")

personal_pronouns length: 4


## Average Word Length

Average Word Length is calculated by the formula:
```python
Sum of the total number of characters in each word/Total number of words
```

In [None]:
print([len(word) for word in words])

[3, 8, 2, 9, 7, 11, 1, 6, 1, 7, 2, 1, 4, 4, 2, 1, 4, 6, 8, 10, 7, 8, 2, 9, 7, 11, 1, 3, 8, 2, 9, 7, 11, 1, 2, 4, 9, 4, 2, 1, 4, 1, 5, 1, 5, 6, 3, 4, 1, 6, 8, 1, 1, 5, 6, 4, 6, 7, 1, 5, 8, 1, 6, 4, 8, 1, 3, 3, 7, 1, 6, 2, 5, 8, 7, 2, 3, 12, 1, 3, 4, 8, 6, 2, 5, 6, 4, 2, 7, 4, 2, 4, 3, 5, 2, 6, 2, 2, 6, 2, 1, 3, 4, 7, 7, 5, 3, 16, 2, 4, 1, 9, 2, 8, 5, 1, 2, 1, 2, 7, 3, 4, 4, 4, 7, 5, 6, 4, 2, 7, 1, 4, 5, 5, 4, 6, 5, 8, 1, 4, 8, 1, 6, 9, 1, 4, 1, 3, 4, 9, 5, 4, 3, 3, 2, 7, 1, 3, 6, 2, 6, 7, 2, 3, 3, 10, 3, 4, 1, 4, 1, 8, 7, 4, 4, 9, 2, 3, 10, 2, 7, 6, 8, 4, 5, 6, 6, 3, 4, 4, 7, 6, 1, 3, 7, 2, 3, 4, 3, 2, 7, 1, 5, 6, 6, 1, 5, 6, 2, 2, 4, 2, 2, 5, 2, 6, 5, 9, 3, 1, 6, 2, 3, 11, 3, 3, 3, 6, 4, 2, 8, 4, 3, 6, 6, 5, 2, 1, 4, 1, 2, 6, 3, 2, 5, 1, 1, 3, 5, 2, 6, 12, 4, 2, 8, 2, 4, 1, 1, 4, 4, 7, 6, 3, 9, 3, 3, 7, 15, 6, 1, 2, 5, 1, 3, 2, 5, 4, 2, 8, 6, 1, 3, 10, 4, 4, 2, 3, 9, 3, 3, 6, 5, 2, 4, 3, 2, 7, 1, 6, 2, 1, 6, 4, 2, 1, 8, 1, 1, 6, 4, 2, 1, 5, 1, 1, 5, 5, 2, 1, 3, 4, 3, 7, 3, 4, 3, 6, 4, 

In [265]:
### Average Word Length
total_characters = sum(len(word) for word in words)
average_word_length = total_characters / len(words)

In [266]:
print("average_word_length:", average_word_length)

average_word_length: 6.127450980392157


In [267]:
# Print the results
print("Average Sentence Length:", average_sentence_length)
print("Percentage of Complex Words:", percentage_complex_words)
print("Fog Index:", fog_index)
print("Average Words Per Sentence:", average_words_per_sentence)
print("Personal Pronoun Count:", count_personal_pronouns)
print("Average Word Length:", average_word_length)

Average Sentence Length: 12.0
Percentage of Complex Words: 20.34313725490196
Fog Index: 12.937254901960785
Average Words Per Sentence: 22.235294117647058
Personal Pronoun Count: 4
Average Word Length: 6.127450980392157


# Whole Text Analysis Code in One Place

In [None]:
# Create the result_dict and populate it with the calculated values
result_dict = {}

# Calculate the values
# ...

# Populate the result_dict with the calculated values
result_dict["POSITIVE SCORE"] = positive_score
result_dict["NEGATIVE SCORE"] = negative_score
result_dict["POLARITY SCORE"] = polarity_score
result_dict["SUBJECTIVITY SCORE"] = subjectivity_score
result_dict["AVG SENTENCE LENGTH"] = average_sentence_length
result_dict["PERCENTAGE OF COMPLEX WORDS"] = percentage_complex_words
result_dict["FOG INDEX"] = fog_index
result_dict["AVG NUMBER OF WORDS PER SENTENCE"] = average_words_per_sentence
result_dict["COMPLEX WORD COUNT"] = complex_word_count
result_dict["WORD COUNT"] = total_word_count
result_dict["SYLLABLE PER WORD"] = average_syllables_per_word
result_dict["PERSONAL PRONOUNS"] = personal_pronoun_count
result_dict["AVG WORD LENGTH"] = average_word_length

# Print the result_dict
for key, value in result_dict.items():
    print(f"{key}: {value}")

In [272]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import cmudict

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('cmudict')
cumdict = cmudict.dict()

articles = extracted_files

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [319]:
for article_file in articles:
  with open(file = article_file, mode = 'r') as f:
    article = f.read()
    result_dict = {}

    url_id = str(article_file).split("/")[-1]

    # Tokenize the text
    tokens = word_tokenize(article)
    tokens = [token.lower() for token in tokens]

    english_stop_words = set(stopwords.words('english'))
    english_stop_words = set(word.lower() for word in english_stop_words)

    # Define a set of punctuation characters to remove
    punctuation = set(string.punctuation)
    punctuation = set(word.lower() for word in punctuation)

    # Clean the tokens by removing stop words
    filtered_tokens = list()
    for word in tokens:
        if (word not in english_stop_words) and (word not in custom_stopwords) and (word not in punctuation):
            filtered_tokens.append(word)

    # Positive and Negative Score
    positive_score = 0
    negative_score = 0

    # Calculate the positive and negative scores
    for word in filtered_tokens:
        if word in positive_words:
            positive_score += 1
        if word in negative_words:
            negative_score += 1
    output_df.loc[url_id,"POSITIVE SCORE"] = positive_score
    output_df.loc[url_id,"NEGATIVE SCORE"] = negative_score

    # Sentiment, Polarity and Subjectivity score
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(filtered_tokens) + 0.000001)

    output_df.loc[url_id,"POLARITY SCORE"] = polarity_score
    output_df.loc[url_id,"SUBJECTIVITY SCORE"] = subjectivity_score

    # Determine the sentiment
    if polarity_score > 0:
        sentiment = "Positive"
    elif polarity_score < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    ### Average Sentence Length
    sentences = nltk.sent_tokenize(article)
    words = filtered_tokens
    average_sentence_length = len(words) / len(sentences)

    output_df.loc[url_id,"AVG SENTENCE LENGTH"] = average_sentence_length

    ### Percentage of Complex Words
    def syllable_count(word):
        if word.lower() in d:
            return max([len(list(y for y in x if y[-1].isdigit())) for x in cumdict[word.lower()]])
        else:
            return 0

    words = filtered_tokens
    complex_word_count = sum(1 for word in words if syllable_count(word) > 2)
    percentage_complex_words = (complex_word_count / len(words)) * 100

    output_df.loc[url_id,"PERCENTAGE OF COMPLEX WORDS"] = percentage_complex_words

    # Fog Index
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    output_df.loc[url_id,"FOG INDEX"] = fog_index

    # Average Number of Words Per Sentence
    average_words_per_sentence = len(words) / len(sentences)

    output_df.loc[url_id,"AVG NUMBER OF WORDS PER SENTENCE"] = average_words_per_sentence

    # Complex Word Count
    complex_word_count = sum(1 for word in words if syllable_count(word) > 2)

    output_df.loc[url_id,"COMPLEX WORD COUNT"] = complex_word_count

    # Total Word Count
    total_word_count = len([
        word for word in words if word not in custom_stopwords and word not in english_stop_words and not all(char in string.punctuation for char in word)
        ])

    output_df.loc[url_id,"WORD COUNT"] = total_word_count

    # Syllable Count Per Word
    def count_syllables(word):
        exceptions = ["es", "ed"]
        for ending in exceptions:
            if word.endswith(ending):
                word = word[:-len(ending)]

        vowels = "aeiouAEIOU"
        syllable_count = sum(1 for char in word if char in vowels)
        return max(1, syllable_count)

    syllables_per_word = [count_syllables(word) for word in words]
    average_syllables_per_word = sum(syllables_per_word) / len(words)

    output_df.loc[url_id,"SYLLABLE PER WORD"] = average_syllables_per_word

    # Calculate personal pronoun count
    personal_pronouns = re.findall(
        pattern = pattern,
        string = article
        )
    personal_pronoun_count = len(personal_pronouns)

    output_df.loc[url_id,"PERSONAL PRONOUNS"] = personal_pronoun_count

    ### Average Word Length
    total_characters = sum(len(word) for word in words)
    average_word_length = total_characters / len(words)

    output_df.loc[url_id,"AVG WORD LENGTH"] = average_word_length


In [320]:
output_df

Unnamed: 0_level_0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH,file_name
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
123.0.txt,123.0,https://insights.blackcoffer.com/rise-of-telem...,88.0,24.0,0.571429,0.110563,12.662500,37.018756,19.872502,12.662500,375.0,1013.0,2.880553,2.0,7.360316,123.0.txt
321.0.txt,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,41.0,13.0,0.518519,0.156977,13.760000,39.534884,21.317953,13.760000,136.0,344.0,2.915698,3.0,7.595930,321.0.txt
2345.0.txt,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,26.0,27.0,-0.018868,0.076479,10.043478,32.756133,17.119844,10.043478,227.0,693.0,2.548341,4.0,6.992785,2345.0.txt
4321.0.txt,4321.0,https://insights.blackcoffer.com/rise-of-telem...,44.0,27.0,0.239437,0.091377,12.950000,34.877735,19.131094,12.950000,271.0,777.0,2.584299,7.0,7.038610,4321.0.txt
432.0.txt,432.0,https://insights.blackcoffer.com/rise-of-telem...,44.0,27.0,0.239437,0.091377,12.950000,34.877735,19.131094,12.950000,271.0,777.0,2.584299,7.0,7.038610,432.0.txt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50921.0.txt,50921.0,https://insights.blackcoffer.com/coronavirus-i...,8.0,29.0,-0.567568,0.082774,14.900000,28.859060,17.503624,14.900000,129.0,447.0,2.387025,1.0,6.590604,50921.0.txt
51382.8.txt,51382.8,https://insights.blackcoffer.com/coronavirus-i...,27.0,66.0,-0.419355,0.082667,22.500000,23.466667,18.386667,22.500000,264.0,1125.0,2.378667,3.0,6.352889,51382.8.txt
51844.6.txt,51844.6,https://insights.blackcoffer.com/what-are-the-...,101.0,33.0,0.507463,0.126534,14.915493,30.878187,18.317472,14.915493,327.0,1059.0,2.472144,0.0,6.813975,51844.6.txt
52306.4.txt,52306.4,https://insights.blackcoffer.com/marketing-dri...,35.0,22.0,0.228070,0.071161,13.576271,29.463171,17.215777,13.576271,236.0,801.0,2.483146,8.0,6.756554,52306.4.txt


In [321]:
output_df.to_excel("output.xlsx", index=False)