In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install textstat

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!







Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')








In [2]:
# Load the workbook whose URL_ID and URL columns drive the scraping loop.
input_workbook = "Input.xlsx"
df = pd.read_excel(input_workbook)
df.head(n=5)

Unnamed: 0,URL_ID,URL
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...
1,38.0,https://insights.blackcoffer.com/what-if-the-c...
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...
3,40.0,https://insights.blackcoffer.com/will-machine-...
4,41.0,https://insights.blackcoffer.com/will-ai-repla...


In [None]:
# Download every page listed in `df` and save the article's <h1> title and
# <p> text to "<URL_ID>.txt" in the working directory. Pages that cannot be
# fetched, or that contain no <article> element, are reported and skipped so
# that one bad URL does not abort the whole run.
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    try:
        # Without a timeout, a single unresponsive server blocks the loop indefinitely.
        response = requests.get(url, timeout=30)
        # Treat 4xx/5xx responses as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch webpage: {url} ({exc})")
        continue
    soup = BeautifulSoup(response.content, 'html.parser')
    article = soup.find('article')
    if article is None:
        print(f"No article found on webpage: {url}")
        continue
    # find() returns None when the article has no <h1>; fall back to an empty
    # title rather than calling .get_text() on None.
    heading = article.find('h1')
    title = heading.get_text().strip() if heading is not None else ''
    text = '\n'.join(p.get_text().strip() for p in article.find_all('p'))
    with open(f"{url_id}.txt", 'w', encoding='utf-8') as file:
        file.write(title + '\n\n' + text)

### Removing Stopwords

In [None]:
stopwords_dir = "stopwords"
# The sentiment, subjectivity and word-statistics cells all read
# "<URL_ID>_cleaned.txt" from this directory, so write the files there directly.
cleaned_output_dir = "Cleaned_Article_Texts"
stopwords_set = set()

# Load stopwords from all files in stopwords_dir
for filename in os.listdir(stopwords_dir):
    with open(os.path.join(stopwords_dir, filename), 'r', encoding='latin1') as file:
        # Skip blank lines so the empty string never ends up in the stopword set.
        stopwords_set.update(line.strip().lower() for line in file if line.strip())

os.makedirs(cleaned_output_dir, exist_ok=True)

# Loop over each article file and remove stopwords
for index, row in df.iterrows():
    url_id = row['URL_ID']
    try:
        with open(f"{url_id}.txt", 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        # The scraping loop skips pages it cannot use, so a missing file is expected.
        print(f"WARNING: File {url_id}.txt not found, skipping.")
        continue
    # Lower-case before tokenizing because the stopword set is lower-cased too.
    tokens = word_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stopwords_set]
    filtered_text = ' '.join(filtered_tokens)
    with open(os.path.join(cleaned_output_dir, f"{url_id}_cleaned.txt"), 'w', encoding='utf-8') as file:
        file.write(filtered_text)

### Sentiment Analysis

In [3]:
# Path to the directory containing the cleaned article text files
cleaned_dir = './Cleaned_Article_Texts/'

# Path to the directory containing the master dictionary text files
dictionary_dir = './MasterDictionary/'


def _load_word_list(path):
    """Return the non-blank lines of the word list at `path` as a set of lower-cased words."""
    with open(path, 'r', encoding='latin1') as file:
        return {line.strip().lower() for line in file if line.strip()}


# Load positive and negative words from master dictionary
positive_words = _load_word_list(os.path.join(dictionary_dir, 'positive-words.txt'))
negative_words = _load_word_list(os.path.join(dictionary_dir, 'negative-words.txt'))

# Create a list of dictionaries to store the positive and negative word counts for each URL id
word_counts = []
# os.listdir() returns names in arbitrary order; sort so the row order is reproducible.
for filename in sorted(os.listdir(cleaned_dir)):
    if not filename.endswith('_cleaned.txt'):
        continue
    url_id = filename.split('_')[0]
    # The cleaned files are written as UTF-8 by the stopword-removal cell and re-read
    # as UTF-8 by the subjectivity cell; decode them the same way here so
    # non-ASCII characters are not split into spurious tokens.
    with open(os.path.join(cleaned_dir, filename), 'r', encoding='utf-8') as file:
        words = file.read().split()
    # Count the number of positive and negative words
    positive_count = sum(1 for word in words if word.lower() in positive_words)
    negative_count = sum(1 for word in words if word.lower() in negative_words)
    word_counts.append({'URL_ID': url_id, 'Positive_Words': positive_count, 'Negative_Words': negative_count})

# Passing the column names keeps the frame well-formed even when no cleaned file was found.
df_word_counts = pd.DataFrame(word_counts, columns=['URL_ID', 'Positive_Words', 'Negative_Words'])

# Display the dataframe
df_word_counts

Unnamed: 0,URL_ID,Positive_Words,Negative_Words
0,100.0,22,41
1,101.0,3,3
2,102.0,24,43
3,103.0,6,24
4,104.0,26,54
...,...,...,...
106,95.0,29,33
107,96.0,16,41
108,97.0,13,41
109,98.0,33,48


In [4]:
# Give the count columns their report names, then derive the polarity from them.
report_names = {'Positive_Words': 'Positive Score', 'Negative_Words': 'Negative Score'}
df_word_counts = df_word_counts.rename(columns=report_names)
positive = df_word_counts['Positive Score']
negative = df_word_counts['Negative Score']
# Polarity = (P - N) / (P + N); the small constant keeps the denominator non-zero
# when an article contains neither positive nor negative words.
score_gap = positive - negative
score_total = (positive + negative) + 0.000001
df_word_counts['Polarity Score'] = score_gap / score_total
df_word_counts

Unnamed: 0,URL_ID,Positive Score,Negative Score,Polarity Score
0,100.0,22,41,-0.301587
1,101.0,3,3,0.000000
2,102.0,24,43,-0.283582
3,103.0,6,24,-0.600000
4,104.0,26,54,-0.350000
...,...,...,...,...
106,95.0,29,33,-0.064516
107,96.0,16,41,-0.438596
108,97.0,13,41,-0.518519
109,98.0,33,48,-0.185185


In [5]:
def _count_words(path):
    """Return the number of whitespace-separated words in the UTF-8 text file at `path`."""
    # The context manager closes each file as soon as it has been read.
    with open(path, 'r', encoding='utf-8') as file:
        return len(file.read().split())


# Total words after cleaning, one value per row of df_word_counts. Only the count
# is kept; the article text itself is not stored in the frame.
total_words = df_word_counts['URL_ID'].map(
    lambda url_id: _count_words(f'Cleaned_Article_Texts/{url_id}_cleaned.txt')
)

# Calculate subjectivity score: sentiment-bearing words as a share of all cleaned words.
# The small constant keeps the denominator non-zero for an empty article.
df_word_counts['Subjectivity Score'] = (df_word_counts['Positive Score'] + df_word_counts['Negative Score']) / (total_words + 0.000001)
df1 = df_word_counts.copy()
df1

Unnamed: 0,URL_ID,Positive Score,Negative Score,Polarity Score,Subjectivity Score
0,100.0,22,41,-0.301587,0.081606
1,101.0,3,3,0.000000,0.069767
2,102.0,24,43,-0.283582,0.112228
3,103.0,6,24,-0.600000,0.073350
4,104.0,26,54,-0.350000,0.127389
...,...,...,...,...,...
106,95.0,29,33,-0.064516,0.105983
107,96.0,16,41,-0.438596,0.056604
108,97.0,13,41,-0.518519,0.099083
109,98.0,33,48,-0.185185,0.130225


### Readability Analysis

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy

nlp = spacy.load('en_core_web_sm')

articles_dir = "Extracted article texts"
readability_columns = ["URL_ID", "Avg_Sentence_Length", "Percent_Complex_Words", "Fog_Index"]
# Collect one record per article and build the frame once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
readability_rows = []

# os.listdir() returns names in arbitrary order; sort so the row order is reproducible.
for filename in sorted(os.listdir(articles_dir)):
    url_id = os.path.splitext(filename)[0]
    # The scraping cell writes article files as UTF-8. errors='replace' keeps the
    # read from failing on a stray invalid byte, as the previous latin1 read never did.
    with open(os.path.join(articles_dir, filename), 'r', encoding='utf-8', errors='replace') as file:
        text = file.read()
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    # Average Sentence Length (NaN for an empty file instead of a ZeroDivisionError)
    avg_sentence_length = len(words) / len(sentences) if sentences else float('nan')

    # Percentage of Complex Words: here a "complex" word is an alphabetic,
    # non-stop-word adjective or adverb longer than two characters.
    complex_words = set()
    for token in nlp(text):
        if token.is_alpha and not token.is_stop and len(token.text) > 2 and token.pos_ in {'ADJ', 'ADV'}:
            complex_words.add(token.text.lower())
    num_complex_words = sum(1 for word in words if word.lower() in complex_words)
    percent_complex_words = (num_complex_words / len(words)) * 100 if words else float('nan')

    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percent_complex_words)

    readability_rows.append({"URL_ID": url_id, "Avg_Sentence_Length": avg_sentence_length,
                             "Percent_Complex_Words": percent_complex_words, "Fog_Index": fog_index})

df2 = pd.DataFrame(readability_rows, columns=readability_columns)

df2.head()

Unnamed: 0,URL_ID,Avg_Sentence_Length,Percent_Complex_Words,Fog_Index
0,100.0,23.571429,9.469697,13.21645
1,101.0,20.6,5.825243,10.570097
2,102.0,20.066667,8.554817,11.448594
3,103.0,20.514286,6.267409,10.712678
4,104.0,24.612245,10.53068,14.05717


In [20]:
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy
import syllables

nlp = spacy.load('en_core_web_sm')

articles_dir = "Extracted article texts"

data = {'URL': [], 'Avg Sentence Length': [], 'Percentage of Complex Words': [], 'Fog Index': [], 'Average Number of Words per Sentence': [], 'Complex Word Count': []}

# Read every article once. os.listdir() returns names in arbitrary order, so sort
# for a reproducible row order. The scraping cell writes these files as UTF-8;
# errors='replace' keeps the read from failing on a stray invalid byte.
article_texts = []
for filename in sorted(os.listdir(articles_dir)):
    with open(os.path.join(articles_dir, filename), 'r', encoding='utf-8', errors='replace') as file:
        article_texts.append((filename, file.read()))

# Vocabulary of adjectives/adverbs collected over the WHOLE corpus. It is the same
# for every article, so it is built once here instead of re-running spaCy over
# all files inside the per-article loop.
# NOTE(review): a per-article vocabulary may have been intended; the corpus-wide
# behaviour is kept so the reported numbers do not change - confirm.
complex_words = set()
for _, article_text in article_texts:
    for token in nlp(article_text):
        if token.is_alpha and not token.is_stop and len(token.text) > 2 and token.pos_ in {'ADJ', 'ADV'}:
            complex_words.add(token.text.lower())

for filename, text in article_texts:
    url_id = filename[:-4]  # strip the 4-character extension (".txt")
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    # Estimate syllables once per token; both counts below use the same "> 2" test.
    has_many_syllables = [syllables.estimate(word) > 2 for word in words]

    # Average Sentence Length (NaN for an empty file instead of a ZeroDivisionError)
    avg_sentence_length = len(words) / len(sentences) if sentences else float('nan')

    # Percentage of Complex Words: tokens with more than two syllables that are
    # also in the adjective/adverb vocabulary.
    num_complex_words = sum(
        1 for word, polysyllabic in zip(words, has_many_syllables)
        if polysyllabic and word.lower() in complex_words
    )
    percent_complex_words = (num_complex_words / len(words)) * 100 if words else float('nan')

    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percent_complex_words)

    # Average Number of Words per Sentence (same quantity as the average sentence length)
    avg_words_per_sentence = avg_sentence_length

    # Complex Word Count: every token with more than two syllables
    complex_word_count = sum(has_many_syllables)

    data['URL'].append(url_id)
    data['Avg Sentence Length'].append(avg_sentence_length)
    data['Percentage of Complex Words'].append(percent_complex_words)
    data['Fog Index'].append(fog_index)
    data['Average Number of Words per Sentence'].append(avg_words_per_sentence)
    data['Complex Word Count'].append(complex_word_count)

df4 = pd.DataFrame(data)

df4.head(20)

Unnamed: 0,URL,Avg Sentence Length,Percentage of Complex Words,Fog Index,Average Number of Words per Sentence,Complex Word Count
0,100.0,23.571429,6.287879,11.943723,23.571429,275
1,101.0,20.6,3.398058,9.599223,20.6,27
2,102.0,20.066667,5.066445,10.053245,20.066667,218
3,103.0,20.514286,3.899721,9.765603,20.514286,141
4,104.0,24.612245,7.296849,12.763638,24.612245,284
5,105.0,32.0,3.125,14.05,32.0,184
6,106.0,34.8,3.448276,15.29931,34.8,31
7,107.0,21.5,5.481728,10.792691,21.5,95
8,108.0,33.866667,6.397638,16.105722,33.866667,211
9,109.0,26.166667,1.528662,11.078132,26.166667,79


In [9]:
import re

def count_syllables(word):
    """Estimate the number of syllables in `word`.

    The estimate is the number of maximal runs of consecutive vowels
    (a, e, i, o, u, case-insensitive), with a minimum of 1 so that words
    without any of these vowels still count as one syllable. No adjustment
    is made for silent letters or for 'y'.

    Args:
        word: The word to analyse.

    Returns:
        The estimated syllable count, always at least 1.
    """
    vowels = "aeiou"
    count = 0
    prev_char_was_vowel = False
    for char in word:
        is_vowel = char.lower() in vowels
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not prev_char_was_vowel:
            count += 1
        prev_char_was_vowel = is_vowel
    return max(count, 1)

def count_personal_pronouns(text):
    """Count whole-word occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive, except that the all-capitals form "US" is
    not counted: in running text it is almost always the country abbreviation
    rather than the pronoun.

    Args:
        text: The text to search.

    Returns:
        The total number of pronoun occurrences.
    """
    personal_pronouns = ['i', 'we', 'my', 'ours', 'us']
    # One alternation scans the text once; \b on both sides means only whole words match.
    pattern = r"\b(?:" + "|".join(re.escape(pronoun) for pronoun in personal_pronouns) + r")\b"
    return sum(
        1
        for match in re.finditer(pattern, text, flags=re.IGNORECASE)
        if match.group() != 'US'
    )

cleaned_articles_dir = "Cleaned_Article_Texts"
stats_columns = ['url_id', 'word_count', 'syllables_per_word', 'personal_pronouns', 'avg_word_length']
# Collect one record per article and build the frame once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
stats_rows = []

# os.listdir() returns names in arbitrary order; sort so the row order is reproducible.
for filename in sorted(os.listdir(cleaned_articles_dir)):
    # Same filter as the sentiment cell, so both frames cover the same files.
    if not filename.endswith('_cleaned.txt'):
        continue
    url_id = filename.split("_")[0]
    # The cleaned files are written as UTF-8; decoding them as latin1 would turn
    # each non-ASCII character into several bogus ones.
    with open(os.path.join(cleaned_articles_dir, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    words = re.findall(r'\w+', text)
    # Keep purely alphabetic tokens only (drops numbers and mixed tokens).
    cleaned_words = [word for word in words if word.isalpha()]
    word_count = len(cleaned_words)

    # Syllable Count Per Word (one entry per word, not an average)
    syllables_per_word = [count_syllables(word) for word in cleaned_words]

    # Personal Pronouns
    # NOTE(review): the cleaned text has been lower-cased and stopword-filtered, so
    # pronouns present in the stopword lists are already gone here - confirm whether
    # this should be counted on the raw article text instead.
    personal_pronouns = count_personal_pronouns(text)

    # Average Word Length (0.0 for an article with no alphabetic words instead of a ZeroDivisionError)
    total_word_length = sum(len(word) for word in cleaned_words)
    avg_word_length = total_word_length / word_count if word_count else 0.0

    stats_rows.append({'url_id': url_id,
                       'word_count': word_count,
                       'syllables_per_word': syllables_per_word,
                       'personal_pronouns': personal_pronouns,
                       'avg_word_length': avg_word_length})

df3 = pd.DataFrame(stats_rows, columns=stats_columns)

In [21]:
# Preview the first rows of the per-article word statistics.
df3.head(n=5)

Unnamed: 0,url_id,word_count,syllables_per_word,personal_pronouns,avg_word_length
0,100.0,606,"[4, 2, 2, 1, 1, 1, 5, 5, 5, 4, 3, 1, 1, 1, 2, ...",1,6.859736
1,101.0,66,"[2, 2, 2, 2, 3, 5, 2, 3, 1, 1, 1, 3, 3, 1, 2, ...",0,6.30303
2,102.0,482,"[2, 3, 2, 2, 1, 2, 3, 2, 1, 1, 2, 1, 5, 3, 1, ...",0,6.761411
3,103.0,293,"[4, 4, 1, 2, 3, 3, 2, 3, 5, 4, 2, 1, 3, 1, 3, ...",0,6.771331
4,104.0,550,"[1, 2, 4, 4, 4, 3, 2, 3, 1, 2, 3, 3, 3, 2, 2, ...",2,7.18


In [24]:
def _values_by_url_id(frame, key_column, value_column):
    """Return `frame[value_column]` re-ordered to match df1's rows by URL id.

    df1, df3 and df4 are built from separate directory listings, so their row
    positions are not guaranteed to refer to the same article. Looking values up
    by id keeps every metric on the right row; ids missing from `frame` yield NaN.
    """
    lookup = frame.set_index(frame[key_column].astype(str))[value_column]
    # Series.map needs a unique index; keep the first entry for any repeated id.
    lookup = lookup[~lookup.index.duplicated()]
    return df1['URL_ID'].astype(str).map(lookup)


# Readability metrics (df4 is keyed by 'URL')
df1['Avg Sentence Length'] = _values_by_url_id(df4, 'URL', 'Avg Sentence Length')
df1['Percentage of Complex Words'] = _values_by_url_id(df4, 'URL', 'Percentage of Complex Words')
df1['Fog Index'] = _values_by_url_id(df4, 'URL', 'Fog Index')
df1['Average Number of Words Per Sentence'] = _values_by_url_id(df4, 'URL', 'Average Number of Words per Sentence')
df1['Complex Word Count'] = _values_by_url_id(df4, 'URL', 'Complex Word Count')
# Word statistics (df3 is keyed by 'url_id')
df1['Word Count'] = _values_by_url_id(df3, 'url_id', 'word_count')
df1['Syllables Per Word'] = _values_by_url_id(df3, 'url_id', 'syllables_per_word')
df1['Personal Pronouns'] = _values_by_url_id(df3, 'url_id', 'personal_pronouns')
df1['Average Word Length'] = _values_by_url_id(df3, 'url_id', 'avg_word_length')

df1.head()

Unnamed: 0,URL_ID,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Avg Sentence Length,Percentage of Complex Words,Fog Index,Average Number of Words Per Sentence,Complex Word Count,Word Count,Syllables Per Word,Personal Pronouns,Average Word Length
0,100.0,22,41,-0.301587,0.081606,23.571429,6.287879,11.943723,23.571429,275,606,"[4, 2, 2, 1, 1, 1, 5, 5, 5, 4, 3, 1, 1, 1, 2, ...",1,6.859736
1,101.0,3,3,0.0,0.069767,20.6,3.398058,9.599223,20.6,27,66,"[2, 2, 2, 2, 3, 5, 2, 3, 1, 1, 1, 3, 3, 1, 2, ...",0,6.30303
2,102.0,24,43,-0.283582,0.112228,20.066667,5.066445,10.053245,20.066667,218,482,"[2, 3, 2, 2, 1, 2, 3, 2, 1, 1, 2, 1, 5, 3, 1, ...",0,6.761411
3,103.0,6,24,-0.6,0.07335,20.514286,3.899721,9.765603,20.514286,141,293,"[4, 4, 1, 2, 3, 3, 2, 3, 5, 4, 2, 1, 3, 1, 3, ...",0,6.771331
4,104.0,26,54,-0.35,0.127389,24.612245,7.296849,12.763638,24.612245,284,550,"[1, 2, 4, 4, 4, 3, 2, 3, 1, 2, 3, 3, 3, 2, 2, ...",2,7.18


In [25]:
# Write the combined metrics table to the output workbook.
output_workbook = 'Final.xlsx'
df1.to_excel(output_workbook)