###Importing dependencies

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
from nltk import pos_tag
from nltk.tokenize import SyllableTokenizer


Importing the input file

In [2]:
input_df = pd.read_excel("/content/drive/MyDrive/20211030 Test Assignment/Input.xlsx")
input_df.head()

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


### Defining functions for data extraction
Here I have defined two functions for file extraction

1.   The extract_text function which takes url as inputs and gives the title and text of the particular article
2.   save_to_file function which saves the aforementioned title and text to a new text file with its url_id as the file name



In [3]:
#Define functions for data extraction......
def extract_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extracting article text
    article_text = ""
    for paragraph in soup.find_all('p'):
        article_text += paragraph.get_text() + ' '

    # Extracting article title
    title = soup.title.string

    return title, article_text

def save_to_file(url_id, title, text):
    filename = f'{url_id}.txt'
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(f'Title: {title}\n\n{text}')

Now, we loop over all urls and save them in the respective files

In [4]:
#Extract and Save Article Texts
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    title, article_text = extract_text(url)
    save_to_file(url_id, title, article_text)

### Loading the stopwords , positive-words , negative-words dictionary

Loading the stopwords folder and then converting all files in it to stopwords dictionary.

In [5]:
import os
def load_stopwords_from_folder(folder_path, encoding='latin-1'):
    custom_stopwords = set()

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding=encoding) as file:
                custom_stopwords.update(word.strip() for word in file)

    return custom_stopwords
stopwords = load_stopwords_from_folder("/content/drive/MyDrive/20211030 Test Assignment/StopWords")

Loading the positive-words and negative-words folder and then converting these files to their python dictionary form.

In [6]:
def load_from_file(file_path , encoding = "latin-1"):
    with open(file_path, 'r', encoding=encoding) as file:
        return set(word.strip() for word in file)

negative_words = load_from_file("/content/drive/MyDrive/20211030 Test Assignment/MasterDictionary/negative-words.txt")
positive_words = load_from_file("/content/drive/MyDrive/20211030 Test Assignment/MasterDictionary/positive-words.txt")

### Calculating all the Sentiment analysis parameters and Readability analysis parameters.

In [7]:
# main function for text analysis
def calculate_all_metrics(text, stopwords, positive_words, negative_words):
    # Tokenize sentences and words
    sentences = sent_tokenize(text)
    words = word_tokenize(text.lower())

    # Remove custom stopwords
    filtered_words = [word for word in words if word.isalnum() and word not in stopwords]

    # Calculate metrics
    num_sentences = len(sentences)
    num_words = len(filtered_words)
    avg_words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

    # Calculate complex word count
    tagged_words = pos_tag(filtered_words)
    complex_word_count = sum(1 for word, pos in tagged_words if pos in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "JJ", "JJR", "JJS", "RB", "RBR", "RBS"])

    # Calculate syllable count per word
    syllable_tokenizer = SyllableTokenizer()
    syllable_count = sum(len(syllable_tokenizer.tokenize(word)) for word in filtered_words)

    # Personal pronouns
    personal_pronouns = sum(1 for word, pos in tagged_words if pos in ["PRP", "PRP$", "WP", "WP$"])

    # Average word length
    avg_word_length = sum(len(word) for word in filtered_words) / num_words if num_words > 0 else 0

    # Calculate sentiment scores
    blob = TextBlob(" ".join(filtered_words))
    positive_score = sum(1 for word in filtered_words if word in positive_words)
    negative_score = sum(1 for word in filtered_words if word in negative_words)
    polarity_score = (positive_score - negative_score)/((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score)/ ((len(filtered_words)) + 0.000001)

    # Additional metrics
    avg_sentence_length = sum(len(sent.split()) for sent in sentences) / num_sentences if num_sentences > 0 else 0
    percentage_complex_words = (complex_word_count / num_words) * 100 if num_words > 0 else 0
    fog_index = 0.4 * (avg_words_per_sentence + percentage_complex_words)

    return {
        "Average Words per Sentence": avg_words_per_sentence,
        "Complex Word Count": complex_word_count,
        "Word Count": num_words,
        "Syllable Count per Word": syllable_count / num_words if num_words > 0 else 0,
        "Personal Pronouns": personal_pronouns,
        "Average Word Length": avg_word_length,
        "Positive Score": positive_score,
        "Negative Score": negative_score,
        "Polarity Score": polarity_score,
        "Subjectivity Score": subjectivity_score,
        "Average Sentence Length": avg_sentence_length,
        "Percentage of Complex Words": percentage_complex_words,
        "Fog Index": fog_index
    }




 Now ,Dividing the total parameters in sentiment and text metrics parameters

In [8]:
def calculate_sentiment(text, stopwords, positive_words, negative_words):
  return ( calculate_all_metrics(text, stopwords, positive_words, negative_words)["Positive Score"] ,
           calculate_all_metrics(text, stopwords, positive_words, negative_words)["Negative Score"] ,
           calculate_all_metrics(text, stopwords, positive_words, negative_words)["Polarity Score"] ,
           calculate_all_metrics(text, stopwords, positive_words, negative_words)["Subjectivity Score"]
          )

def calculate_text_metrics(text, stopwords, positive_words, negative_words):
  return (calculate_all_metrics(text, stopwords, positive_words, negative_words)["Average Sentence Length"] ,
          calculate_all_metrics(text, stopwords, positive_words, negative_words)["Percentage of Complex Words"] ,
          calculate_all_metrics(text, stopwords, positive_words, negative_words)["Fog Index"] ,
          calculate_all_metrics(text, stopwords, positive_words, negative_words)["Average Words per Sentence"] ,
          calculate_all_metrics(text, stopwords, positive_words, negative_words)["Complex Word Count"] ,
          calculate_all_metrics(text, stopwords, positive_words, negative_words)["Word Count"] ,
          calculate_all_metrics(text, stopwords, positive_words, negative_words)["Syllable Count per Word"] ,
          calculate_all_metrics(text, stopwords, positive_words, negative_words)["Personal Pronouns"] ,
          calculate_all_metrics(text, stopwords, positive_words, negative_words)["Average Word Length"] ,
          )

### Output
Generating the output file in the format of Output Data Structure and saving it as excel file.

In [11]:
#Generating output file
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

output_df = input_df.copy()
output_df[['POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE']] = input_df['URL'].apply(lambda x : calculate_sentiment(extract_text(x)[1] , stopwords, positive_words, negative_words)).apply(pd.Series)
output_df[['AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS' , 'AVG WORD LENGTH']] = input_df['URL'].apply(lambda x : calculate_text_metrics(extract_text(x)[1] , stopwords, positive_words, negative_words)).apply(pd.Series)

# Save the output to Excel file
output_df.to_excel("Output.xlsx", index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [12]:
output_df = pd.read_excel("/content/Output.xlsx")
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,11,5,0.375000,0.055944,18.344828,34.265734,17.651121,9.862069,98,286,2.377622,1,6.559441
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,60,35,0.263158,0.109070,19.402439,42.250287,21.148895,10.621951,368,871,2.607348,0,7.268657
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,43,28,0.211268,0.097796,20.491803,42.699725,21.840546,11.901639,310,726,2.891185,0,7.949036
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,41,79,-0.316667,0.165975,22.071429,40.387275,21.319196,12.910714,292,723,2.791148,0,7.827109
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,26,12,0.368421,0.079332,20.162791,37.160752,19.320115,11.139535,178,479,2.467641,0,7.175365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,32,61,-0.311828,0.136966,24.166667,39.175258,20.699733,12.574074,266,679,2.502209,0,7.097202
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,26,39,-0.200000,0.120148,30.047619,35.489834,19.348314,12.880952,192,541,2.365989,0,6.643253
97,blackassign0098,https://insights.blackcoffer.com/contribution-...,5,4,0.111111,0.041667,36.888889,33.796296,23.118519,24.000000,73,216,2.351852,0,6.652778
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,18,7,0.440000,0.067204,22.781250,36.021505,19.058602,11.625000,134,372,2.327957,0,6.481183
