In [1]:

# The code is importing various libraries and modules that will be used in the code.
import pandas as pd
import numpy as np
import nltk
import string
import warnings
import re
warnings.filterwarnings('ignore')

In [2]:

# The code defines a function called `generate_stopwords()` that reads multiple text files and extracts the stopwords from them.
def generate_stopwords():
    """Collect every stop word from the bundled StopWords_*.txt files.

    Each file is read line by line and split on whitespace; every resulting
    token is kept except the bare '|' separator. Tokens that follow a '|' on
    the same line are therefore kept as well.
    NOTE(review): if the text after '|' is only an annotation, it ends up in
    the stop list too -- confirm against the actual files.

    Returns:
        list[str]: all tokens, in file order, duplicates included.
    """
    prefix = "Data/internship_files/StopWords/StopWords_"
    categories = ['Auditor', 'Currencies', 'DatesandNumbers','Generic', 'GenericLong', 'Geographic', 'Names']
    collected = []
    for category in categories:
        # errors="ignore" silently drops bytes that are not valid UTF-8.
        with open(f"{prefix}{category}.txt", 'r', encoding = "utf-8", errors = "ignore") as handle:
            for line in handle:
                for token in line.split():
                    if token in ('\n', '|'):
                        continue
                    collected.append(token)
    return collected

In [3]:
# The code is generating a master dictionary by reading text files containing positive and negative words.
def generate_MasterDict(stopwords):
    """Load the positive and negative word lists, minus any stop words.

    Args:
        stopwords: iterable of strings to exclude. Matching is exact and
            case-sensitive.

    Returns:
        list[list[str]]: a two-element list. Index 0 holds the tokens read
        from positive-words.txt and index 1 those from negative-words.txt,
        each in file order.
    """
    parent_path = "Data/internship_files/MasterDictionary/"
    child_paths = ['positive-words', 'negative-words']
    # Build the exclusion set once. Concatenating lists inside the
    # comprehension would rebuild and linearly scan it for every single word.
    excluded = set(stopwords) | {'\n', '|'}
    words = []
    for child in child_paths:
        # errors="ignore" silently drops bytes that are not valid UTF-8.
        with open(f"{parent_path}{child}.txt", 'r', encoding="utf-8", errors="ignore") as f:
            words.append([word for line in f for word in line.split() if word not in excluded])
    return words

In [4]:
def generate_content(url_id):
    """Read one scraped article and return its sentences and filtered words.

    Relies on the module-level `stopwords` collection, which must be defined
    before this function is called. Requires the NLTK tokenizer data to be
    installed.

    Args:
        url_id: identifier used to build the file name
            "Data/internship_files/Scrapped_data/url_id-<url_id>.txt".

    Returns:
        tuple: (sent, words) where `sent` is the list of sentences tokenized
        from the raw text and `words` is the list of word tokens taken from
        the punctuation-stripped text with stop words removed.

    Raises:
        FileNotFoundError: if no scraped file exists for `url_id`.
    """
    with open(f"Data/internship_files/Scrapped_data/url_id-{url_id}.txt", 'r', encoding="utf-8", errors="ignore") as f:
        text = f.read()
    # Sentence-split first: the tokenizer needs the punctuation that is
    # stripped on the next line.
    sent = nltk.sent_tokenize(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # A set makes each membership test O(1) instead of scanning the whole list.
    stop_set = set(stopwords)
    words = [w for w in nltk.word_tokenize(text) if w not in stop_set]
    return sent, words

In [5]:
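# `AvgSyllable` takes a list of words and returns four values: the average number of syllables per word (approximated by counting vowels), the number of complex words (more than two vowels), the fraction of complex words, and a fog index computed from the first and third values.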

def AvgSyllable(words):
    """Compute syllable and complexity statistics for a list of words.

    Syllables are approximated by counting vowel characters (a, e, i, o, u in
    either case). Words ending in lowercase 'es' or 'ed' are skipped entirely:
    they add no syllables and are never counted as complex, but they still
    count toward the word total used as the denominator.

    NOTE(review): the fog index here is 0.4 * (syllables per word + fraction
    of complex words). The classic Gunning fog formula uses the average
    sentence length instead -- confirm this is intended.

    Args:
        words: list of word strings.

    Returns:
        tuple: (syllable_per_word, complex_words, complex_percent, fog_index).
        For an empty list this is (0.0, 0, 0.0, 0.0) rather than a
        ZeroDivisionError.
    """
    vowels = "aeiouAEIOU"
    word_count = len(words)
    if word_count == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0, 0, 0.0, 0.0

    total_syllables = 0
    complex_words = 0
    for w in words:
        if w.endswith(('es', 'ed')):
            continue
        syllables = sum(1 for c in w if c in vowels)
        if syllables > 2:
            complex_words += 1
        total_syllables += syllables

    syllable_per_word = total_syllables / word_count
    complex_percent = complex_words / word_count
    fog_index = 0.4 * (syllable_per_word + complex_percent)
    return syllable_per_word, complex_words, complex_percent, fog_index

In [6]:
# The regular expression below matches the personal pronouns "I", "we", "my", "ours" and "us" as whole words. Matching ignores case, except for "us", which must be lowercase so that "US" is not counted.
# The function returns the total number of pronoun matches across a list of sentences.
def Calculate_pronouns(sent):
    """Count personal-pronoun occurrences across a list of sentences.

    Whole-word matches of I, we, my and ours are counted in any letter case;
    "us" is counted only when written in lowercase. Falsy entries in `sent`
    (such as empty strings) are skipped.

    Args:
        sent: iterable of sentence strings.

    Returns:
        int: total number of matches over all sentences.
    """
    matcher = re.compile(r'\b(I|we|my|ours|(?-i:us))\b', re.I)
    return sum(len(matcher.findall(sentence)) for sentence in sent if sentence)

In [7]:
# Build the lexicons once; the functions and cells below read these names.
stopwords = generate_stopwords()
MasterWords = generate_MasterDict(stopwords)
# generate_MasterDict returns [positive words, negative words], in that order.
Pos_words, Neg_words = MasterWords

# Load the output template and key its rows by URL_ID, so the column moves
# from the data into the index.
df = pd.read_excel("Data/internship_files/Output Data Structure.xlsx").set_index("URL_ID")

In [8]:
# Preview the first rows of the template: only URL is populated, and every
# metric column is still empty because nothing has been computed yet.
df.head()

Unnamed: 0_level_0,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
37,https://insights.blackcoffer.com/ai-in-healthc...,,,,,,,,,,,,,
38,https://insights.blackcoffer.com/what-if-the-c...,,,,,,,,,,,,,
39,https://insights.blackcoffer.com/what-jobs-wil...,,,,,,,,,,,,,
40,https://insights.blackcoffer.com/will-machine-...,,,,,,,,,,,,,
41,https://insights.blackcoffer.com/will-ai-repla...,,,,,,,,,,,,,


In [9]:
# List the column names; apart from URL, these are the metric columns that
# the scoring loop below assigns to.
df.columns

Index(['URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'],
      dtype='object')

In [10]:
# For each URL ID in the DataFrame (except the excluded ones), load the scraped article, compute the text metrics, and store them in the matching row of `df`.
# URL ids excluded from scoring. The reason is not recorded in this notebook
# (presumably their scraped text files are missing or unusable -- TODO confirm).
SKIPPED_URL_IDS = {44, 57, 144}

# Sets give O(1) membership tests; the word lists would be scanned linearly
# for every word of every article.
positive_lookup = set(Pos_words)
negative_lookup = set(Neg_words)

for url_id in df.index:
    if url_id in SKIPPED_URL_IDS:
        continue

    sent, words = generate_content(url_id)
    if not sent or not words:
        # Nothing to score: leave the row's metrics empty instead of
        # dividing by zero below.
        continue

    pos_score = sum(1 for w in words if w in positive_lookup)
    neg_score = sum(1 for w in words if w in negative_lookup)
    Syllable_per_word, complex_count, complex_percentage, fog_index = AvgSyllable(words)
    pronouns_count = Calculate_pronouns(sent)
    words_per_sentence = len(words) / len(sent)

    # The small constant keeps the ratios defined when a denominator is zero.
    metrics = {
        "POSITIVE SCORE": pos_score,
        "NEGATIVE SCORE": neg_score,
        "POLARITY SCORE": (pos_score - neg_score) / (pos_score + neg_score + 0.000001),
        "SUBJECTIVITY SCORE": (pos_score + neg_score) / (len(words) + 0.000001),
        # Both columns below receive the same words-per-sentence value.
        "AVG SENTENCE LENGTH": words_per_sentence,
        "PERCENTAGE OF COMPLEX WORDS": complex_percentage,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": words_per_sentence,
        "COMPLEX WORD COUNT": complex_count,
        "WORD COUNT": len(words),
        "SYLLABLE PER WORD": Syllable_per_word,
        "PERSONAL PRONOUNS": pronouns_count,
        "AVG WORD LENGTH": sum(len(w) for w in words) / len(words),
    }

    # Assign through .loc. Chained indexing (df[col][row] = value) can write
    # to a temporary copy and does nothing under pandas Copy-on-Write.
    for column, value in metrics.items():
        df.loc[url_id, column] = value

In [12]:
# Inspect the first 20 rows after scoring; rows for skipped URL ids stay empty.
df.head(20)

Unnamed: 0_level_0,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
37,https://insights.blackcoffer.com/ai-in-healthc...,122.0,66.0,0.297872,0.091573,13.871622,0.476376,1.153434,13.871622,978.0,2053.0,2.407209,2.0,7.44228
38,https://insights.blackcoffer.com/what-if-the-c...,56.0,37.0,0.204301,0.14374,8.0875,0.353941,0.96136,8.0875,229.0,647.0,2.049459,7.0,6.746522
39,https://insights.blackcoffer.com/what-jobs-wil...,64.0,33.0,0.319588,0.107182,10.647059,0.456354,1.115138,10.647059,413.0,905.0,2.331492,2.0,7.335912
40,https://insights.blackcoffer.com/will-machine-...,55.0,23.0,0.410256,0.11288,7.677778,0.383502,1.049493,7.677778,265.0,691.0,2.240232,16.0,6.674385
41,https://insights.blackcoffer.com/will-ai-repla...,48.0,24.0,0.333333,0.082569,10.380952,0.387615,1.037156,10.380952,338.0,872.0,2.205275,14.0,6.876147
42,https://insights.blackcoffer.com/man-and-machi...,43.0,24.0,0.283582,0.110197,10.482759,0.384868,1.040132,10.482759,234.0,608.0,2.215461,18.0,6.886513
43,https://insights.blackcoffer.com/in-future-or-...,26.0,12.0,0.368421,0.100796,8.195652,0.318302,0.92626,8.195652,120.0,377.0,1.997347,7.0,6.795756
44,https://insights.blackcoffer.com/how-neural-ne...,,,,,,,,,,,,,
45,https://insights.blackcoffer.com/how-machine-l...,36.0,13.0,0.469388,0.135359,9.783784,0.370166,0.960221,9.783784,134.0,362.0,2.030387,0.0,6.392265
46,https://insights.blackcoffer.com/deep-learning...,64.0,40.0,0.230769,0.103175,12.292683,0.414683,1.090873,12.292683,418.0,1008.0,2.3125,9.0,6.895833


In [13]:
# Write the scored DataFrame `df` to "Blackcoffer_OUTPUT.xlsx". The path is
# relative, so the file lands in the current working directory.
df.to_excel("Blackcoffer_OUTPUT.xlsx")