In [30]:
import os
import random
import re
import warnings
import openpyxl

import pandas as pd
import requests
import syllables
from bs4 import BeautifulSoup
from nltk.corpus import stopwords as nltk_sw
import nltk

warnings.filterwarnings('ignore')

In [31]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ppurv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
df = pd.read_excel('Input.xlsx')
urls = list(df['URL'])
urls[:10] , len(urls)

(['https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/',
  'https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/',
  'https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/',
  'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/',
  'https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/',
  'https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/',
  'https://insights.blackcoffer.com/rise-of-cyber-crime-and-its-effects/',
  'https://insights.blackcoffer.com/rise-of-internet-demand-and-its-impact-on-communications-and-alternatives-by-the-year-2035-2/',
  'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-

### Fetching Data

In [33]:
def fetch_web_data(url):
    class_ = ['td-post-content tagdiv-type', 'tdb-block-inner td-fix-index']
    doc = requests.get(url)
    soup = BeautifulSoup(doc.content, "html.parser")
    title = soup.find("h1")
    article = soup.find_all('div',{"class": class_[0]})
    if article :
        res = " "
        for tag in article:
            res+=tag.text.strip()
    else:
        article = soup.find_all("div", {"class": class_[1]})
        res = " "
        for tag in article:
            res+=tag.text.strip()
    try:
        start = res.index("Introduction")
        stop = res.index("Blackcoffer Insights")
    except:
        start = 0
        stop = -1
    return title.text + "\n" +res[start:stop]

fetch_web_data(random.choice(urls))[:500]

'Rise of OTT platform and its impact on entertainment industry by the year 2030\n The advancement in technology and proliferation of the Internet has led to the wide exposure of entertainment platforms to a massive audience including teenagers, adults, and senile. In this research paper, we study the impact of Over-the-Top media on society as a whole especially after the pandemic when there was a sudden shift in remote work culture, educational practices being held online, families spending valuab'

In [6]:
def get_stop_words():
    StopWords_notNames = []
    for file in os.listdir("StopWords/StopWords"):
        if file != "StopWords_Names.txt":
            corpus = open(f"StopWords/StopWords/{file}", "r").read().strip(" ").split("\n")
            res = []
            for txt in corpus:
                if "|" in txt:
                    res.extend(txt.replace(" | ", ",").replace(" ", "").split(","))
            if res != []:
                StopWords_notNames.extend(res)

    StopWords_Names = []
    for file in os.listdir("StopWords/StopWords"):
        if file == "StopWords_Names.txt":
            corpus = open(f"StopWords/StopWords/{file}", "r").read().strip(" ").split("\n")
            for txt in corpus:
                if "|" in txt:
                    res = txt.replace(" | ", ",").replace(" ", "").split(",")
                    if res != None:
                        StopWords_Names.append(res[0])

    stop_words = []
    for file in os.listdir("StopWords/StopWords"):
        corpus = open(f"StopWords/StopWords/{file}", "r").read().strip().split("\n")
        res = []
        for txt in corpus:
            if "|" in txt:
                txt = txt.replace(txt, txt.split("|")[0])
                res.append(txt.strip())
        if res != []:
            stop_words.extend(res)
        stop_words.extend([txt for txt in corpus if "|" not in txt])

    stop_words.extend(StopWords_notNames)
    stop_words.extend(StopWords_Names)
    return stop_words

get_stop_words()[:10]

['ERNST',
 'YOUNG',
 'DELOITTE',
 'TOUCHE',
 'KPMG',
 'PRICEWATERHOUSECOOPERS',
 'PRICEWATERHOUSE',
 'COOPERS',
 'AFGHANI',
 'ARIARY']

In [35]:
def clean_stop_words(text, personalwords = True):
    stop_words = get_stop_words()
    if personalwords == True:
        stop_words.extend(nltk_sw.words('english'))
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    cleaned_text = ' '.join(cleaned_words)
    cleaned_text = ' '.join(re.findall("[a-zA-Z.]+", cleaned_text))
    return cleaned_text

clean_stop_words(fetch_web_data(random.choice(urls)))[:500]

'Rise Cyber Crime Effects Cybercrime discussed problem twenty first century. usage cell phones internet increasing dramatically world generating questions consumers security privacy. this users understand cybercrime security. Cybercrime defined organised criminal conduct carried attackers online. Cybercrime numerous forms fraud computer viruses cyberstalking others. Due these businesses government organisations spending maintaining employing professionals cybercrime. Cyber security keywords Artif'

## Scores

In [36]:
def get_scores(text):
    def get_subjectivity_score(text):
        num_words = len(text.split())
        unique_words = len(set(text.split()))
        subjectivity_score = unique_words / num_words
        return subjectivity_score

    def get_polarity_score(text):
        positive_words = (
            open("MasterDictionary/MasterDictionary/positive-words.txt", "r").read().split("\n")
        )
        negative_words = (
            open("MasterDictionary/MasterDictionary/negative-words.txt", "r").read().split("\n")
        )

        positive_count = 0
        negative_count = 0

        for word in text.split():
            if word.lower() in positive_words:
                positive_count += 1
            elif word.lower() in negative_words:
                negative_count += 1

        polarity_score = (positive_count - negative_count) / (
            positive_count + negative_count + 1
        )
        return polarity_score, positive_count, negative_count

    subjectivity_score = get_subjectivity_score(text)
    polarity_score, positive_count, negative_count = get_polarity_score(text)

    return positive_count, negative_count, polarity_score, subjectivity_score

get_scores(clean_stop_words(fetch_web_data(random.choice(urls))))


(31, 6, 0.6578947368421053, 0.7241379310344828)

## Readability Analysis

In [40]:
def analysis_of_readability(fetched_article):
    sentences = fetched_article.replace(' ','').split(".")
    tokens = fetched_article.split(" ")
    total_num_of_sentences = len(sentences)
    total_num_of_words = len(tokens)

    num_complex_words = 0
    for token in sentences:
        if syllables.estimate(token)>2:
            num_complex_words += 1

    average_sentence_length = total_num_of_words / total_num_of_sentences
    percentage_of_complex_words = num_complex_words / total_num_of_words
    fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)

    average_number_of_words_per_sentence = total_num_of_words / total_num_of_sentences

    total_syllables = sum(syllables.estimate(word) for word in sentences)
    SYLLABLE_PER_WORD = total_syllables / total_num_of_words
    SYLLABLE_PER_WORD

    return(
        num_complex_words,
        average_sentence_length,
        percentage_of_complex_words,
        fog_index,
        average_number_of_words_per_sentence,
        SYLLABLE_PER_WORD
    )

analysis_of_readability(clean_stop_words(fetch_web_data(random.choice(urls))))
            

(56, 11.0, 0.08628659476117104, 4.434514637904469, 11.0, 2.047765793528505)

## Personal Pronouns

In [43]:
def get_personal_pronouns(tokens):
    personal_pronouns = [
        "I",
        "me",
        "my",
        "mine",
        "you",
        "your",
        "yours",
        "he",
        "him",
        "his",
        "she",
        "her",
        "hers",
        "it",
        "its",
        "we",
        "us",
        "our",
        "ours",
        "they",
        "them",
        "their",
        "theirs",
    ]
    num_personal_pronouns = sum(
        [1 for word in tokens if word.lower() in personal_pronouns]
    )

    total_chars = sum(len(word) for word in tokens)
    avg_word_length = total_chars / len(tokens)
    return num_personal_pronouns, avg_word_length

corpus = clean_stop_words(fetch_web_data(random.choice(urls)),personalwords = False)

res = re.findall("[A-Za-z]+", corpus)
get_personal_pronouns(res)
    

(6, 5.982885085574572)

## Main function

In [50]:
id_r = []
url_r = []
pos_score_r = []
neg_score_r = []
polarity_score_r = []
polarity_score_r = []
subjectivity_score_r = []
average_sentence_length_r = []
percentage_of_complex_words_r = []
fog_index_r = []
average_number_of_words_per_sentence_r = []
num_complex_words_r = []
total_num_of_words_r = []
SYLLABLE_PER_WORD_r = []
num_personal_pronouns_r = []
avg_word_length_r = []

for n in range(len(urls)):
    try:
        # fetch_web_data
        fetched_article = fetch_web_data(urls[n])
    except:
        print(f"Page {urls[n]} Not Found....!")
        continue
    index = df.iloc[n]
    id_ = index[0]
    #url_ = index[1]

    # clean_stop_words
    tokens = clean_stop_words(fetched_article)
    total_num_of_words = len(tokens)

    pos_score, neg_score, polarity_score, subjectivity_score = get_scores(tokens)

    (
        num_complex_words,
        average_sentence_length,
        percentage_of_complex_words,
        fog_index,
        average_number_of_words_per_sentence,
        SYLLABLE_PER_WORD,
    ) = analysis_of_readability(fetched_article)

    tmp=clean_stop_words(fetched_article,personalwords=False)
    res = re.findall("[A-Za-z]+", tmp)
    num_personal_pronouns, avg_word_length = get_personal_pronouns(res)

    id_r.append(id_)
    pos_score_r.append(pos_score)
    neg_score_r.append(neg_score)
    polarity_score_r.append(polarity_score)
    subjectivity_score_r.append(subjectivity_score)
    average_sentence_length_r.append(average_sentence_length)
    percentage_of_complex_words_r.append(percentage_of_complex_words)
    fog_index_r.append(fog_index)
    average_number_of_words_per_sentence_r.append(average_number_of_words_per_sentence)
    num_complex_words_r.append(num_complex_words)
    total_num_of_words_r.append(total_num_of_words)
    SYLLABLE_PER_WORD_r.append(SYLLABLE_PER_WORD)
    num_personal_pronouns_r.append(num_personal_pronouns)
    avg_word_length_r.append(avg_word_length)

output = {
    "URL_ID": id_r,
    "POSITIVE SCORE": pos_score_r,
    "NEGATIVE SCORE": neg_score_r,
    "POLARITY SCORE": polarity_score_r,
    "SUBJECTIVITY SCORE": subjectivity_score_r,
    "AVG SENTENCE LENGTH": average_sentence_length_r,
    "PERCENTAGE OF COMPLEX WORDS": percentage_of_complex_words_r,
    "FOG INDEX": fog_index_r,
    "AVG NUMBER OF WORDS PER SENTENCE": average_number_of_words_per_sentence_r,
    "COMPLEX WORD COUNT": num_complex_words_r,
    "WORD COUNT ": total_num_of_words_r,
    "SYLLABLE PER WORD ": SYLLABLE_PER_WORD_r,
    "PERSONAL PRONOUNS": num_personal_pronouns_r,
    "AVG WORD LENGTH": avg_word_length_r,
}

output_df = pd.DataFrame(output).set_index("URL_ID")
output_df
    





Page https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ Not Found....!
Page https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ Not Found....!


Unnamed: 0_level_0,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
blackassign0001,31,6,0.657895,0.510309,15.150000,0.065182,6.086073,15.150000,79,4384,1.599010,0,6.556338
blackassign0002,58,25,0.392857,0.658182,17.792683,0.056203,7.139554,17.792683,82,6873,1.904044,2,7.251519
blackassign0003,38,22,0.262295,0.590417,18.368421,0.053486,7.368763,18.368421,56,5877,2.154728,1,7.998454
blackassign0004,34,70,-0.342857,0.718153,19.961538,0.050096,8.004654,19.961538,52,5620,2.089595,0,7.945338
blackassign0005,23,8,0.468750,0.748092,16.536585,0.060472,6.638823,16.536585,41,3253,1.840708,0,7.159898
...,...,...,...,...,...,...,...,...,...,...,...,...,...
blackassign0096,28,51,-0.287500,0.694352,21.150943,0.045495,8.478575,21.150943,51,4882,1.796610,1,7.021559
blackassign0097,22,33,-0.196429,0.648421,26.875000,0.036279,10.764512,26.875000,39,3577,1.560000,1,6.436975
blackassign0098,5,3,0.222222,0.728395,15.480000,0.056848,6.214739,15.480000,22,1987,1.839793,0,6.884000
blackassign0099,16,2,0.736842,0.703488,18.285714,0.054688,7.336161,18.285714,35,2585,1.685938,2,6.418605


## Exporting the output

In [51]:
output_df.to_excel("Output.xlsx")