In [1]:
import os
import random
import re
import warnings

import pandas as pd
import requests
import syllables
from bs4 import BeautifulSoup
from nltk.corpus import stopwords as nltk_sw

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_excel("Input.xlsx")
urls = list(df["URL"])
urls[:10], len(urls)

(['https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-3-2/',
  'https://insights.blackcoffer.com/rise-of-e-health-and-its-impact-on-humans-by-the-year-2030/',
  'https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030-2/',
  'https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-2/',
  'https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-2-2/',
  'https://insights.blackcoffer.com/rise-of-chatbots-and-its-impact-on-customer-support-by-the-year-2040/',
  'https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030/',
  'https://insights.blackcoffer.com/how-does-marketing-influence-businesses-and-consumers/',
  'https://insights.blackcoffer.com/how-advertisement-increase-your-market-value/',
  'https://insights.blackcoffer.com/negative-effects-of-marketing-on-society/'],
 114)

# fetch_web_data

In [3]:
def fetch_web_data(url):
    class_ = ["td-post-content tagdiv-type", "tdb-block-inner td-fix-index"]
    doc = requests.get(url)
    soup = BeautifulSoup(doc.content, "html.parser")
    title = soup.find("h1")
    article = soup.find_all("div", {"class": class_[0]})
    if article:
        res = " "
        for tag in article:
            res += tag.text.strip()
    else:
        article = soup.find_all("div", {"class": class_[1]})
        res = " "
        for tag in article:
            res += tag.text.strip()
    try:
        start = res.index("Introduction")
        stop = res.index("Blackcoffer Insights")
    except:
        start = 0
        stop = -1
    return title.text + "\n" + res[start:stop]


fetch_web_data(random.choice(urls))[:500]

'Oil prices by the year 2040, and how it will impact the world economy.\n We are in an interconnected world. Any change in one part of the world will always lead to some #changes in other parts of the world as well, maybe a bit later but surely there will be some change and that is what we are seeing in today’s world. Electric vehicles are the change that we are seeing in today’s world. With so many advancements in technology, economies are getting bigger. China might surpass the US and become a #'

# Stop Words

In [13]:
def get_stop_words():
    StopWords_notNames = []
    for file in os.listdir("StopWords/"):
        if file != "StopWords_Names.txt":
            corpus = open(f"StopWords/{file}", "r").read().strip(" ").split("\n")
            res = []
            for txt in corpus:
                if "|" in txt:
                    res.extend(txt.replace(" | ", ",").replace(" ", "").split(","))
            if res != []:
                StopWords_notNames.extend(res)

    StopWords_Names = []
    for file in os.listdir("StopWords/"):
        if file == "StopWords_Names.txt":
            corpus = open(f"StopWords/{file}", "r").read().strip(" ").split("\n")
            for txt in corpus:
                if "|" in txt:
                    res = txt.replace(" | ", ",").replace(" ", "").split(",")
                    if res != None:
                        StopWords_Names.append(res[0])

    stop_words = []
    for file in os.listdir("StopWords/"):
        corpus = open(f"StopWords/{file}", "r").read().strip().split("\n")
        res = []
        for txt in corpus:
            if "|" in txt:
                txt = txt.replace(txt, txt.split("|")[0])
                res.append(txt.strip())
        if res != []:
            stop_words.extend(res)
        stop_words.extend([txt for txt in corpus if "|" not in txt])

    stop_words.extend(StopWords_notNames)
    stop_words.extend(StopWords_Names)
    return stop_words


get_stop_words()[:10]

['ERNST',
 'YOUNG',
 'DELOITTE',
 'TOUCHE',
 'KPMG',
 'PRICEWATERHOUSECOOPERS',
 'PRICEWATERHOUSE',
 'COOPERS',
 'AFGHANI',
 'ARIARY']

In [15]:
def clean_stop_words(text, personalwords=True):
    stop_words=get_stop_words()
    if personalwords == True:
        stop_words.extend(nltk_sw.words("english"))
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    cleaned_text = " ".join(cleaned_words)
    cleaned_text = " ".join(re.findall("[a-zA-Z.]+", cleaned_text))
    return cleaned_text


clean_stop_words(fetch_web_data(random.choice(urls)))[:500]

'machine replace human future work Introduction disruptive technology taking us leave it disruptive technology creates jobs depleted jobs. notice jobs disappearing jobs jobs transform humans robots machines technology creating machines replace them. Technology creates data analysis tools manipulate create custom scenarios artificial intelligence AI Big Data Machine Learning ML algorithms predict drive consumer behavior. Data Analytics tools Google Analytics today free and correctly organizations '

# Scores

In [16]:
def get_scores(text):
    def get_subjectivity_score(text):
        num_words = len(text.split())
        unique_words = len(set(text.split()))
        subjectivity_score = unique_words / num_words
        return subjectivity_score

    def get_polarity_score(text):
        positive_words = (
            open("MasterDictionary/positive-words.txt", "r").read().split("\n")
        )
        negative_words = (
            open("MasterDictionary/negative-words.txt", "r").read().split("\n")
        )

        positive_count = 0
        negative_count = 0

        for word in text.split():
            if word.lower() in positive_words:
                positive_count += 1
            elif word.lower() in negative_words:
                negative_count += 1

        polarity_score = (positive_count - negative_count) / (
            positive_count + negative_count + 1
        )
        return polarity_score, positive_count, negative_count

    subjectivity_score = get_subjectivity_score(text)
    polarity_score, positive_count, negative_count = get_polarity_score(text)

    return positive_count, negative_count, polarity_score, subjectivity_score


get_scores(clean_stop_words(fetch_web_data(random.choice(urls))))

(25, 0, 0.9615384615384616, 0.6352941176470588)

# Analysis_of_readability

In [17]:
def Analysis_of_readability(fetched_article):
    sentences = fetched_article.replace(" ", "").split(".")
    tokens = fetched_article.split(" ")
    total_num_of_sentences = len(sentences)
    total_num_of_words = len(tokens)

    num_complex_words = 0
    for token in sentences:
        if syllables.estimate(token) > 2:
            num_complex_words += 1

    Average_Sentence_Length = total_num_of_words / total_num_of_sentences
    Percentage_of_Complex_words = num_complex_words / total_num_of_words
    Fog_Index = 0.4 * (Average_Sentence_Length + Percentage_of_Complex_words)

    Average_Number_of_Words_Per_Sentence = total_num_of_words / total_num_of_sentences

    total_syllables = sum(syllables.estimate(word) for word in sentences)
    SYLLABLE_PER_WORD = total_syllables / total_num_of_words
    SYLLABLE_PER_WORD

    return (
        num_complex_words,
        Average_Sentence_Length,
        Percentage_of_Complex_words,
        Fog_Index,
        Average_Number_of_Words_Per_Sentence,
        SYLLABLE_PER_WORD,
    )


Analysis_of_readability(clean_stop_words(fetch_web_data(random.choice(urls))))

(77,
 13.73076923076923,
 0.0718954248366013,
 5.521065862242333,
 13.73076923076923,
 2.4379084967320264)

# get_personal_pronouns

In [19]:
def get_personal_pronouns(tokens):
    personal_pronouns = [
        "I",
        "me",
        "my",
        "mine",
        "you",
        "your",
        "yours",
        "he",
        "him",
        "his",
        "she",
        "her",
        "hers",
        "it",
        "its",
        "we",
        "us",
        "our",
        "ours",
        "they",
        "them",
        "their",
        "theirs",
    ]
    num_personal_pronouns = sum(
        [1 for word in tokens if word.lower() in personal_pronouns]
    )

    total_chars = sum(len(word) for word in tokens)
    avg_word_length = total_chars / len(tokens)

    return num_personal_pronouns, avg_word_length


corpus = clean_stop_words(fetch_web_data(random.choice(urls)),personalwords=False)

res = re.findall("[A-Za-z]+", corpus)
get_personal_pronouns(res)

(1, 6.93010752688172)

<!-- # 1. All input variables in “Input.xlsx” - columns
# 2. POSITIVE SCORE - pos_score
# 3. NEGATIVE SCORE - neg_score
# 4. POLARITY SCORE - Polarity_Score
# 5. SUBJECTIVITY SCORE - Subjectivity_Score
# 6. AVG SENTENCE LENGTH - Average_Sentence_Length
# 7. PERCENTAGE OF COMPLEX WORDS - Percentage_of_Complex_words
# 8. FOG INDEX - Fog_Index
# 9. AVG NUMBER OF WORDS PER SENTENCE - Average_Number_of_Words_Per_Sentence
# 10. COMPLEX WORD COUNT - num_complex_words
# 11. WORD COUNT - total_num_of_words
# 12. SYLLABLE PER WORD - SYLLABLE_PER_WORD
# 13. PERSONAL PRONOUNS - num_personal_pronouns
# 14. AVG WORD LENGTH - avg_word_length -->

# Main

In [20]:
id_r = []
url_r = []
pos_score_r = []
neg_score_r = []
Polarity_Score_r = []
Polarity_Score_r = []
Subjectivity_Score_r = []
Average_Sentence_Length_r = []
Percentage_of_Complex_words_r = []
Fog_Index_r = []
Average_Number_of_Words_Per_Sentence_r = []
num_complex_words_r = []
total_num_of_words_r = []
SYLLABLE_PER_WORD_r = []
num_personal_pronouns_r = []
avg_word_length_r = []


# Iterating through URLS
for n in range(len(urls)):
    try:
        # fetch_web_data
        fetched_article = fetch_web_data(urls[n])
    except:
        print(f"Page {urls[n]} Not Found....!")
        continue
    index = df.iloc[n]
    id_ = index[0]
    url_ = index[1]

    # clean_stop_words
    tokens = clean_stop_words(fetched_article)
    total_num_of_words = len(tokens)

    pos_score, neg_score, Polarity_Score, Subjectivity_Score = get_scores(tokens)

    (
        num_complex_words,
        Average_Sentence_Length,
        Percentage_of_Complex_words,
        Fog_Index,
        Average_Number_of_Words_Per_Sentence,
        SYLLABLE_PER_WORD,
    ) = Analysis_of_readability(fetched_article)

    tmp=clean_stop_words(fetched_article,personalwords=False)
    res = re.findall("[A-Za-z]+", tmp)
    num_personal_pronouns, avg_word_length = get_personal_pronouns(res)

    # Appending obtained variables into respective lists
    id_r.append(id_)
    url_r.append(url_)
    pos_score_r.append(pos_score)
    neg_score_r.append(neg_score)
    Polarity_Score_r.append(Polarity_Score)
    Subjectivity_Score_r.append(Subjectivity_Score)
    Average_Sentence_Length_r.append(Average_Sentence_Length)
    Percentage_of_Complex_words_r.append(Percentage_of_Complex_words)
    Fog_Index_r.append(Fog_Index)
    Average_Number_of_Words_Per_Sentence_r.append(Average_Number_of_Words_Per_Sentence)
    num_complex_words_r.append(num_complex_words)
    total_num_of_words_r.append(total_num_of_words)
    SYLLABLE_PER_WORD_r.append(SYLLABLE_PER_WORD)
    num_personal_pronouns_r.append(num_personal_pronouns)
    avg_word_length_r.append(avg_word_length)


output = {
    "URL_ID": id_r,
    "POSITIVE SCORE": pos_score_r,
    "NEGATIVE SCORE": neg_score_r,
    "POLARITY SCORE": Polarity_Score_r,
    "SUBJECTIVITY SCORE": Subjectivity_Score_r,
    "AVG SENTENCE LENGTH": Average_Sentence_Length_r,
    "PERCENTAGE OF COMPLEX WORDS": Percentage_of_Complex_words_r,
    "FOG INDEX": Fog_Index_r,
    "AVG NUMBER OF WORDS PER SENTENCE": Average_Number_of_Words_Per_Sentence_r,
    " COMPLEX WORD COUNT": num_complex_words_r,
    "WORD COUNT ": total_num_of_words_r,
    "SYLLABLE PER WORD ": SYLLABLE_PER_WORD_r,
    "PERSONAL PRONOUNS": num_personal_pronouns_r,
    "AVG WORD LENGTH": avg_word_length_r,
}

output_df = pd.DataFrame(output).set_index("URL_ID")
output_df

Page https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ Not Found....!
Page https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ Not Found....!


Unnamed: 0_level_0,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
123.0,75,21,0.556701,0.398870,18.896552,0.052311,7.579545,18.896552,86,7746,2.003041,2,7.639233
321.0,38,12,0.509804,0.534426,24.760000,0.040388,9.920155,24.760000,25,2675,1.854604,1,7.695082
2345.0,22,20,0.046512,0.649266,15.554054,0.063423,6.246991,15.554054,73,5022,1.771503,0,7.168595
4321.0,34,26,0.131148,0.670504,20.576271,0.048600,8.249948,20.576271,59,5702,1.894563,1,7.153179
432.0,34,26,0.131148,0.670504,20.576271,0.048600,8.249948,20.576271,59,5702,1.894563,1,7.153179
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50921.0,5,27,-0.666667,0.729592,19.363636,0.050078,7.765486,19.363636,32,3024,1.902973,0,6.738342
51382.8,24,59,-0.416667,0.532274,30.450980,0.032840,12.193528,30.450980,51,7526,1.912428,0,6.444223
51844.6,81,28,0.481818,0.698492,23.972222,0.041136,9.605343,23.972222,71,7762,1.792584,10,6.723896
52306.4,32,22,0.181818,0.620419,19.921875,0.047059,7.987574,19.921875,60,5687,1.879216,7,6.345953


# Exporting the output_df to excel

In [21]:
output_df.to_excel("Output Data.xlsx")