In [9]:
import numpy as np
import re
import os
import pandas as pd 
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from urllib.request import urlopen
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests


In [10]:
stop_word_file = '/blackcoffer/StopWords/StopWords_Generic.txt'

In [11]:
positive_words_file = '/blackcoffer/MasterDictionary/positive-words.txt'

In [12]:
negative_word_file = '/blackcoffer/MasterDictionary/negative-words.txt'

In [13]:
pd.set_option('display.max_colwidth', None)

In [14]:
input = pd.read_excel('/blackcoffer/Output Data Structure.xlsx')
input.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/,,,,,,,,,,,,,
1,38,https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/,,,,,,,,,,,,,
2,39,https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/,,,,,,,,,,,,,
3,40,https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/,,,,,,,,,,,,,
4,41,https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/,,,,,,,,,,,,,


In [15]:
urls = input['URL']

In [16]:
def GetArticleNames(urls):
    titles = []
    
    for i in range(len(urls)):
        title = urls[i]
        title_clean = title[title.index('m/' )+ 2 : -1].replace('-', ' ')
        titles.append(title_clean)
    return titles

In [17]:
titles = GetArticleNames(urls)

In [22]:
text = []
for url in urls:
    
    page = requests.get(url, headers = {"User-Agent":"XY"})
    soup = BeautifulSoup(page.text , 'html.parser')

    try:
        s_text = soup . find(attrs = { 'class' : 'td-post-content'}).get_text()
    except Exception:
        s_text = ""
    
    text.append(s_text)

In [23]:
text_transform = []
for i in range(len(text)):
    text_transform.append(text[i].replace('\n', ' '))

In [25]:
words = []
for url in urls:
    page = requests.get(url, headers = {"User-Agent":"XY"})
    soup = BeautifulSoup(page.text , 'html.parser')
    
    # getting Title of the web page
    try:
        title = soup . find('h1', attrs = { 'class' : 'entry-title'}).get_text()
    except Exception:
        title = "" 
    
    # getting article text 
    try:
        text = soup . find(attrs = { 'class' : 'td-post-content'}).get_text()
    except Exception:
        text = ""
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    
    # break multi-headlines into line each
    chunks = (pharse.strip() for line in lines for pharse in line.split(' '))
    
    # drop blank lines
    text = "\n".join(chunk for chunk in chunks if chunk)
    words.append(text)

In [26]:
with open(positive_words_file, "r") as pos_file:
    positive_words = pos_file.read().lower()
positive_word_list = positive_words.split('\n')

positive_word_list[:5]

['a+', 'abound', 'abounds', 'abundance', 'abundant']

In [27]:
with open(negative_word_file, 'r', encoding="ISO-8859-1") as neg_file:
    negative_words= neg_file.read().lower()
negative_word_list = negative_words.split('\n')

negative_word_list[:5]

['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']

In [28]:
with open(stop_word_file,'r') as stop_word_file:
    stop_words = stop_word_file.read().lower()
stop_word_list = stop_words.split('\n')

stop_word_list[:5]

['about', 'above', 'after', 'again', 'all']

In [29]:
def Tokenizer(text):
    text = text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    filtered_words = list(filter(lambda token: token not in stop_word_list, tokens))
    return filtered_words

In [30]:
Tokenizer(text_transform[0])[:5]

['introduction', 'anything', 'kills', '10', 'million']

In [32]:
def FilterWords(text):
    words = []
    
    for i in range(len(text)):
        word = Tokenizer(text[i])
        words.append(word)
    return words

In [33]:
text_list= FilterWords(text_transform)
text_list[0][:5]

['introduction', 'anything', 'kills', '10', 'million']

In [123]:
len(text)

114

In [151]:
words_count = []
for i in range(len(text_list)):
    word_count = len(text_list[i])
    words_count.append(word_count)

In [150]:
words_count[0]

1205

In [36]:
def PositiveScore(text):
    positive_score=[] 
    for i in range(len(text)):
        pos_word = 0
        for word in text[i]:
            if word in positive_word_list:
                pos_word +=1
        positive_score.append(pos_word)
    return positive_score

In [37]:
positive_score = PositiveScore(text_list)
positive_score[0]

82

In [38]:
def NegativeScore(text):
    negative_score=[] 
    for i in range(len(text)):
        neg_word = 0
        for word in text[i]:
            if word in negative_word_list:
                neg_word +=1
        negative_score.append(neg_word)
    return negative_score

In [39]:
negative_score = NegativeScore(text_list)
negative_score[0]

36

In [40]:
def PolarityScore(positive_score, negative_score):
    polarity_score = []
    for i in range(len(positive_score)):
        pol_score = (positive_score[i] - negative_score[i])/((positive_score[i] + negative_score[i])+0.000001)
        polarity_score.append(pol_score)
    return polarity_score

In [41]:
polarity_score = PolarityScore(positive_score, negative_score)

In [42]:
polarity_score[0]

0.38983050517092793

In [43]:
def SubjectivityScore(positive_score, negative_score, total_word_count):
    subjectivity_score = []
    for i in range(len(positive_score)):
        sub_score = ((positive_score[i] + negative_score[i])/ ((total_word_count[i]) + 0.000001))
        subjectivity_score.append(sub_score)
    return subjectivity_score

In [44]:
subjectivity_score = SubjectivityScore(positive_score, negative_score, words_count)

In [45]:
subjectivity_score[0]

0.09792531112205367

In [46]:
def SentenceCount(text):
    sentence_count=[]
    for i in range(len(text)):
        sentences =  len(sent_tokenize(text[i]))
        sentence_count.append(sentences)
    return sentence_count

In [47]:
sentence_count = SentenceCount(text_transform)

In [86]:
len(sentence_count)

114

In [88]:
def AverageSentenceLength(words_count, sentence_count):
    
    Average_Sentence_Lenght=[]
    for i in range(len(words_count)):
        sent_count = sentence_count[i]
        if sent_count > 0 : 
            avg_sent_len = round(words_count[i] / sentence_count[i])
            Average_Sentence_Lenght.append(avg_sent_len)
        else:
            avg_sent_len = 0
            Average_Sentence_Lenght.append(avg_sent_len)
    return Average_Sentence_Lenght

In [89]:
average_sentence_lenght = AverageSentenceLength(words_count, sentence_count)

In [90]:
len(average_sentence_lenght)

114

In [52]:
def ComplexWordCount(text):
    complex_word_count =[]
    for i in range(len(text)):
        complexWord = 0

        for word in text[i]:
            vowels=0
            if word.endswith(('es','ed')):
                pass
            else:
                for w in word:
                    if(w=='a' or w=='e' or w=='i' or w=='o' or w=='u'):
                        vowels += 1
                if(vowels > 2):
                    complexWord += 1
        complex_word_count.append(complexWord)
    return complex_word_count

In [53]:
complex_word_count = ComplexWordCount(text_list)

In [54]:
complex_word_count[2]

457

In [55]:
def PercentageComplexWord(complex_word_count, words_count):
    complex_word_percentage = []
    
    for i in range(len(words_count)):
        if words_count[i] > 0 :
            complex_word_percent = complex_word_count[i]/words_count[i]
        else:
            complex_word_percent = 0
        complex_word_percentage.append(complex_word_percent)
    
    return complex_word_percentage


In [56]:
percentage_complex_word = PercentageComplexWord(complex_word_count, words_count)

In [57]:
percentage_complex_word[0]

0.4348547717842324

In [58]:
def FogIndex(average_sentence_lenght, percentage_complex_word):
    fog_index = []
    for i in range(len(average_sentence_lenght)):
        fogIndex = 0.4 * (average_sentence_lenght[i] + percentage_complex_word[i])
        fog_index.append(fogIndex)
    return fog_index

In [143]:
fog_index = FogIndex(average_sentence_lenght, percentage_complex_word)

In [60]:
def SyllablesCount(text):
    syllable_count = []
    for i in range(len(text)):
        count = 0
        for j in range(len(text[i])):
            
            vowels = 'aeiouy'
            starts = ['ou','ei','ae','ea','eu','oi']
            endings = ['es','ed']
            word = text[i][j].strip(".:;?!")
            if word[0] in vowels:
                count +=1
            for index in range(1,len(word)):
                if word[index] in vowels and word[index-1] not in vowels:
                    count +=1
            if word.endswith('e'):
                count -= 1
            if word.endswith('le'):
                count+=1
            if count == 0:
                count +=1
        syllable_count.append(count)
    return syllable_count

In [154]:
syllable_count = SyllablesCount(text_list)

In [104]:
def PresonalPronoun(words):
    presonal_pronoun = []
    for i in range(len(words)):
        pronounRegex = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)
        pronouns = pronounRegex.findall(words[i])
        presonal_pronoun.append(pronouns)
    return presonal_pronoun

In [105]:
presonal_pronoun = PresonalPronoun(text_transform)

In [119]:
char_count=[]
for i in range(len(text_transform)):
    char = text_transform[i].replace(' ', '')
    char = len(char)
    char_count.append(char)

In [123]:
char_count[7]

0

In [138]:
avg_word_count = []
for i in range(len(char_count)):
    if words_count[i] == 0 | char_count[i]==0 :
        AWC = 0
        avg_word_count.append(AWC)
    else:
        AWC = char_count[i]/words_count[i]
        avg_word_count.append(round(AWC))
    

In [139]:
avg_word_count[0]

9

In [125]:
input['POSITIVE SCORE'] = positive_score

In [78]:
input['NEGATIVE SCORE'] = negative_score

In [80]:
input['POLARITY SCORE'] = polarity_score

In [82]:
input['SUBJECTIVITY SCORE'] = subjectivity_score

In [92]:
input['PERCENTAGE OF COMPLEX WORDS'] = percentage_complex_word

In [93]:
input['AVG SENTENCE LENGTH'] = average_sentence_lenght

In [144]:
input['FOG INDEX'] = fog_index

In [98]:
input['AVG NUMBER OF WORDS PER SENTENCE'] = average_sentence_lenght

In [99]:
input['COMPLEX WORD COUNT'] = complex_word_count

In [152]:
input['WORD COUNT'] = words_count

In [155]:
input['SYLLABLE PER WORD'] = syllable_count

In [106]:
input['PERSONAL PRONOUNS'] = presonal_pronoun

In [140]:
input['AVG WORD LENGTH'] = avg_word_count

In [158]:
input.insert(2, 'TITLE', titles)

In [159]:
input.head()

Unnamed: 0,URL_ID,URL,TITLE,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/,ai in healthcare to improve patient outcomes,82,36,0.389831,0.097925,16,0.434855,6.573942,16,524,1205,2735,[us],9
1,38,https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/,what if the creation is taking over the creator,72,37,0.321101,0.131325,10,0.295181,4.118072,10,245,830,1642,"[we, us, us, i, us, we, we]",9
2,39,https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/,what jobs will robots take from humans in the future,74,36,0.345455,0.104265,12,0.433175,4.97327,12,457,1055,2465,"[We, us, us]",9
3,40,https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/,will machine replace the human in the future of work,86,28,0.508772,0.117163,10,0.345324,4.138129,10,336,973,1952,"[us, we, us, we, we, we, we, we, we, we, we, we, we, we, we, we, we]",8
4,41,https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/,will ai replace us or work with us,71,29,0.42,0.090827,14,0.33515,5.73406,14,369,1101,2209,"[us, We, we, we, we, we, us, us, we, we, We, We, us, We, us, us]",8


In [157]:
input.to_excel("Output Data Structure.xlsx")