### importing the library that we required

In [2]:
import numpy as np
import re
import os
import pandas as pd 
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from urllib.request import urlopen
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests

In [3]:
stop_word_file = './StopWords/StopWords_Generic.txt'

In [4]:
positive_words_file = './MasterDictionary/positive-words.txt'

In [5]:
negative_word_file = './MasterDictionary/negative-words.txt'

In [7]:
pd.set_option('display.max_colwidth', None)
input = pd.read_excel('./Input.xlsx')
input.head()

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/
1,38,https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/
2,39,https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/
3,40,https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/
4,41,https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/


In [8]:
urls = input['URL']

### Extrecting article title and article text from teh URLs of the data using BeautifulSoup

In [50]:
text = []
titles = []
for url in urls:
    
    page = requests.get(url, headers = {"User-Agent":"XY"})
    soup = BeautifulSoup(page.text , 'html.parser')

    
    try:
        s_title = soup . find('h1').get_text()
    except Exception:
        s_title = ""
    
    titles.append(s_title)
    
    try:
        s_text = soup . find(attrs = { 'class' : 'td-post-content'}).get_text()
    except Exception:
        s_text = ""
    
    text.append(s_text)

text[:5]

['\n\nIntroduction\n“If anything kills over 10 million people in the next few decades, it will be a highly infectious virus rather than a war. Not missiles but microbes.” Bill Gates’s remarks at a TED conference in 2014, right after the world had avoided the Ebola outbreak. When the new, unprecedented, invisible virus hit us, it met an overwhelmed and unprepared healthcare system and oblivious population. This public health emergency demonstrated our lack of scientific consideration and underlined the alarming need for robust innovations in our health and medical facilities. For the past few years, artificial intelligence has proven to be of tangible potential in the healthcare sectors, clinical practices, translational medical and biomedical research.\nAfter the first case was detected in China on December 31st 2019, it was an AI program developed by BlueDot that alerted the world about the pandemic. It was quick to realise AI’s ability to analyse large chunks of data could help in de

### Cleaning article texts using Stop Words Lists
The Stop Words Lists (found in the folder StopWords) are used to clean the text so that Sentiment Analysis can be performed by excluding the words found in Stop Words List. 
* removing the stop words (using stopwords class of nltk package).
* removing any punctuations like ? ! , . from the word before counting.

In [51]:
clean_text = []
for i in range(len(text)):
    clean_text.append(text[i].replace('\n', ' '))

clean_text[:5]

['  Introduction “If anything kills over 10 million people in the next few decades, it will be a highly infectious virus rather than a war. Not missiles but microbes.” Bill Gates’s remarks at a TED conference in 2014, right after the world had avoided the Ebola outbreak. When the new, unprecedented, invisible virus hit us, it met an overwhelmed and unprepared healthcare system and oblivious population. This public health emergency demonstrated our lack of scientific consideration and underlined the alarming need for robust innovations in our health and medical facilities. For the past few years, artificial intelligence has proven to be of tangible potential in the healthcare sectors, clinical practices, translational medical and biomedical research. After the first case was detected in China on December 31st 2019, it was an AI program developed by BlueDot that alerted the world about the pandemic. It was quick to realise AI’s ability to analyse large chunks of data could help in detect

### Saveing the extracted article title and article text in a text file with URL_ID as its file name.

In [78]:
for i in range(len(clean_text)):
    with open(r"./text_files/"+str(input.URL_ID[i])+".txt", "w", encoding="utf-8" ) as file:

        file.write(titles[i] + "\n" +clean_text[i])

In [52]:
with open(positive_words_file, "r") as pos_file:
    positive_words = pos_file.read().lower()
positive_word_list = positive_words.split('\n')

positive_word_list[:5]

['a+', 'abound', 'abounds', 'abundance', 'abundant']

In [53]:
with open(negative_word_file, 'r', encoding="ISO-8859-1") as neg_file:
    negative_words= neg_file.read().lower()
negative_word_list = negative_words.split('\n')

negative_word_list[:5]

['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']

In [54]:
with open('./StopWords/StopWords_Generic.txt','r') as stop_word_file:
    stop_words = stop_word_file.read().lower()
stop_word_list = stop_words.split('\n')

stop_word_list[:5]

['about', 'above', 'after', 'again', 'all']

In [55]:
def Tokenizer(text):
    text = text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    filtered_words = list(filter(lambda token: token not in stop_word_list, tokens))
    return filtered_words

#### Data Analysis
For each of the extracted texts from the article, perform textual analysis and compute variables, given in the output structure excel file. 

In [56]:
words_list = []
    
for i in range(len(clean_text)):
    word = Tokenizer(clean_text[i])
    words_list.append(word)

words_list[:1]

[['introduction',
  'anything',
  'kills',
  '10',
  'million',
  'people',
  'next',
  'decades',
  'will',
  'a',
  'highly',
  'infectious',
  'virus',
  'rather',
  'a',
  'war',
  'missiles',
  'microbes',
  'bill',
  'gates',
  's',
  'remarks',
  'a',
  'ted',
  'conference',
  '2014',
  'right',
  'world',
  'avoided',
  'ebola',
  'outbreak',
  'new',
  'unprecedented',
  'invisible',
  'virus',
  'hit',
  'us',
  'met',
  'overwhelmed',
  'unprepared',
  'healthcare',
  'system',
  'oblivious',
  'population',
  'public',
  'health',
  'emergency',
  'demonstrated',
  'lack',
  'scientific',
  'consideration',
  'underlined',
  'alarming',
  'need',
  'robust',
  'innovations',
  'health',
  'medical',
  'facilities',
  'past',
  'years',
  'artificial',
  'intelligence',
  'proven',
  'tangible',
  'potential',
  'healthcare',
  'sectors',
  'clinical',
  'practices',
  'translational',
  'medical',
  'biomedical',
  'research',
  'first',
  'case',
  'detected',
  'china',


#### Word Count
We count the total number of cleaned words present in the text

In [57]:
words_count = []
for i in range(len(words_list)):
    word_count = len(words_list[i])
    words_count.append(word_count)

In [58]:
words_count[:5]

[1205, 830, 1055, 973, 1101]

#### Positive Score: 
This score is calculated by assigning the value of +1 for each word if found in the Positive Dictionary and then adding up all the values.

In [59]:
positive_score=[] 
for i in range(len(words_list)):
    pos_word = 0
    for word in words_list[i]:
        if word in positive_word_list:
            pos_word +=1
    positive_score.append(pos_word)


positive_score[:5]

[82, 72, 74, 86, 71]

#### Negative Score: 
This score is calculated by assigning the value of -1 for each word if found in the Negative Dictionary and then adding up all the values. We multiply the score with -1 so that the score is a positive number.

In [60]:
negative_score=[] 
for i in range(len(words_list)):
    neg_word = 0
    for word in words_list[i]:
        if word in negative_word_list:
            neg_word +=1
    negative_score.append(neg_word)

negative_score[:5]

[36, 37, 36, 28, 29]

### Polarity Score: 
This is the score that determines if a given text is positive or negative in nature. It is calculated by using the formula: 
Polarity Score = (Positive Score – Negative Score)/ ((Positive Score + Negative Score) + 0.000001)

In [61]:
polarity_score = []
for i in range(len(positive_score)):
    pol_score = (positive_score[i] - negative_score[i])/((positive_score[i] + negative_score[i])+0.000001)
    polarity_score.append(pol_score)

polarity_score[:5]

[0.38983050517092793,
 0.3211009144853127,
 0.34545454231404965,
 0.5087719253616498,
 0.4199999958]

## Subjectivity Score:
This is the score that determines if a given text is objective or subjective. It is calculated by using the formula: 
Subjectivity Score = (Positive Score + Negative Score)/ ((Total Words after cleaning) + 0.000001)

In [62]:
subjectivity_score = []
for i in range(len(positive_score)):
    sub_score = ((positive_score[i] + negative_score[i])/ ((words_count[i]) + 0.000001))
    subjectivity_score.append(sub_score)

subjectivity_score[:5]

[0.09792531112205367,
 0.131325301046596,
 0.10426540274477213,
 0.1171634120070263,
 0.09082652126173793]

#### Counting sentences in the Article

In [67]:
sentence_count=[]
for i in range(len(clean_text)):
    sentences =  sent_tokenize(clean_text[i], language="english")
    sentence_count.append(len(sentences))

sentence_count[:5]

[77, 80, 85, 95, 79]

## Analysis of Readability
#### Analysis of Readability is calculated using the Gunning Fox index formula described below.
Average Sentence Length = the number of words / the number of sentences

In [68]:
average_sentence_lenght=[]
for i in range(len(words_count)):
    sent_count = sentence_count[i]
    if sent_count > 0 : 
        avg_sent_len = round(words_count[i] / sentence_count[i])
        average_sentence_lenght.append(avg_sent_len)
    else:
        avg_sent_len = 0
        average_sentence_lenght.append(avg_sent_len)

average_sentence_lenght[:5]

[16, 10, 12, 10, 14]

### Complex Word Count
Complex words are words in the text that contain more than two syllables.

In [69]:
complex_word_count =[]
for i in range(len(words_list)):
    complexWord = 0

    for word in words_list[i]:
        vowels=0
        if word.endswith(('es','ed')):
            pass
        else:
            for w in word:
                if(w=='a' or w=='e' or w=='i' or w=='o' or w=='u'):
                    vowels += 1
            if(vowels > 2):
                complexWord += 1
    complex_word_count.append(complexWord)

complex_word_count[:5]

[524, 245, 457, 336, 369]

### Calculating Percentage of Complex word
Percentage of Complex words = the number of complex words / the number of words 

In [70]:
complex_word_percentage = []

for i in range(len(words_count)):
    if words_count[i] > 0 :
        complex_word_percent = complex_word_count[i]/words_count[i]
        complex_word_percentage.append(complex_word_percent)
    else:
        complex_word_percent = 0
        complex_word_percentage.append(complex_word_percent)

complex_word_percentage[:5]

[0.4348547717842324,
 0.29518072289156627,
 0.43317535545023694,
 0.34532374100719426,
 0.335149863760218]

### Calculating Fog Index
Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

In [71]:
fog_index = []
for i in range(len(average_sentence_lenght)):
    fogIndex = 0.4 * (average_sentence_lenght[i] + complex_word_percentage[i])
    fog_index.append(fogIndex)

fog_index[:5]

[6.573941908713692,
 4.118072289156626,
 4.973270142180095,
 4.138129496402878,
 5.734059945504088]

### Syllable Count Per Word
We count the number of Syllables in each word of the text by counting the vowels present in each word. We also handle some exceptions like words ending with "es","ed" by not counting them as a syllable.


In [72]:
syllable_count = []
for i in range(len(words_list)):
    count = 0
    for j in range(len(words_list[i])):        
        vowels = 'aeiouy'
        word = words_list[i][j].strip(".:;?!")
        if word[0] in vowels:
            count +=1
        for index in range(1,len(word)):
            if word[index] in vowels and word[index-1] not in vowels:
                count +=1
        if word.endswith('es'):
            count -= 1
        elif word.endswith('ed'):
            count-=1
        elif count == 0:
            count +=1
    syllable_count.append(count)

syllable_count[:5]

[2750, 1632, 2475, 2034, 2254]

### Personal Pronouns
 To calculate Personal Pronouns mentioned in the text, we use regex to find the counts of the words - “I,” “we,” “my,” “ours,” and “us”. Special care is taken so that the country name US is not included in the list.


In [73]:
presonal_pronoun = []
for i in range(len(clean_text)):
    pronounRegex = re.compile(r'I|we|my|ours|us',re.I)
    pronouns = pronounRegex.findall(clean_text[i])
    presonal_pronoun.append(len(pronouns))

presonal_pronoun[:5]

[933, 600, 875, 684, 760]

## Average Word Length
##### Average Word Length is calculated by the formula:
Sum of the total number of characters in each word/Total number of words

In [74]:
char_count=[]
for i in range(len(clean_text)):
    char = clean_text[i].replace(' ', '')
    char = len(char)
    char_count.append(char)


avg_word_len = []
for i in range(len(char_count)):
    if words_count[i] == 0 | char_count[i]==0 :
        AWC = 0
        avg_word_len.append(AWC)
    else:
        AWC = char_count[i]/words_count[i]
        avg_word_len.append(round(AWC))

avg_word_len[:5]

[9, 9, 9, 8, 8]

### Output Data Structure
#### Output Variables: 
All input variables in “Input.xlsx”
* POSITIVE SCORE
* NEGATIVE SCORE
* POLARITY SCORE
* SUBJECTIVITY SCORE
* AVG SENTENCE LENGTH
* PERCENTAGE OF COMPLEX WORDS
* FOG INDEX
* AVG NUMBER OF WORDS PER SENTENCE
* COMPLEX WORD COUNT
* WORD COUNT
* SYLLABLE PER WORD
* PERSONAL PRONOUNS
* AVG WORD LENGTH

In [75]:
input['POSITIVE SCORE'] = positive_score
input['NEGATIVE SCORE'] = negative_score
input['POLARITY SCORE'] = polarity_score
input['SUBJECTIVITY SCORE'] = subjectivity_score
input['AVG SENTENCE LENGTH'] = average_sentence_lenght
input['PERCENTAGE OF COMPLEX WORDS'] = complex_word_percentage
input['FOG INDEX'] = fog_index
input['AVG NUMBER OF WORDS PER SENTENCE'] = average_sentence_lenght
input['COMPLEX WORD COUNT'] = complex_word_count
input['WORD COUNT'] = words_count
input['SYLLABLE PER WORD'] = syllable_count
input['PERSONAL PRONOUNS'] = presonal_pronoun
input['AVG WORD LENGTH'] = avg_word_len


In [76]:
input.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/,82,36,0.389831,0.097925,16,0.434855,6.573942,16,524,1205,2750,933,9
1,38,https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/,72,37,0.321101,0.131325,10,0.295181,4.118072,10,245,830,1632,600,9
2,39,https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/,74,36,0.345455,0.104265,12,0.433175,4.97327,12,457,1055,2475,875,9
3,40,https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/,86,28,0.508772,0.117163,10,0.345324,4.138129,10,336,973,2034,684,8
4,41,https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/,71,29,0.42,0.090827,14,0.33515,5.73406,14,369,1101,2254,760,8


### Now we have to save the output in the exact order as given in the output structure file, “Output Data Structure.xlsx”
 All input variables in “Output_Data_Structure.xlsx”

In [77]:
input.to_excel("Output_Data_Structure.xlsx")