## Importing all the necessary libraries

In [18]:
import requests
from bs4 import BeautifulSoup
import urllib
import pandas as pd
import sys
import re
import string
import os
import nltk
nltk.download('punkt')  #Run this only one time to download model in system
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/anonymous/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Loading  input data

In [26]:
path = str(os.getcwd())
input_data = pd.read_excel(str(path)+'/Input.xlsx')
med_data = pd.read_excel(str(path)+'/Input.xlsx')

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...


## Reading the title and text of the articles from url and saving in a text file

In [17]:
for i in range(0,len(input_data)):
    url = dict(input_data['URL'][i:i+1])
    page = requests.get(url[i], headers = {'User-Agent': 'Chrome'}) # Here we get the html contents of web page
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find('title')  # This will get the title of the article
    title = title.text
    file_name = list(input_data['URL_ID'][i:i+1])
    f = open('files/'+str(file_name[0]) + '.txt', 'w+', encoding = 'utf-8') # here we are creating text file wit url_id as its name and writing title and paragragh in the file
    f.write(title +'\n')
    for para in soup.find_all('p'):   # This will get all the paragraphs in the article
        f.write(para.text)
    f.close()

## Creating a list of all the stopwords

In [27]:
final_stopwords = []
# Here we crete lists of all the stop words from differnt files and then creating a final list containing all the stopword lists.
stopwords_files = ['Auditor','Currencies','DatesandNumbers','Generic','GenericLong','Geographic','Names']
for i in range(0,len(stopwords_files)):
    f = open(str(path)+'/StopWords/StopWords_'+stopwords_files[i]+'.txt','r',encoding='latin-1')
    stopwords_files[i] = str(f.read())
    stopwords_files[i] = stopwords_files[i].replace('\n',',')
    str_found = re.findall(r'\|(.*?)[\n|,]', stopwords_files[i])  # filtering the text file
    stopwords_files[i] = stopwords_files[i].replace('|','')
    stopwords_files[i] = stopwords_files[i].lower()
    for j in str_found:                                             # Removing unnecessary items.
        if j in stopwords_files[i]:
            stopwords_files[i] = stopwords_files[i].replace(j,'')
        else:
            continue
    stopwords_files[i] = list(stopwords_files[i].split(','))   
    final_stopwords+=(stopwords_files[i])    # Final stopwords list

## Creating a list of positive and negative words

In [10]:
masterwords_files = ['negative','positive']
positive = ''
negative = ''
for i in range(0, len(masterwords_files)):
    f = open(str(path)+'/MasterDictionary/'+masterwords_files[i]+'-words.txt','r',encoding='latin-1')
    if i == 0:
        negative = str(f.read())
        negative = negative.lower()
        negative = list(negative.split())   # Negative words list
    else:
        positive = str(f.read())
        positive = positive.lower()
        positive = list(positive.split())   # Positive words lists
    f.close() 

## Storing article text in med df

In [16]:
data = pd.Series(dtype = 'str')
for i in range(0,len(input_data)):
    file_name = list(input_data['URL_ID'][i:i+1])
    f = open('files/'+str(file_name[0]) + '.txt', 'r+', encoding = 'utf-8')
    article = str(f.read())
    article = article.lower()
    m = pd.Series(article)
    data = pd.concat([data, m])
    f.close()
data.index = [i for i in range(0,len(input_data))]
med_data['articles'] = data


Unnamed: 0,URL_ID,URL,articles
0,37,https://insights.blackcoffer.com/ai-in-healthc...,ai in healthcare to improve patient outcomes -...
1,38,https://insights.blackcoffer.com/what-if-the-c...,what if the creation is taking over the creato...
2,39,https://insights.blackcoffer.com/what-jobs-wil...,what jobs will robots take from humans in the ...
3,40,https://insights.blackcoffer.com/will-machine-...,will machine replace the human in the future o...
4,41,https://insights.blackcoffer.com/will-ai-repla...,will ai replace us or work with us? - blackcof...


## Calculating all the necessary variables

In [21]:
for i in range(0, len(med_data)):
    filtered_words = []
    pos_words = {positive[i]: 0 for i in range(0, len(positive))}   # dictionary of positve words 
    neg_words = {negative[i]: 0 for i in range(0, len(negative))}   # dictionary of  negative words
    tokenized_sentences = sent_tokenize(med_data['articles'][i])    # creating list of sentences
    tokenized_words = word_tokenize(med_data['articles'][i])        # creating list of words
    for w in tokenized_words:           # Removing all the stopwords from the tokenized words.
        if w not in final_stopwords:
            filtered_words.append(w) 
    for pos in positive:                  # Calcutaing how many times a positive word occur in the filtered words
        for fil1 in filtered_words:
            if pos == fil1:
                pos_words[pos] += 1
            continue
    pos_score = sum(pos_words.values())    # Total number of positive words in then article
    med_data.loc[i:, 'Positive Score'] = pos_score
   
    for neg in negative:                  # Calcutaing how many times a negative word occur in the filtered words
        for fil2 in filtered_words:
            if neg == fil2:
                neg_words[neg] += 1
                continue
    neg_score = sum(neg_words.values())    # Total number of negative words in then article
    med_data.loc[i:, 'Negative Score'] = neg_score
    med_data.loc[i:, 'Polarity'] =  (pos_score - neg_score)/((pos_score + neg_score) + 0.000001)    # Polarity of the article
    med_data.loc[i:, 'Subjectivity Score'] = (pos_score + neg_score)/(len(filtered_words) + 0.000001)
    average_sentence_length = int(len(filtered_words)/len(tokenized_sentences))
    med_data.loc[i:, 'Average Sentence Length'] = average_sentence_length
    complex_count = 0            # Calculating the number of complex words
    total_syl = 0
    vowels = ['a','e','i','o','u']
    for t in filtered_words:
        syl = 0
        for u in t:
            for v in vowels:
                if u == v:
                    syl +=1
                else:
                    continue
        total_syl += syl
        if syl > 2:
            complex_count += 1
        else:
            continue
    percent = (complex_count/len(filtered_words))*100   # %age of complex words in the article
    med_data.loc[i:, 'Percentage Of Complex Words'] = percent
    fog = 0.4 * (average_sentence_length + percent)   # Fogging index value for each article
    med_data.loc[i:, 'Fog Index'] = fog
    med_data.loc[i:, 'Average Number of Words Per Sentence'] = int(len(filtered_words)/len(tokenized_sentences))
    med_data.loc[i:, 'Complex Word Count'] = complex_count  # total complex words in the article
    filtered_string = ' '.join([str(item) for item in filtered_words])    # Removing the punctuation from the article to calculate the word count"""
    exclude = set(string.punctuation)
    filtered_article = ''.join(ch for ch in filtered_string if ch not in exclude)
    filtered_article = list(filtered_article.split(' '))
    med_data.loc[i:, 'Word Count'] = len(filtered_article)      # total words in the article without the punctuation
    med_data.loc[i:, 'Syllable Count Per Word'] = int(total_syl/ len(filtered_words))
    pronounRegex = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)  # Calculating the number of times personal pronouns are used in the article"""
    pronouns = pronounRegex.findall(med_data['articles'][i])
    med_data.loc[i:, 'Personal Pronouns'] = len(pronouns)
    characters = 0
    for i in filtered_words:
        characters += len(i)
    med_data.loc[i:, 'Average Word Length'] = characters/len(filtered_words)

In [23]:
output_data = pd.DataFrame(med_data)
output_data.drop('articles',axis= 1, inplace = True)

Unnamed: 0,URL_ID,URL,Positive Score,Negative Score,Polarity,Subjectivity Score,Average Sentence Length,Percentage Of Complex Words,Fog Index,Average Number of Words Per Sentence,Complex Word Count,Word Count,Syllable Count Per Word,Personal Pronouns,Average Word Length
0,37,https://insights.blackcoffer.com/ai-in-healthc...,66.0,30.0,0.375,0.074419,23.0,51.472868,29.789147,23.0,664.0,1290.0,2.0,3.0,6.129206
1,38,https://insights.blackcoffer.com/what-if-the-c...,59.0,36.0,0.242105,0.103373,13.0,40.043526,21.21741,13.0,368.0,919.0,2.0,9.0,6.129206
2,39,https://insights.blackcoffer.com/what-jobs-wil...,68.0,36.0,0.307692,0.090043,16.0,51.082251,26.8329,16.0,590.0,1155.0,2.0,5.0,6.129206
3,40,https://insights.blackcoffer.com/will-machine-...,62.0,22.0,0.47619,0.091205,11.0,43.10532,21.642128,11.0,397.0,921.0,2.0,20.0,6.129206
4,41,https://insights.blackcoffer.com/will-ai-repla...,57.0,23.0,0.425,0.071749,16.0,41.345291,22.938117,16.0,461.0,1115.0,2.0,20.0,6.129206


In [25]:
file_name = 'Output Data Structure.xlsx'  
#saving in the excel format
output_data.to_excel(file_name)
print('DataFrame is written to Excel File successfully.')

DataFrame is written to Excel File successfully.
