In [1]:
import os
import requests
import pandas as pd 
import openpyxl
from bs4 import BeautifulSoup
import string
import re
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91836\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91836\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data extraction

### Extracting data from input file url and saving it into url_id.txt

In [3]:
file = 'Input.xlsx'
data = pd.ExcelFile(file)
df = data.parse('Sheet1')

ps = openpyxl.load_workbook('Input.xlsx')
sheet = ps['Sheet1']
no_of_articles=0
for row in range(2, sheet.max_row + 1):
    url = sheet['B' + str(row)].value
    url_id = int(sheet['A' + str(row)].value)
    # Each value in a cell is represented by a column letter and a row number. So #the first element in the sheet is B1, next column C1 and so on. This enables #to iterate over the entire cells.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0"}
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        title=soup.find('h1',class_="entry-title")
        title=title.text.replace('\n',"")
    except AttributeError:
        title=""
    try:
        content=soup.findAll(attrs={'class':'td-post-content'})
        content=content[0].text.replace('\n',"")
    except (AttributeError,IndexError) as e :
        content=""
    no_of_articles+=1
    f = open("articles/{}.txt".format(url_id),"w", encoding="utf-8")
    f.write(title+'\n')
    f.write(content)
    f.close()



# Text Analysis

In [6]:
#Extracting all the stopwords

with open("StopWords/StopWords_Auditor.txt","r") as s1,open("StopWords/StopWords_Currencies.txt","r") as s2,open("StopWords/StopWords_DatesandNumbers.txt","r") as s3,open("StopWords/StopWords_Generic.txt","r") as s4,open("StopWords/StopWords_GenericLong.txt","r") as s5,open("StopWords/StopWords_Geographic.txt","r") as s6,open("StopWords/StopWords_Names.txt","r") as s7:
    s1,s2,s3,s4,s5,s6,s7 = s1.read(),s2.read(),s3.read(),s4.read(),s5.read(),s6.read(),s7.read()
    s = s1+s2+s3+s4+s5+s6+s7
stop_words = s.split("\n")

#Creating a dictionary of Positive and Negative words

with open("C:/Users/91836/data_science_proj/black cofer/MasterDictionary/positive-words.txt","r") as pos:
    positive_dictionary = pos.read().split("\n") 
    
with open("C:/Users/91836/data_science_proj/black cofer/MasterDictionary/negative-words.txt","r") as neg:
    negative_dictionary = neg.read().split("\n")
            

### Useful functions :

In [7]:

def remove_stopwords(words, stop_words):
        return [word for word in words if word not in stop_words]

def count(dictionary, words):
    count = 0
    for x in words:
        if(x in dictionary):
            count+=1
    return count

def polarity(positive_score, negative_score):
    return ((positive_score - negative_score)/((positive_score + negative_score)+ 0.000001))

def subjectivity(positive_score, negative_score, num_words):
    return ((positive_score+negative_score)/(num_words+ 0.000001))

def syllable_count(word):
    if(len(word) > 2 and (word[-2:] == 'es' or word[-2:] == 'ed')):
        return 0
    count=0
    vowel=['a','e','i','o','u']
    for x in word:
        if x in vowel:
            count+=1
    return count 
    
def complex_words(words):
    num_complexword=0
    for word in words:
         if(syllable_count(word)>2):
                        num_complexword = num_complexword+1
    return num_complexword

def personal_pronouns(text):
    pronounList = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)
    pronouns = pronounList.findall(text)
    return(len(pronouns))      

def clean_words(text):
    #Remove Punctuation
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    tokens= word_tokenize(text)
    
    #removing the stop words (using stopwords class of nltk package)
    my_stop_words = stopwords.words('english')
    final_words = [token for token in tokens if  token not in my_stop_words]
    
    return(len(final_words))

def file_not_empty(file_name):
    return os.path.exists(file_name) and os.stat(file_name).st_size > 5

In [8]:
#Creating dataframe
df_cols=["URL_ID","URL","POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE","SUBJECTIVITY SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS","FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT","WORD COUNT","SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"]
df = pd.DataFrame(columns=df_cols)
df

### Scraping .txt files for text analysis of each article


In [9]:
row=2
for i in range(no_of_articles):
    
    #All input variables in “Input.xlsx”
    url_id = int(sheet['A' + str(row)].value)
    url = sheet['B' + str(row)].value
    row+=1
    
    file_path='articles/{}.txt'.format(url_id)
    
    if(file_not_empty(file_path)):
        
        #Extracting article from saved files 
        txtFile=open(file_path,'r',encoding="utf-8")
        wholeF=txtFile.readlines()
        title=wholeF[0]
        content=wholeF[1]

        #Tokenization
        word_tokens = word_tokenize(content)

        #removing stop words
        words = remove_stopwords(word_tokens, stop_words)
        num_words = len(words)


        sentences = sent_tokenize(content)
        num_sentences = len(sentences)


        #Extracting Derived variables

        positive_score = count(positive_dictionary, words)

        negative_score = count(negative_dictionary, words)

        polarity_score = polarity(positive_score, negative_score)

        subjectivity_score = subjectivity(positive_score, negative_score, num_words)


        #Analysis of Readability

        average_sentence_length = num_words/num_sentences

        num_complexwords= complex_words(words)
        percentage_complexwords = (num_complexwords/num_words)

        fog_index =  0.4*(average_sentence_length + percentage_complexwords)


        #Average Number of Words Per Sentence

        avg_no_of_words_per_sentence = num_words/num_sentences


        #Word Count of  word without stopwords and punctutaion
        word_count = clean_words(content)

        # SYLLABLE PER WORD
        total_syllable_count=0
        for word in words:
            total_syllable_count+= syllable_count(word)

        syllable_per_word = total_syllable_count/num_words

        # Personal Pronouns
        pronouns= personal_pronouns(content)  

        # Average Word Length    
        average_word_length=sum(len(word) for word in words)/num_words

        data=[url_id,url,positive_score,negative_score,polarity_score,subjectivity_score,average_sentence_length,percentage_complexwords,fog_index,avg_no_of_words_per_sentence,num_complexwords,word_count,syllable_per_word,pronouns,average_word_length]
        df.loc[i]=data
    else:
        print("{}.txt is empty".format(url_id))
        data=[url_id,url,0,0,0,0,0,0,0,0,0,0,0,0,0]
        df.loc[i]=data

44.txt is empty
57.txt is empty
144.txt is empty


In [10]:
df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,62,31,0.333333,0.074940,23.415094,0.406124,9.528487,23.415094,504,1171,2.040290,1,6.514102
1,38,https://insights.blackcoffer.com/what-if-the-c...,55,36,0.208791,0.103881,12.882353,0.262557,5.257964,12.882353,230,810,1.521689,7,5.294521
2,39,https://insights.blackcoffer.com/what-jobs-wil...,65,34,0.313131,0.088078,16.776119,0.375445,6.860626,16.776119,422,1036,1.936833,3,6.258007
3,40,https://insights.blackcoffer.com/will-machine-...,56,22,0.435897,0.083601,11.961538,0.317256,4.911518,11.961538,296,964,1.787781,17,5.673098
4,41,https://insights.blackcoffer.com/will-ai-repla...,49,23,0.361111,0.063158,16.764706,0.301754,6.826584,16.764706,344,1052,1.739474,16,5.684211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,20,28,-0.166667,0.078947,16.432432,0.315789,6.699289,16.432432,192,566,1.824013,9,6.245066
110,147,https://insights.blackcoffer.com/the-future-of...,40,12,0.538462,0.046018,23.061224,0.293805,9.342012,23.061224,332,998,1.644248,2,5.572566
111,148,https://insights.blackcoffer.com/big-data-anal...,26,44,-0.257143,0.086207,13.311475,0.359606,5.468433,13.311475,292,713,1.865764,2,5.697044
112,149,https://insights.blackcoffer.com/business-anal...,29,4,0.757576,0.071895,25.500000,0.437908,10.375163,25.500000,201,432,2.198257,0,6.989107


### Saving the computed variables in output.csv file

In [11]:
df.to_csv("Output.csv",index=False)