# Importing required libraries 

In [332]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import os
import requests
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from textstat import syllable_count
import string
from nltk.corpus import stopwords

In [333]:
#Creating a directory called files to store all txt files

directory_path = "C:/Users/Srushti/Documents/blackcoffer"
filesdir = "files"

directory = os.path.join(directory_path,filesdir)

if not os.path.exists(directory):
    os.makedirs(directory)

# Data Extraction

In [334]:
#Creating files for each URL in the dataframe

def createfile(url_id,url):
    try:
        response = requests.get(url)
        response.raise_for_status
        
        file = urllib.request.urlopen(url)
        contents = file.read()
        htmlfile = contents.decode() #to decode text from bytecode format

        soup = BeautifulSoup(htmlfile, 'html.parser')

        f = open(os.path.join(directory,str(url_id)+".txt"), "w")

        for data in soup.find_all("h1"):
            title = data.get_text()
            f.writelines(title)
    #         print(title)

        class_name = re.compile(r'td-post-content')

        for item in soup.find_all("div",class_=class_name):
                text = item.get_text()
                f.writelines(text)
#                 print(text)
        f.close()
    
    #Creating files with "URL not found" for cases when a page is not found
    except:    
        f = open(os.path.join(directory,str(url_id)+".txt"), "w")
        f.write("URL not found")
        f.close

In [335]:
#Removing texts under images and footers that were extracted
def removefig(filename):
    with open(filename,"r") as ipfile:
        temp = open("temp.txt","w")
        for i in ipfile:
            if "Fig:" in i or "Blackcoffer Insights" in i:
                continue
            else:
                temp.write(i)
        temp.close()
    os.remove(filename)
    os.rename("temp.txt",filename)

## Loading Input.xlsx

In [336]:
input_df = pd.read_excel("Input.xlsx")

In [337]:
input_df.head()

Unnamed: 0,URL_ID,URL
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...
1,38.0,https://insights.blackcoffer.com/what-if-the-c...
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...
3,40.0,https://insights.blackcoffer.com/will-machine-...
4,41.0,https://insights.blackcoffer.com/will-ai-repla...


## Calling the function to createfiles

In [338]:
for i in range(0,input_df.shape[0]):
    createfile(input_df["URL_ID"][i],input_df["URL"][i])

### Removing the footer and any image names

In [339]:
for filename in os.listdir(directory):
    file_path = os.path.join(directory,filename)
    removefig(file_path)
#     print(file_path)
#     print(i)

# Sentimental Analysis

### Removal of Stop words

#### Creating a list of stopwords

In [340]:
stop_words = []

# Iterate over all files in the stop words directory
stop_words_dir = os.path.join(directory_path, "StopWords")
for filename in os.listdir(stop_words_dir):
    filepath = os.path.join(stop_words_dir, filename)
    with open(filepath, 'r') as f:
#         # Read the stop words from the file and add them to the set
        for line in f:
            if "|" in line: # to remove additional information about the stopword
                newline = line.split('|')[0].strip()
#                 print(newline)
                stop_words.append(newline.strip())
            else:
                stop_words.append(line.strip())



In [341]:
# Remove the stop words given in the folder from your text in each files
def removestopwords(filename):
    with open(filename,"r") as file:
        #Creating a temporary file to remove stopwords
            with open("temp.txt","w") as temp:
                text = file.read()
                words = text.split()
                
                #remove stop words
                filtered_words = [word.lower() for word in words if word.upper() not in stop_words]
                filtered_text = ' '.join(filtered_words)
                
            #     print(len(words))
            #     print(len(filtered_words))
#                 print(filtered_text)
                #Write to the temporary file
                temp.write(filtered_text)
    os.remove(filename) #delete the file
    os.rename("temp.txt",filename)#rename temporary file with filename


### Calling the function to clean using stop words

In [342]:
for filename in os.listdir(directory):
    file_path = os.path.join(directory,filename)
#     print(file_path)
    removestopwords(file_path)

### Creating dictionary of positive and negative words

In [343]:
dictionary = {}

In [344]:
posneg_path = os.path.join(directory_path, "MasterDictionary")


for filename in os.listdir(posneg_path):
    file_path = os.path.join(posneg_path,filename)
    with open(file_path,"r") as file:
        text = file.read()
        words = text.split()
        #words from "negative-words.txt" is added to the dictionary with 
        #its key value as "negative" else add it under the key value "positive"
        if filename == "negative-words.txt":
            dictionary["negative"]=words
        else:
            dictionary["positive"]=words    

In [345]:
#Removing stopwords (if any) from the dictionary

for i in dictionary:
    for j in dictionary[i]:
#         print(j)
        if j in stop_words:
            dictionary[i].remove(j)

#### Creating lists to store values calulated below

In [346]:
total_words = []
total_sent = []
complex_words = []
words_count = []
syllable = []
personal_pro = []
average_word_length = []
words_per_sent = []
average_words_per_sentence = []

### Extracting Derived Variables

In [347]:
positive=[]
negative=[]

In [348]:
def posneg_score(filename):
    pos=0
    neg=0
    with open(filename,"r") as file:
        text = file.read()
        tokens = word_tokenize(text)
        total_words.append(len(tokens))
        
        for token in tokens:
            if token in dictionary["positive"]:
                pos+=1
            elif token in dictionary["negative"]:
                neg+=1
#         print(neg)
#         print(pos)  
        positive.append(pos)
        negative.append(neg)

### Counting number of sentence in each file

In [349]:
def count_sentences(filename):
    with open(filename,"r") as file:
        text = file.read()
        sentences = sent_tokenize(text)
        total_sent.append(len(sentences))

## Analysis of readbility

In [350]:
#To check if a word is complex word
def is_complex_word(word):
    #Use function "syllable_count" from nltk library to find the syllables in a word
    syllables = syllable_count(word)
    if syllables >= 2:
        return True
    else:
        return False

#Count number of complex words
def complex_words_count(filename):
    complexw = 0
    with open(filename,"r") as file:
        text = file.read()
        #Tokenize the text
        tokens = word_tokenize(text)
        for token in tokens:
            if is_complex_word(token):
                complexw+=1
        complex_words.append(complexw)

## Average Number of Words Per Sentence

In [351]:
#Calculate avgerage words per sentence for each file
def avg_words_per_sentence(filename):
    with open(filename, 'r') as file:
        text = file.read()
        #Toknize sentences using "sent_tokenize" fro nltk library
        sentences = sent_tokenize(text)
        total_words = 0
        
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            total_words += len(words)
        
        #Calulating average
        avg_words = total_words / len(sentences)
        average_words_per_sentence.append(avg_words)

## Word Count

In [352]:
#Counting all the words in a file and storing them in words_count array as
def word_count(filename):
    with open(filename,"r") as file:
        with open("temp.txt","w") as temp:
            text = file.read()
            text = text.translate(str.maketrans('', '', string.punctuation))
            tokens = word_tokenize(text)

            #Removing stopwords using stopwords class of nltk library
            stop_words = set(stopwords.words('english'))
            filtered_tokens = [token for token in tokens if token not in stop_words]
            words_count.append(len(tokens))
            
            # join the filtered tokens into a string
            filtered_text = ' '.join(filtered_tokens)
            temp.write(filtered_text)
            
    os.remove(filename) #delete the file
    os.rename("temp.txt",filename) #remaneing the temporary file with filename

## Syllable Count Per Word

In [353]:
#Count syllables per word for each file and store in "syllable" list
def syllables_count(filename):
    count_syllables = 0 
    
    #Here we consider syllables by counting the number of vowels 
    #unlike the previous time while calulating complex words, 
    #where we used syllables_count fucntionfrom inbuilt library
    
    vowels = "aeiou"
    with open(file_path, 'r') as f:
        text = f.read()
        # Count syllables for each word
        words = text.split()
        for word in words:
            #ignoring words that end with "ed" or "es"
            if word[-2:] in ['es', 'ed']:
                continue
            else:
                for letter in word:
                    if letter in vowels:
                        count_syllables+=1
        
        #calulating average
        avg_syllables = count_syllables/len(words)
        syllable.append(avg_syllables)

## Personal Pronouns

In [354]:
#Count occurences of personal pronouns
def personal_pronouns(filename):
    with open(filename, 'r') as file:
        text = file.read()
        #use regex to find counts of words with I,we,my,ours,us
        counts = {
            "I": len(re.findall(r"\bI\b", text)),
            "we": len(re.findall(r"\bwe\b", text)),
            "my": len(re.findall(r"\bmy\b", text)),
            "ours": len(re.findall(r"\bours\b", text)),
            "us": len(re.findall(r"\bus\b", text, re.IGNORECASE))
        }
        
        # Exclude US from the counts
        if 'US' in text:
            counts['us'] -= len(re.findall(r"\bUS\b", text))
        personal_pro.append(sum(counts.values()))


## Average Word Length

In [355]:
#Calulate average word length for each file
def avg_word_length(filename):
    with open(filename, "r") as file:
        text = file.read()
        words = text.split()
        word_len = [len(word) for word in words]
        
        #average
        avg_word_len = sum(word_len)/len(words)
        average_word_length.append(avg_word_len)


### Creating a list of file names in order

In [356]:
filenames = [str(url)+".txt" for url in input_df["URL_ID"]]

## Calling all the above functions

In [357]:
for filename in filenames:
    file_path = os.path.join(directory,filename)
    posneg_score(file_path)
    count_sentences(file_path)
    avg_words_per_sentence(file_path)
    word_count(file_path)
    complex_words_count(file_path)
    avg_word_length(file_path)
    syllables_count(file_path)
    personal_pronouns(file_path)
    
    

## Calculating polarity, subjectivity, average sentence length, complex percent, fog index

In [358]:
polarity = []
subjectivity = []
average_sentence_length = []
complex_percent = []
fog_index = []


In [359]:
for i in range(0,len(positive)):
    res = (positive[i]-negative[i])/((positive[i]+negative[i])+0.000001)
    polarity.append(res)
    
    subject = (positive[i]+negative[i])/(total_words[i]+0.000001)
    subjectivity.append(subject)
    
    avglen = total_words[i]/total_sent[i]
    average_sentence_length.append(avglen)
    
    comp = complex_words[i]/total_words[i]
    complex_percent.append(comp)
    
    fogin = 0.4*(avglen + comp)
    fog_index.append(fogin)
    
    

## Adding all values to the input dataframe

In [360]:
input_df["POSITIVE SCORE"] = positive
input_df["NEGATIVE SCORE"] = negative
input_df["POLARITY SCORE"] = polarity
input_df["SUBJECTIVITY SCORE"] = subjectivity
input_df["AVG SENTENCE LENGTH"] = average_sentence_length
input_df["PERCENTAGE OF COMPLEX WORDS"] = complex_percent
input_df["FOG INDEX"] = fog_index
input_df["AVG NUMBER OF WORDS PER SENTENCE"] = average_words_per_sentence
input_df["COMPLEX WORD COUNT"] = complex_words
input_df["WORD COUNT"] = words_count
input_df["SYLLABLE PER WORD"] = syllable
input_df["PERSONAL PRONOUNS"] = personal_pro
input_df["AVG WORD LENGTH"] = average_word_length

In [361]:
input_df.head(20)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,64,32,0.333333,0.072508,17.421053,0.602719,7.209509,17.421053,798,1129,2.375345,1,7.333027
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,60,38,0.22449,0.099695,12.443038,0.422177,5.146086,12.443038,415,794,2.042313,3,6.593794
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,65,37,0.27451,0.085,14.117647,0.595,5.885059,14.117647,714,1007,2.348517,2,7.353814
3,40.0,https://insights.blackcoffer.com/will-machine-...,68,28,0.416667,0.093023,10.863158,0.506783,4.547976,10.863158,523,867,2.295792,3,6.621287
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,58,25,0.39759,0.067535,15.556962,0.510171,6.426853,15.556962,627,1045,2.166667,8,6.753715
5,42.0,https://insights.blackcoffer.com/man-and-machi...,45,22,0.343284,0.074527,14.983333,0.483871,6.186882,14.983333,435,746,2.170623,3,6.734421
6,43.0,https://insights.blackcoffer.com/in-future-or-...,22,11,0.333333,0.064579,11.355556,0.540117,4.758269,11.355556,276,411,2.023499,2,6.874674
7,44.0,https://insights.blackcoffer.com/how-neural-ne...,0,0,0.0,0.0,2.0,0.0,0.8,2.0,0,2,1.5,0,4.0
8,45.0,https://insights.blackcoffer.com/how-machine-l...,36,13,0.469388,0.10041,13.942857,0.485656,5.771405,13.942857,237,422,1.984169,0,6.279683
9,46.0,https://insights.blackcoffer.com/deep-learning...,66,35,0.306931,0.071835,17.575,0.543385,7.247354,17.575,764,1207,2.296496,1,6.791554


## Storing the dataframe in a csv file

In [362]:
input_df.to_csv("Output.csv",index=False)