In [None]:
# Mount Google Drive at /content/drive (Colab only); the input workbook, stop-word files
# and sentiment dictionaries read by the later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q xlrd
import requests
import string
import re
import os
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
from google.colab import files



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# load the input workbook (one row per article: URL_ID + URL) into a dataframe
df = pd.read_excel('/content/drive/MyDrive/20211030 Test Assignment/Input.xlsx')
l = len(df)  # number of input rows, i.e. number of URLs to process
df

Unnamed: 0,URL_ID,URL
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...
1,38.0,https://insights.blackcoffer.com/what-if-the-c...
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...
3,40.0,https://insights.blackcoffer.com/will-machine-...
4,41.0,https://insights.blackcoffer.com/will-ai-repla...
...,...,...
109,146.0,https://insights.blackcoffer.com/blockchain-fo...
110,147.0,https://insights.blackcoffer.com/the-future-of...
111,148.0,https://insights.blackcoffer.com/big-data-anal...
112,149.0,https://insights.blackcoffer.com/business-anal...


In [None]:
# reading all the stopwords into a list
# Only the first whitespace-separated token of every line is kept (lower-cased);
# anything after it on the same line is ignored.
StopWordsList = []
punc_list = list(string.punctuation) # list of punctuations
folder_path = '/content/drive/MyDrive/20211030 Test Assignment/StopWords'
for file in os.listdir(folder_path):
  with open(os.path.join(folder_path, file), 'r', encoding = "ISO-8859-1") as f:
    for line in f:
      fields = line.split(None, 1)
      # a blank / whitespace-only line splits into an empty list; indexing it
      # with [0] would raise IndexError, so such lines are skipped
      if fields:
        StopWordsList.append(fields[0].lower())

In [None]:
# reading list of positive words from the MasterDictionary
# `with` guarantees the file is closed even if reading fails.
# NOTE(review): no encoding is given here, so the platform default is used, whereas the
# negative list below is read as ISO-8859-1 - confirm the real encoding of this file.
with open('/content/drive/MyDrive/20211030 Test Assignment/MasterDictionary/positive-words.txt','r') as file_pos:
  pos_words = file_pos.read().split()

# reading list of negative words from the MasterDictionary
with open('/content/drive/MyDrive/20211030 Test Assignment/MasterDictionary/negative-words.txt','r', encoding = "ISO-8859-1") as file_neg:
  neg_words = file_neg.read().split()

In [None]:
# function to count the tokens of the article text that appear in the positive word list
def fn_positive_count(filtered_text, word_list=None):
  """Return the positive score: how many tokens of `filtered_text` are positive words.

  filtered_text: iterable of string tokens.
  word_list: optional iterable of positive words; when omitted the module-level
    `pos_words` list is used, which is what existing callers rely on.
  """
  if word_list is None:
    word_list = pos_words
  # a set gives O(1) membership tests instead of scanning the whole list per token
  lookup = set(word_list)
  return sum(1 for token in filtered_text if token in lookup) # positive score

# function to count the tokens of the article text that appear in the negative word list
def fn_negative_count(filtered_text, word_list=None):
  """Return the negative score: how many tokens of `filtered_text` are negative words.

  filtered_text: iterable of string tokens.
  word_list: optional iterable of negative words; when omitted the module-level
    `neg_words` list is used, which is what existing callers rely on.
  """
  if word_list is None:
    word_list = neg_words
  # a set gives O(1) membership tests instead of scanning the whole list per token
  lookup = set(word_list)
  return sum(1 for token in filtered_text if token in lookup) # negative score

# polarity score: balance between the positive and the negative score
def fn_polarity_score(pos, neg):
  """Return (pos - neg) / (pos + neg + 0.000001).

  The small constant in the denominator keeps the division defined when both
  scores are zero (the result is then 0.0).
  """
  balance = pos - neg
  denominator = (pos + neg) + 0.000001
  return balance / denominator

# subjectivity score: share of the cleaned words that carry sentiment
def fn_subjectivity_score(pos, neg, tot_cleaned_words):
  """Return (pos + neg) / (tot_cleaned_words + 0.000001).

  The small constant in the denominator keeps the division defined when the
  cleaned text contains no words.
  """
  sentiment_words = pos + neg
  denominator = (tot_cleaned_words) + 0.000001
  return sentiment_words / denominator

# function to get syllable count of each word of article
def fn_syllable_count(word):
  """Estimate the number of syllables in `word` with a vowel-group heuristic.

  Every run of consecutive vowels (a, e, i, o, u, y) counts as one syllable.
  One syllable is subtracted for a trailing "e" and for the "-es" / "-ed"
  endings. Any non-empty word has at least one syllable.

  Returns 0 for an empty string. The word is lower-cased first so that
  upper-case vowels are recognised too.
  """
  word = word.lower()
  if not word:
    # the previous implementation indexed word[0] and raised IndexError here
    return 0

  vowels = "aeiouy"
  count = 0
  previous_is_vowel = False
  for char in word:
    is_vowel = char in vowels
    if is_vowel and not previous_is_vowel: # first vowel of a vowel group
      count += 1
    previous_is_vowel = is_vowel

  if word.endswith("e"):
    count -= 1
  elif word.endswith(("es", "ed")): # handling -es & -ed exceptions
    # endswith on the two-letter suffix is safe for one-letter tokens such as
    # "s" or "d", where the former word[-2] lookup raised IndexError
    count -= 1

  return max(count, 1)

# complex words are words with more than 2 syllables
def fn_complex_word_count(cleaned_text_list):
  """Return (complex_word_count, total_syllable_count) for a list of words.

  complex_word_count: number of words whose syllable count exceeds 2.
  total_syllable_count: syllables summed over every word in the list.
  """
  syllables = [fn_syllable_count(word) for word in cleaned_text_list]
  complex_total = sum(1 for per_word in syllables if per_word > 2)
  return complex_total, sum(syllables)

# function to get percentage of complex words
def fn_percent_complex(complex_words, total_words):
  """Return complex_words / total_words.

  The value is a ratio (it is not multiplied by 100). When `total_words` is 0
  the function returns 0.0 instead of raising ZeroDivisionError.
  """
  if total_words == 0:
    return 0.0
  return (complex_words/total_words)

# fog index = 0.4 * (average sentence length + percentage of complex words)
def fn_fog_index(avg_sentence, percent_complex):
  """Return 0.4 times the sum of the two arguments.

  Both values are used exactly as passed in; no scaling is applied here.
  """
  combined = avg_sentence + percent_complex
  return 0.4 * combined

# function to get average words per sentence using total words & total sentences
def fn_avg_words_sentence(total_words, total_sent):
  """Return total_words / total_sent.

  When `total_sent` is 0 the function returns 0.0 instead of raising
  ZeroDivisionError.
  """
  if total_sent == 0:
    return 0.0
  return (total_words/total_sent)

# count the personal pronouns (I, we, my, ours, us) that occur in the text
def fn_personal_pronouns(text):
  """Return how many whole-word personal pronouns `text` contains.

  "I" is matched only in upper case; the other pronouns are matched with a
  lower- or upper-case first letter, so an all-caps "US" is not counted.
  """
  matches = re.findall(r'\b(I|[Ww]e|[Mm]y|[Oo]urs|[Uu]s)\b', text)
  return len(matches)

In [None]:
# empty results table: one column per output variable, rows are added per article
output_df = pd.DataFrame(
    columns=[
        # article identity
        'URL_ID', 'URL',
        # sentiment
        'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
        # readability
        'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
        'AVG NUMBER OF WORDS PER SENTENCE',
        # word-level statistics
        'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD',
        'PERSONAL PRONOUNS', 'AVG WORD LENGTH',
    ]
)

In [None]:
# folder to store all scraped articles
folder_path = "articles/"
os.makedirs(folder_path, exist_ok=True)

# request settings are built once instead of on every loop iteration
REQUEST_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server would hang the whole run


def _fetch_article(url, url_id):
  """Download one article and return its (title, body) text, or None if unavailable.

  None is returned (after printing a message) when the request itself fails or
  when the page lacks the 'entry-title' heading or the 'td-post-content'
  container, which is how missing articles show up on the site.
  """
  try:
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
  except requests.RequestException as exc:
    print(f"Article {url_id}: request failed ({exc}); skipping.")
    return None

  soup = BeautifulSoup(response.text, 'lxml')
  title_tag = soup.find('h1', class_ = 'entry-title')
  body_tag = soup.find('div', class_ = 'td-post-content')
  if title_tag is None or body_tag is None:
    # report the real status code rather than assuming every failure is a 404
    print(f"Article {url_id} not found (HTTP {response.status_code}); skipping.")
    return None
  return title_tag.text, body_tag.text


def _analyse_article(text, stop_words, punctuation):
  """Compute every text metric for one article.

  text: full article text (title + body).
  stop_words / punctuation: collections of tokens to drop before counting.

  Returns a dict keyed by the output column names (without URL_ID / URL), or
  None when the text has no words left after cleaning or no sentences, since
  every average would then divide by zero.
  """
  # using nltk word and sentence tokenizer to get tokens of word & sentences in lists
  tokens = [token.lower() for token in word_tokenize(text)]
  sent_tokens = sent_tokenize(text)

  # removing all stopwords from the word tokens list to get filtered text
  filtered_text = [token for token in tokens if token not in stop_words]

  # removing all punctuations to get cleaned text
  # NOTE(review): only tokens that are exactly one punctuation character are removed;
  # multi-character tokens such as `` or '' still count as words - confirm this is intended.
  cleaned_text_list = [token for token in filtered_text if token not in punctuation]
  total_words = len(cleaned_text_list)
  if total_words == 0 or not sent_tokens:
    return None

  total_char_length = sum(len(word) for word in cleaned_text_list)
  pos_count = fn_positive_count(filtered_text)
  neg_count = fn_negative_count(filtered_text)
  complex_word_count, total_syllable_count = fn_complex_word_count(cleaned_text_list)
  # both sentence-length columns are defined as words per sentence, so compute it once
  avg_sentence_len = total_words / len(sent_tokens)
  percent_complex_count = fn_percent_complex(complex_word_count, total_words)

  return {
    'POSITIVE SCORE': pos_count,
    'NEGATIVE SCORE': neg_count,
    'POLARITY SCORE': fn_polarity_score(pos_count, neg_count),
    'SUBJECTIVITY SCORE': fn_subjectivity_score(pos_count, neg_count, total_words),
    'AVG SENTENCE LENGTH': avg_sentence_len,
    'PERCENTAGE OF COMPLEX WORDS': percent_complex_count,
    'FOG INDEX': fn_fog_index(avg_sentence_len, percent_complex_count),
    'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_len,
    'COMPLEX WORD COUNT': complex_word_count,
    'WORD COUNT': total_words,
    'SYLLABLE PER WORD': total_syllable_count / total_words,
    'PERSONAL PRONOUNS': fn_personal_pronouns(text),
    'AVG WORD LENGTH': total_char_length / total_words,
  }


# sets give O(1) membership tests for the per-token filtering
stop_words = set(StopWordsList)
punctuation = set(punc_list)

# rows are collected in a list and turned into a dataframe once at the end,
# instead of growing the dataframe one row at a time
records = []

# for loop to iterate through all the url articles
for url, raw_id in zip(df["URL"], df["URL_ID"]):
  url_id = str(int(raw_id))

  article = _fetch_article(url, url_id)
  if article is None:
    continue
  title, body = article

  # the newline keeps the last word of the title from fusing with the first word of the body
  data = title + "\n" + body

  # keep a copy of every scraped article on disk
  with open(folder_path + url_id, "w", encoding="utf-8") as article_file:
    article_file.write(data)

  metrics = _analyse_article(data, stop_words, punctuation)
  if metrics is None:
    print(f"Article {url_id} has no analysable text; skipping.")
    continue

  records.append({'URL_ID': url_id, 'URL': url, **metrics})

# filling in the dataframe with all the output variables (column order comes from output_df)
output_df = pd.DataFrame(records, columns=output_df.columns)
output_df.to_excel('Output Data Structure.xlsx', index=False)
# to download the result from Colab: files.download('Output Data Structure.xlsx')

404 Article Not Found.
404 Article Not Found.
404 Article Not Found.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>