# Downloading necessary dependencies

In [44]:
# One-time setup: uncomment the lines below and run this cell on the first run only.

# %pip install contractions
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# %pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[K     |████████████████████████████████| 105 kB 5.1 MB/s 
[?25hCollecting pyphen
  Downloading pyphen-0.12.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 36.6 MB/s 
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.12.0 textstat-0.7.3


# Importing necessary libraries

In [194]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize # Can also use RegexpTokenizer
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
import contractions
from textstat.textstat import textstatistics
import string

# Data Manipulation

In [141]:
# Read the article data set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='blackcoffer-consulting-file.csv')

In [142]:
# Remove the 'Unnamed: 0' column (presumably an index written by an earlier export).
# Re-binding with errors='ignore' keeps the cell safe to re-run: the in-place
# version raised KeyError on a second execution because the column was already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [143]:
# Shared WordNet lemmatizer instance; text_prep() calls lemma.lemmatize() on every token.

lemma = WordNetLemmatizer()

In [144]:
# English stop-word list from NLTK; text_prep() drops every token that appears in it.

stop_words = stopwords.words('english')

In [145]:
# Function to pre-process the text data
def text_prep(x: str) -> list:
    """Turn one document into a list of lemmatised, stop-word-free tokens.

    Pipeline: lower-case, expand contractions, replace every run of
    non-letter characters with a single space, tokenise, drop stop words,
    lemmatise.

    Args:
        x: Raw document text. It is passed through ``str()`` first, so a
            non-string cell value does not raise.

    Returns:
        The lemmatised tokens, in their original order.
    """
    corp = contractions.fix(str(x).lower())
    corp = re.sub('[^a-zA-Z]+', ' ', corp).strip()
    # `stop_words` is a list; build a set once per document so each token
    # lookup is a hash probe instead of a scan over the whole list.
    stop_set = set(stop_words)
    return [lemma.lemmatize(t) for t in word_tokenize(corp) if t not in stop_set]

In [146]:
# Pre-process every article and keep the token lists next to the raw text.
preprocess_tag = list(map(text_prep, df['Text_Contents']))
df["preprocess_txt"] = preprocess_tag

# 1. Sentiment Analysis

## Importing Opinion Lexicon of Positive and Negative words

In [147]:
# Read the two opinion lexicons as whitespace-separated word lists.
# The context managers close each handle as soon as it has been read;
# previously both files were opened and never closed.
with open('negative-words.txt', 'r') as file:
    neg_words = file.read().split()
with open('positive-words.txt', 'r') as file:
    pos_words = file.read().split()

In [148]:
# Number of tokens left in each document after pre-processing.
df['num_words'] = df['preprocess_txt'].map(len)

## 1.1. Positive and Negative Scores

In [149]:
# Count how many tokens of each document appear in the positive / negative lexicon.
# The lexicons are lists, so they are converted to sets once here: every
# membership test then becomes a hash lookup instead of a scan of the full list.
pos_lookup = set(pos_words)
neg_lookup = set(neg_words)
num_pos = df['preprocess_txt'].map(lambda x: sum(1 for i in x if i in pos_lookup))
df['Positive'] = num_pos
num_neg = df['preprocess_txt'].map(lambda x: sum(1 for i in x if i in neg_lookup))
df['Negative'] = num_neg

## 1.2 Polarity Score

In [150]:
# (positive - negative) / (positive + negative), rounded to 2 decimals.
# The small constant keeps the denominator non-zero when a document has no lexicon hits.
df['Polarity'] = (
    (df['Positive'] - df['Negative'])
    / (df['Positive'] + df['Negative'] + 0.000001)
).round(2)

## 1.3 Subjectivity Score

In [151]:
# Share of tokens that are opinion words, rounded to 2 decimals.
# The small constant keeps the denominator non-zero for documents with no tokens.
df['Subjectivity'] = (
    (df['Positive'] + df['Negative'])
    / (df['num_words'] + 0.000001)
).round(2)

In [152]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
df.sample(n=5, random_state=42)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity
86,88.0,https://insights.blackcoffer.com/how-covid-19-...,I would rather pay cash – Before COVID-19. I ...,"[would, rather, pay, cash, covid, would, rathe...",302,32,13,0.42,0.15
8,9.0,https://insights.blackcoffer.com/how-big-data-...,Big data refers to large sets of unstructured...,"[big, data, refers, large, set, unstructured, ...",807,84,26,0.53,0.14
134,136.0,https://insights.blackcoffer.com/advance-analy...,Recent years have seen significant growth in ...,"[recent, year, seen, significant, growth, pers...",763,73,27,0.46,0.13
145,147.0,https://insights.blackcoffer.com/the-prospecti...,"“As you sow, so you reap,” because “God watch...","[sow, reap, god, watch, everything, often, won...",653,47,26,0.29,0.11
123,125.0,https://insights.blackcoffer.com/ensuring-grow...,The generally accepted\nsolution to economies...,"[generally, accepted, solution, economy, stagn...",610,38,26,0.19,0.1


# 2. Analysis of Readability

## 2.1. Average Sentence Length

In [153]:
# Sentence count per article, taken from the raw (un-preprocessed) text.
df['num_sentences'] = df['Text_Contents'].apply(lambda raw_text: len(sent_tokenize(raw_text)))

In [154]:
# Pre-processed token count divided by sentence count, rounded to 1 decimal.
df['avg_sentence_len'] = df['num_words'].div(df['num_sentences']).round(1)

In [155]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
df.sample(n=5, random_state=42)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len
21,22.0,https://insights.blackcoffer.com/man-and-machi...,Where is this disruptive technology taking us...,"[disruptive, technology, taking, u, take, leav...",653,68,36,0.31,0.16,55,11.9
104,106.0,https://insights.blackcoffer.com/should-celebr...,"In today’s day and age of hyper-connectivity,...","[today, day, age, hyper, connectivity, celebri...",989,84,39,0.37,0.12,63,15.7
71,73.0,https://insights.blackcoffer.com/how-voice-sea...,Finding ways to make using the Internet easie...,"[finding, way, make, using, internet, easier, ...",462,62,23,0.46,0.18,38,12.2
154,156.0,https://insights.blackcoffer.com/big-data-mark...,Perfect information. First Degree Price Discr...,"[perfect, information, first, degree, price, d...",553,51,28,0.29,0.14,44,12.6
56,58.0,https://insights.blackcoffer.com/can-you-be-gr...,The word “leadership” can bring to mind a var...,"[word, leadership, bring, mind, variety, image...",224,42,8,0.68,0.22,16,14.0


## 2.2. Percentage of Complex Words

In [156]:
def syllables_count(text):
    """Return the syllable count that textstat reports for `text`."""
    counter = textstatistics()
    return counter.syllable_count(text)

In [157]:
def complex_words(text):
    """Count the distinct items of `text` that have more than two syllables.

    `text` is iterated item by item and each item is passed to
    syllables_count(). Repeated items are counted once because the
    qualifying items are collected in a set.

    NOTE(review): the result is a count of *distinct* words, while the
    notebook later divides it by the total token count -- confirm that
    this is the intended definition.
    """
    distinct_complex = {item for item in text if syllables_count(item) > 2}
    return len(distinct_complex)

In [158]:
# Complex-word count for every pre-processed token list.
df['complex_words'] = df['preprocess_txt'].apply(complex_words)

In [159]:
# Complex words as a fraction of all pre-processed tokens, rounded to 2 decimals.
df['complex_words_prop'] = df['complex_words'].div(df['num_words']).round(2)

In [160]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
df.sample(n=5, random_state=42)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len,complex_words,complex_words_prop
20,21.0,https://insights.blackcoffer.com/will-ai-repla...,“Machine intelligence is the last invention t...,"[machine, intelligence, last, invention, human...",927,89,44,0.34,0.14,76,12.2,173,0.19
83,85.0,https://insights.blackcoffer.com/what-is-the-r...,"Epidemics, in general, have both direct and i...","[epidemic, general, direct, indirect, cost, as...",652,38,69,-0.29,0.16,50,13.0,142,0.22
96,98.0,https://insights.blackcoffer.com/lessons-from-...,"“The more you know about the past, the better...","[know, past, better, prepared, future, speak, ...",999,76,111,-0.19,0.19,66,15.1,172,0.17
141,143.0,https://insights.blackcoffer.com/how-political...,Good governance is key to the growth and prog...,"[good, governance, key, growth, progress, coun...",685,63,28,0.38,0.13,38,18.0,117,0.17
129,131.0,https://insights.blackcoffer.com/challenges-an...,To begin with I shall first like to explain w...,"[begin, shall, first, like, explain, big, data...",583,69,56,0.1,0.21,65,9.0,113,0.19


## 2.3. Fog Index

In [161]:
# Fog index as 0.4 * (average sentence length + complex-word proportion).
# NOTE(review): complex_words_prop is a 0-1 fraction here, whereas the classic
# Gunning fog formula adds a percentage (fraction * 100) -- confirm which is intended.
df['Fog_index'] = (df['avg_sentence_len'] + df['complex_words_prop']).mul(0.4)

In [162]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
df.sample(n=5, random_state=42)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len,complex_words,complex_words_prop,Fog_index
121,123.0,https://insights.blackcoffer.com/impact-of-cov...,COVID-19: INTRODUCTION The Novel Coronavirus ...,"[covid, introduction, novel, coronavirus, covi...",995,59,86,-0.19,0.15,74,13.4,202,0.2,5.44
64,66.0,https://insights.blackcoffer.com/impacts-of-co...,Some vendors (fruit and vegetable sellers) be...,"[vendor, fruit, vegetable, seller, began, vent...",152,10,9,0.05,0.12,11,13.8,32,0.21,5.604
21,22.0,https://insights.blackcoffer.com/man-and-machi...,Where is this disruptive technology taking us...,"[disruptive, technology, taking, u, take, leav...",653,68,36,0.31,0.16,55,11.9,141,0.22,4.848
59,61.0,https://insights.blackcoffer.com/is-perfection...,What’s perfection really? Does every person e...,"[perfection, really, every, person, expect, pe...",558,77,30,0.44,0.19,76,7.3,56,0.1,2.96
27,28.0,https://insights.blackcoffer.com/how-machines-...,We all hear day in and day out that we amidst...,"[hear, day, day, amidst, technological, revolu...",679,62,40,0.22,0.15,57,11.9,135,0.2,4.84


# 3. Average Number of Words Per Sentence

In [163]:
# Pre-processed token count divided by sentence count, rounded to 2 decimals.
df['avg_words_per_sentence'] = df['num_words'].div(df['num_sentences']).round(2)

In [164]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
df.sample(n=5, random_state=42)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len,complex_words,complex_words_prop,Fog_index,avg_words_per_sentence
132,134.0,https://insights.blackcoffer.com/traceability-...,The democratization of data access and usage ...,"[democratization, data, access, usage, enterpr...",559,31,21,0.19,0.09,43,13.0,130,0.23,5.292,13.0
0,1.0,https://insights.blackcoffer.com/how-is-login-...,When people hear AI they often think about se...,"[people, hear, ai, often, think, sentient, rob...",408,27,19,0.17,0.11,23,17.7,73,0.18,7.152,17.74
56,58.0,https://insights.blackcoffer.com/can-you-be-gr...,The word “leadership” can bring to mind a var...,"[word, leadership, bring, mind, variety, image...",224,42,8,0.68,0.22,16,14.0,44,0.2,5.68,14.0
24,25.0,https://insights.blackcoffer.com/how-machine-l...,Machine learning techniques may have been use...,"[machine, learning, technique, may, used, year...",381,51,21,0.42,0.19,35,10.9,51,0.13,4.412,10.89
28,29.0,https://insights.blackcoffer.com/ai-human-robo...,It’s the year 2060. An automaton in a Researc...,"[year, automaton, research, laboratory, say, s...",725,60,41,0.19,0.14,80,9.1,117,0.16,3.704,9.06


# 4. Complex Word Count

As the data frame shows, the complex word count is already available in the `df['complex_words']` column.

In [165]:
# Show the first five rows.
df.head(n=5)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len,complex_words,complex_words_prop,Fog_index,avg_words_per_sentence
0,1.0,https://insights.blackcoffer.com/how-is-login-...,When people hear AI they often think about se...,"[people, hear, ai, often, think, sentient, rob...",408,27,19,0.17,0.11,23,17.7,73,0.18,7.152,17.74
1,2.0,https://insights.blackcoffer.com/how-does-ai-h...,With increasing computing power and more data...,"[increasing, computing, power, data, potential...",375,32,12,0.45,0.12,27,13.9,89,0.24,5.656,13.89
2,3.0,https://insights.blackcoffer.com/ai-and-its-im...,If you were a fan of the 90’s film Clueless b...,"[fan, film, clueless, back, day, remember, pro...",1066,109,41,0.45,0.14,76,14.0,230,0.22,5.688,14.03
3,4.0,https://insights.blackcoffer.com/how-do-deep-l...,"Understanding exactly how data is ingested, a...","[understanding, exactly, data, ingested, analy...",246,17,3,0.7,0.08,14,17.6,64,0.26,7.144,17.57
4,5.0,https://insights.blackcoffer.com/how-artificia...,"From the stone age to the modern world, from ...","[stone, age, modern, world, hunting, gathering...",398,43,20,0.37,0.16,37,10.8,88,0.22,4.408,10.76


# 5. Word Count

As the data frame shows, the word count is likewise already available in the `df['num_words']` column.

In [166]:
# Show the first five rows.
df.head(n=5)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len,complex_words,complex_words_prop,Fog_index,avg_words_per_sentence
0,1.0,https://insights.blackcoffer.com/how-is-login-...,When people hear AI they often think about se...,"[people, hear, ai, often, think, sentient, rob...",408,27,19,0.17,0.11,23,17.7,73,0.18,7.152,17.74
1,2.0,https://insights.blackcoffer.com/how-does-ai-h...,With increasing computing power and more data...,"[increasing, computing, power, data, potential...",375,32,12,0.45,0.12,27,13.9,89,0.24,5.656,13.89
2,3.0,https://insights.blackcoffer.com/ai-and-its-im...,If you were a fan of the 90’s film Clueless b...,"[fan, film, clueless, back, day, remember, pro...",1066,109,41,0.45,0.14,76,14.0,230,0.22,5.688,14.03
3,4.0,https://insights.blackcoffer.com/how-do-deep-l...,"Understanding exactly how data is ingested, a...","[understanding, exactly, data, ingested, analy...",246,17,3,0.7,0.08,14,17.6,64,0.26,7.144,17.57
4,5.0,https://insights.blackcoffer.com/how-artificia...,"From the stone age to the modern world, from ...","[stone, age, modern, world, hunting, gathering...",398,43,20,0.37,0.16,37,10.8,88,0.22,4.408,10.76


# 6. Syllable Count Per Word

In [167]:
# Join each token list back into one space-separated string and count its syllables.
df['syl_count'] = df['preprocess_txt'].map(" ".join).map(syllables_count)

In [168]:
# Average syllables per pre-processed token (not rounded).
df['syl_per_word'] = df['syl_count'].div(df['num_words'])

In [169]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
df.sample(n=5, random_state=42)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len,complex_words,complex_words_prop,Fog_index,avg_words_per_sentence,syl_count,syl_per_word
108,110.0,https://insights.blackcoffer.com/coronavirus-i...,Before jumping on the topic I would like to g...,"[jumping, topic, would, like, give, overview, ...",270,12,13,-0.04,0.09,21,12.9,46,0.17,5.228,12.86,533,1.974074
22,23.0,https://insights.blackcoffer.com/in-future-or-...,In future or in upcoming years humans and mac...,"[future, upcoming, year, human, machine, going...",403,35,14,0.43,0.12,44,9.2,74,0.18,3.752,9.16,827,2.052109
70,72.0,https://insights.blackcoffer.com/human-rights-...,"Read on to discover what our robot, Athena, h...","[read, discover, robot, athena, found, future,...",72,7,3,0.4,0.14,7,10.3,12,0.17,4.188,10.29,125,1.736111
144,146.0,https://insights.blackcoffer.com/how-artificia...,Today we talk a lot about artificial intellig...,"[today, talk, lot, artificial, intelligence, c...",845,86,33,0.45,0.14,76,11.1,150,0.18,4.512,11.12,1776,2.101775
165,167.0,https://insights.blackcoffer.com/role-big-data...,"Can academia, researchers, decision makers an...","[academia, researcher, decision, maker, policy...",828,74,48,0.21,0.15,71,11.7,143,0.17,4.748,11.66,1752,2.115942


# 7. Personal Pronouns

In [174]:
def personal_pro(text):
  pronounRegex = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)
  pronouns = pronounRegex.findall(text)
  return len(pronouns)

`(?-i:us)` is an _inline modifier group_ that switches case-insensitive matching off for that part of the pattern only. As a result, it matches the pronoun `us` but not `US`.

In [175]:
# Pronoun count per article, computed on the raw text so that case is preserved.
df['personal_pronouns'] = df['Text_Contents'].apply(personal_pro)

In [176]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
df.sample(n=5, random_state=42)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len,complex_words,complex_words_prop,Fog_index,avg_words_per_sentence,syl_count,syl_per_word,personal_pronouns
117,119.0,https://insights.blackcoffer.com/coronavirus-t...,"Since the creation of the EU, we have been th...","[since, creation, eu, many, challenge, worst, ...",135,11,16,-0.19,0.2,14,9.6,37,0.27,3.948,9.64,268,1.985185,8
127,129.0,https://insights.blackcoffer.com/big-data-anal...,Quality and affordable healthcare is a vision...,"[quality, affordable, healthcare, vision, gove...",651,47,54,-0.07,0.16,65,10.0,118,0.18,4.072,10.02,1374,2.110599,2
99,101.0,https://insights.blackcoffer.com/why-scams-lik...,India has been beset by financial scams since...,"[india, beset, financial, scam, since, indepen...",1288,72,115,-0.23,0.15,87,14.8,214,0.17,5.988,14.8,2589,2.010093,6
156,158.0,https://insights.blackcoffer.com/analytics-hel...,Demand forecasting is the most important phas...,"[demand, forecasting, important, phase, fashio...",397,25,5,0.67,0.08,35,11.3,87,0.22,4.608,11.34,868,2.186398,1
64,66.0,https://insights.blackcoffer.com/impacts-of-co...,Some vendors (fruit and vegetable sellers) be...,"[vendor, fruit, vegetable, seller, began, vent...",152,10,9,0.05,0.12,11,13.8,32,0.21,5.604,13.82,306,2.013158,0


# 8. Average Word Length

In [197]:
def text_len(text):
    """Return the mean word length of `text`, ignoring ASCII punctuation.

    Args:
        text: A string, or an iterable of strings, which is concatenated
            without separators before processing.

    Returns:
        The average number of characters per whitespace-separated word
        once every character in ``string.punctuation`` has been removed.
        Returns ``0.0`` when no words remain (empty or punctuation-only
        input) instead of raising ZeroDivisionError.
    """
    text = ''.join(text)
    # One translate() pass deletes all ASCII punctuation characters.
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    words = stripped.split()
    if not words:
        return 0.0
    return sum(map(len, words)) / len(words)

In [198]:
# Mean word length per article, measured on the raw text.
df['avg_word_len'] = df['Text_Contents'].map(text_len)

In [200]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
df.sample(n=5, random_state=42)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len,complex_words,complex_words_prop,Fog_index,avg_words_per_sentence,syl_count,syl_per_word,personal_pronouns,avg_word_len
105,107.0,https://insights.blackcoffer.com/how-prepared-...,"Frankly, no! The state of the public healthca...","[frankly, state, public, healthcare, system, s...",454,27,29,-0.04,0.12,32,14.2,89,0.2,5.76,14.19,923,2.03304,3,5.040816
166,168.0,https://insights.blackcoffer.com/sales-forecas...,Inventory planning is a fundamental part of r...,"[inventory, planning, fundamental, part, retai...",437,34,20,0.26,0.12,28,15.6,99,0.23,6.332,15.61,964,2.20595,0,5.628407
82,84.0,https://insights.blackcoffer.com/what-is-the-r...,What is COVID 19 pandemic? On 31st December 2...,"[covid, pandemic, st, december, novel, coronav...",347,21,34,-0.24,0.16,34,10.2,57,0.16,4.144,10.21,670,1.930836,4,5.039216
106,108.0,https://insights.blackcoffer.com/how-will-covi...,"Every prophet of doom, unless he also happens...","[every, prophet, doom, unless, also, happens, ...",745,46,94,-0.34,0.19,55,13.5,137,0.18,5.472,13.55,1527,2.049664,7,5.148823
51,53.0,https://insights.blackcoffer.com/will-we-ever-...,Introduction The definition of consciousness ...,"[introduction, definition, consciousness, cont...",968,50,68,-0.15,0.12,73,13.3,225,0.23,5.412,13.26,2131,2.201446,15,5.364541


# Formatting the Output Data Structure

In [201]:
# List the columns currently in the frame before selecting the output columns.
df.columns

Index(['URL_ID', 'URL', 'Text_Contents', 'preprocess_txt', 'num_words',
       'Positive', 'Negative', 'Polarity', 'Subjectivity', 'num_sentences',
       'avg_sentence_len', 'complex_words', 'complex_words_prop', 'Fog_index',
       'avg_words_per_sentence', 'syl_count', 'syl_per_word',
       'personal_pronouns', 'avg_word_len'],
      dtype='object')

In [204]:
# Keep only the columns that go into the output file, in output order.
df = df.loc[:, [
    'URL_ID',
    'URL',
    'Positive',
    'Negative',
    'Polarity',
    'Subjectivity',
    'avg_sentence_len',
    'complex_words_prop',
    'Fog_index',
    'avg_words_per_sentence',
    'complex_words',
    'num_words',
    'syl_per_word',
    'personal_pronouns',
    'avg_word_len',
]]

In [206]:
# Replace the working column names with the output headers (positional, same order).
df.columns = [
    'URL_ID',
    'URL',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]

In [207]:
# Show the first five rows of the formatted output frame.
df.head(n=5)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,1.0,https://insights.blackcoffer.com/how-is-login-...,27,19,0.17,0.11,17.7,0.18,7.152,17.74,73,408,1.963235,4,5.097983
1,2.0,https://insights.blackcoffer.com/how-does-ai-h...,32,12,0.45,0.12,13.9,0.24,5.656,13.89,89,375,2.101333,2,5.298555
2,3.0,https://insights.blackcoffer.com/ai-and-its-im...,109,41,0.45,0.14,14.0,0.22,5.688,14.03,230,1066,2.095685,13,5.367521
3,4.0,https://insights.blackcoffer.com/how-do-deep-l...,17,3,0.7,0.08,17.6,0.26,7.144,17.57,64,246,2.150407,1,5.299528
4,5.0,https://insights.blackcoffer.com/how-artificia...,43,20,0.37,0.16,10.8,0.22,4.408,10.76,88,398,2.08794,21,4.990085


In [208]:
# Write the result to CSV without the row index.
df.to_csv(path_or_buf='Output-Data-Structure.csv', index=False)