### Load files and libraries

In [1]:
import pandas as pd
import ftfy
import random
from fake_useragent import UserAgent
import nltk
import pickle
import pandas as pd
from nltk.corpus import wordnet
import json
from afinn import Afinn
import plotly.express as px
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import ftfy
import humanize
import datetime
import langdetect

In [2]:
articles = pd.read_csv('../Data/articles_v1.csv')

In [3]:
articles

Unnamed: 0,author,date,description,full_text,site_name,title,topic,url,author_article_count,author_name_clean
0,Aja Styles,2020-12-23,Perth mother Clare has found herself mostly co...,Perth mother Clare* has found herself mostly ...,Brisbane Times,'Pack Lego': Perth family caught in hard borde...,entertainment,https://www.brisbanetimes.com.au/national/west...,14,Aja Styles
1,Jake Johnson,2020-12-23,The billâs gifts to the wealthy underscore t...,In late-night votes just hours after nearly 5...,Truthout,Congress Passes COVID Relief With Billions in ...,politics,https://truthout.org/articles/congress-passes-...,33,Jake Johnson
2,Christine Favocci,2020-12-23,It is naive to think that either party is free...,The left has insisted that voter fraud is jus...,The Western Journal,PA Man Facing Charges of Unlawful Voting After...,tech,https://www.westernjournal.com/pa-man-facing-c...,19,Christine Favocci
3,William Rivers Pitt,2020-12-23,What Trump may do in his waning days is only u...,"The endgame being played out by Donald Trump,...",Truthout,What Will Trump Attempt in His Last Days? We M...,politics,https://truthout.org/articles/what-will-trump-...,14,William Rivers Pitt
4,Amy Goodman,2020-12-23,Critics say the $900 billion relief package do...,As Congress passes a $900 billion coronavirus...,Truthout,The Insufficient COVID Stimulus Must Not Be Fo...,business,https://truthout.org/video/the-insufficient-co...,19,Amy Goodman
...,...,...,...,...,...,...,...,...,...,...
119946,Daniel Smith,2020-12-05,Ashley Towne decided to get in shape to marry ...,When you subscribe we will use the informatio...,WalesOnline,'I can't get married looking like this' Rower ...,sport,https://www.walesonline.co.uk/news/uk-news/i-c...,147,Daniel Smith
119947,Victoria Jones,2020-12-05,Experimenting on the ISS allows scientists to ...,When you subscribe we will use the informatio...,WalesOnline,Space experiment could unlock resources for mi...,tech,https://www.walesonline.co.uk/news/uk-news/spa...,92,Victoria Jones
119948,Nisha Mal,2020-12-05,All of the properties are Grade I listed build...,Three buildings featured in a Jane Austen nov...,WalesOnline,Three buildings featured in a Jane Austen nove...,tech,https://www.walesonline.co.uk/news/uk-news/thr...,66,Nisha Mal
119949,Nisha Mal,2020-12-05,"'It's all one big conundrum,' says Sheila Herbert",Woman's home is in Tier 2 while her garden fa...,WalesOnline,Woman's home is in Tier 2 while her garden fal...,tech,https://www.walesonline.co.uk/news/uk-news/wom...,66,Nisha Mal


### Cleaning and preprocessing text columns

In [4]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}

In [5]:
# Cleaning the text sentences so that punctuation marks, stop words &amp; digits are removed
# Words are lemmatized according to their POS tags
def clean(doc):
    doc = ftfy.fix_text(doc)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ""
    for word in punc_free.split():
        tag = nltk.pos_tag([word])[0][1][0].upper()
        normalized = normalized + " " + str(lemma.lemmatize(word, tag_dict.get(tag, wordnet.NOUN)))
    processed = re.sub(r"\d+","", normalized)
    y = processed.split()
    return ' '.join(y)

### Cleaning article text (title x4, description x2, full_text x1)

In [6]:
articles = articles.fillna('-')

In [7]:
articles['content'] = ((articles['title'] + ' ') * 4) + ((articles['description'] + ' ') * 2) + (articles['full_text'])
articles['content'] = articles.apply(lambda x: clean(x['content']), axis = 1)

In [8]:
articles

Unnamed: 0,author,date,description,full_text,site_name,title,topic,url,author_article_count,author_name_clean,content
0,Aja Styles,2020-12-23,Perth mother Clare has found herself mostly co...,Perth mother Clare* has found herself mostly ...,Brisbane Times,'Pack Lego': Perth family caught in hard borde...,entertainment,https://www.brisbanetimes.com.au/national/west...,14,Aja Styles,pack lego perth family caught hard border cros...
1,Jake Johnson,2020-12-23,The billâs gifts to the wealthy underscore t...,In late-night votes just hours after nearly 5...,Truthout,Congress Passes COVID Relief With Billions in ...,politics,https://truthout.org/articles/congress-passes-...,33,Jake Johnson,congress pass covid relief billion handout wea...
2,Christine Favocci,2020-12-23,It is naive to think that either party is free...,The left has insisted that voter fraud is jus...,The Western Journal,PA Man Facing Charges of Unlawful Voting After...,tech,https://www.westernjournal.com/pa-man-facing-c...,19,Christine Favocci,pa man face charge unlawful voting allegedly c...
3,William Rivers Pitt,2020-12-23,What Trump may do in his waning days is only u...,"The endgame being played out by Donald Trump,...",Truthout,What Will Trump Attempt in His Last Days? We M...,politics,https://truthout.org/articles/what-will-trump-...,14,William Rivers Pitt,trump attempt last day must prepare anything t...
4,Amy Goodman,2020-12-23,Critics say the $900 billion relief package do...,As Congress passes a $900 billion coronavirus...,Truthout,The Insufficient COVID Stimulus Must Not Be Fo...,business,https://truthout.org/video/the-insufficient-co...,19,Amy Goodman,insufficient covid stimulus must follow auster...
...,...,...,...,...,...,...,...,...,...,...,...
119946,Daniel Smith,2020-12-05,Ashley Towne decided to get in shape to marry ...,When you subscribe we will use the informatio...,WalesOnline,'I can't get married looking like this' Rower ...,sport,https://www.walesonline.co.uk/news/uk-news/i-c...,147,Daniel Smith,i cant get married look like this rower whose ...
119947,Victoria Jones,2020-12-05,Experimenting on the ISS allows scientists to ...,When you subscribe we will use the informatio...,WalesOnline,Space experiment could unlock resources for mi...,tech,https://www.walesonline.co.uk/news/uk-news/spa...,92,Victoria Jones,space experiment could unlock resource mission...
119948,Nisha Mal,2020-12-05,All of the properties are Grade I listed build...,Three buildings featured in a Jane Austen nov...,WalesOnline,Three buildings featured in a Jane Austen nove...,tech,https://www.walesonline.co.uk/news/uk-news/thr...,66,Nisha Mal,three building feature jane austen novel sell ...
119949,Nisha Mal,2020-12-05,"'It's all one big conundrum,' says Sheila Herbert",Woman's home is in Tier 2 while her garden fa...,WalesOnline,Woman's home is in Tier 2 while her garden fal...,tech,https://www.walesonline.co.uk/news/uk-news/wom...,66,Nisha Mal,woman home tier garden fall tier bizarre lockd...


### Calculating Afinn sentiment for all articles

In [10]:
afinn = Afinn(language='en')
def get_article_sentiment(text):
    return round(afinn.score(text) / len(text.split()) * 100, 2)

In [11]:
articles['article_sentiment'] = articles.apply(lambda x: get_article_sentiment(x['full_text']), axis=1)

In [12]:
articles

Unnamed: 0,author,date,description,full_text,site_name,title,topic,url,author_article_count,author_name_clean,content,article_sentiment
0,Aja Styles,2020-12-23,Perth mother Clare has found herself mostly co...,Perth mother Clare* has found herself mostly ...,Brisbane Times,'Pack Lego': Perth family caught in hard borde...,entertainment,https://www.brisbanetimes.com.au/national/west...,14,Aja Styles,pack lego perth family caught hard border cros...,0.00
1,Jake Johnson,2020-12-23,The billâs gifts to the wealthy underscore t...,In late-night votes just hours after nearly 5...,Truthout,Congress Passes COVID Relief With Billions in ...,politics,https://truthout.org/articles/congress-passes-...,33,Jake Johnson,congress pass covid relief billion handout wea...,-2.12
2,Christine Favocci,2020-12-23,It is naive to think that either party is free...,The left has insisted that voter fraud is jus...,The Western Journal,PA Man Facing Charges of Unlawful Voting After...,tech,https://www.westernjournal.com/pa-man-facing-c...,19,Christine Favocci,pa man face charge unlawful voting allegedly c...,-16.20
3,William Rivers Pitt,2020-12-23,What Trump may do in his waning days is only u...,"The endgame being played out by Donald Trump,...",Truthout,What Will Trump Attempt in His Last Days? We M...,politics,https://truthout.org/articles/what-will-trump-...,14,William Rivers Pitt,trump attempt last day must prepare anything t...,-7.61
4,Amy Goodman,2020-12-23,Critics say the $900 billion relief package do...,As Congress passes a $900 billion coronavirus...,Truthout,The Insufficient COVID Stimulus Must Not Be Fo...,business,https://truthout.org/video/the-insufficient-co...,19,Amy Goodman,insufficient covid stimulus must follow auster...,-0.56
...,...,...,...,...,...,...,...,...,...,...,...,...
119946,Daniel Smith,2020-12-05,Ashley Towne decided to get in shape to marry ...,When you subscribe we will use the informatio...,WalesOnline,'I can't get married looking like this' Rower ...,sport,https://www.walesonline.co.uk/news/uk-news/i-c...,147,Daniel Smith,i cant get married look like this rower whose ...,3.09
119947,Victoria Jones,2020-12-05,Experimenting on the ISS allows scientists to ...,When you subscribe we will use the informatio...,WalesOnline,Space experiment could unlock resources for mi...,tech,https://www.walesonline.co.uk/news/uk-news/spa...,92,Victoria Jones,space experiment could unlock resource mission...,6.03
119948,Nisha Mal,2020-12-05,All of the properties are Grade I listed build...,Three buildings featured in a Jane Austen nov...,WalesOnline,Three buildings featured in a Jane Austen nove...,tech,https://www.walesonline.co.uk/news/uk-news/thr...,66,Nisha Mal,three building feature jane austen novel sell ...,3.33
119949,Nisha Mal,2020-12-05,"'It's all one big conundrum,' says Sheila Herbert",Woman's home is in Tier 2 while her garden fa...,WalesOnline,Woman's home is in Tier 2 while her garden fal...,tech,https://www.walesonline.co.uk/news/uk-news/wom...,66,Nisha Mal,woman home tier garden fall tier bizarre lockd...,-2.21


In [13]:
articles.describe()

Unnamed: 0,author_article_count,article_sentiment
count,119951.0,119951.0
mean,59.786238,1.661658
std,59.939789,6.624018
min,10.0,-47.76
25%,22.0,-1.85
50%,44.0,1.89
75%,80.0,5.58
max,523.0,77.78


In [14]:
articles.to_csv('../Data/articles_v2.csv')

### Cleaing IAB classifier text

In [15]:
iab_text = pd.read_csv('../Data/IAB_text_v1.csv')

In [16]:
iab_text['with_bs4_clean'] = iab_text.apply(lambda x: clean(x['with_bs4']), axis=1)

In [17]:
iab_text['with_justext_clean'] = iab_text.apply(lambda x: clean(x['with_justext']), axis=1)

In [18]:
iab_text

Unnamed: 0,Unique ID,Parent,Name,Position,URL,Title,Snippet,Tier 1,Tier 2,Tier 3,Tier 4,with_bs4,with_justext,with_bs4_clean,with_justext_clean
0,1,-,Automotive,1,https://www.merriam-webster.com/dictionary/aut...,Automotive | Definition of Automotive by …,Automotive definition is - self-propelled. How...,Automotive,-,-,-,Automotive | Defin...,These example sentences are selected automat...,automotive definition automotive merriamwebste...,example sentence select automatically various ...
1,1,-,Automotive,2,https://nl.wikipedia.org/wiki/Automotive,Automotive - Wikipedia,"Dit is een doorverwijspagina, bedoeld om de ve...",Automotive,-,-,-,Automotive - Wikipedia ...,-,automotive wikipedia automotive uit wikipedia ...,
2,1,-,Automotive,3,https://www.dictionary.com/browse/automotive,Automotive | Definition of Automotive at …,"Automotive definition, pertaining to the desig...",Automotive,-,-,-,Automotive | Definition of Automotive at ...,General Motors warned that a global semicond...,automotive definition automotive dictionarycom...,general motor warn global semiconductor shorta...
3,1,-,Automotive,4,https://en.wikipedia.org/wiki/Automotive_industry,Automotive industry - Wikipedia,The automotive industry comprises a wide range...,Automotive,-,-,-,Automotive industry - Wikipedia ...,The automotive industry began in the 1860s w...,automotive industry wikipedia automotive indus...,automotive industry begin s hundred manufactur...
4,1,-,Automotive,5,https://automotive-online.nl/management/laatst...,Alle nieuwsrubrieken - Automotive Online,Het platform voor het midden en hoger manageme...,Automotive,-,-,-,Alle nieuwsrubrieken - Automotiv...,-,alle nieuwsrubrieken automotive online automot...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11963,1480,1219,City,6,https://www.britannica.com/place/New-York-City,"New York City | Layout, People, Economy, Cultu...","New York City, officially the City of New York...",Content Source Geo,City,-,-,New York City | L...,Alternative Titles: New Amsterdam New Orange...,new york city layout people economy culture hi...,alternative title new amsterdam new orange new...
11964,1480,1219,City,7,https://www.citytv.com/toronto/,City TV,Watch full episodes for free and see the TV sc...,Content Source Geo,City,-,-,City TV You need to enable JavaScript to run t...,-,city tv need enable javascript run app,
11965,1480,1219,City,8,https://www.theguardian.com/football/mancheste...,Manchester City | Football | The Guardian,Manchester City hit a record 15-game winning s...,Content Source Geo,City,-,-,Manchester City | Football | The Guardian...,-,manchester city football guardian skip main co...,
11966,1480,1219,City,9,https://www.hbo.com/sex-and-the-city,Sex and the City - Official Website for the HB...,The official website for Sex and the City on H...,Content Source Geo,City,-,-,Sex and the City - Official Website for the HB...,Sarah Jessica Parker stars as Carrie Bradsha...,sex city official website hbo serieslogoseries...,sarah jessica parker star carrie bradshaw new ...


In [19]:
# iab_text.to_csv('../Data/IAB_text_v2.csv', index = False)

In [20]:
iab_text = pd.read_csv('../Data/IAB_text_v2.csv')

In [21]:
regex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)

In [22]:
def remove_emojis(text):
    try:
        return regex_pattern.sub(r'',text)
    except:
        return np.nan

In [23]:
iab_text['with_bs4_clean'] = iab_text.apply(lambda x: remove_emojis(x['with_bs4_clean']), axis = 1)

In [25]:
iab_text['with_justext_clean'] = iab_text.apply(lambda x: remove_emojis(x['with_justext_clean']), axis = 1)

In [26]:
for i in range(len(iab_text)):
    try:
        text = iab_text.iloc[i].with_bs4
        lang = langdetect.detect(text)
        if lang != 'en':
            iab_text.loc[i, 'with_bs4_clean'] = np.nan
    except Exception as e:
        iab_text.loc[i, 'with_bs4_clean'] = np.nan

In [27]:
iab_text.dropna(subset = ['with_bs4_clean'])

Unnamed: 0,Unique ID,Parent,Name,Position,URL,Title,Snippet,Tier 1,Tier 2,Tier 3,Tier 4,with_bs4,with_justext,with_bs4_clean,with_justext_clean
0,1,-,Automotive,1,https://www.merriam-webster.com/dictionary/aut...,Automotive | Definition of Automotive by …,Automotive definition is - self-propelled. How...,Automotive,-,-,-,Automotive | Defin...,These example sentences are selected automat...,automotive definition automotive merriamwebste...,example sentence select automatically various ...
2,1,-,Automotive,3,https://www.dictionary.com/browse/automotive,Automotive | Definition of Automotive at …,"Automotive definition, pertaining to the desig...",Automotive,-,-,-,Automotive | Definition of Automotive at ...,General Motors warned that a global semicond...,automotive definition automotive dictionarycom...,general motor warn global semiconductor shorta...
3,1,-,Automotive,4,https://en.wikipedia.org/wiki/Automotive_industry,Automotive industry - Wikipedia,The automotive industry comprises a wide range...,Automotive,-,-,-,Automotive industry - Wikipedia ...,The automotive industry began in the 1860s w...,automotive industry wikipedia automotive indus...,automotive industry begin s hundred manufactur...
10,2,1,Auto Body Styles,1,https://www.motor1.com/body-styles/,Body Styles Guide | Motor1.com,"Find news, reviews and cars for sale broken do...",Automotive,Auto Body Styles,-,-,Body Styles Guide | Motor1.com ...,-,body style guide motorcom news review feature ...,
11,2,1,Auto Body Styles,2,https://www.liveabout.com/pickup-trucks-body-s...,Pickup Trucks - Truck Body Styles Explained,· Most auto manufacturers call their traditio...,Automotive,Auto Body Styles,-,-,Pickup Trucks - Truck Body Sty...,-,pickup truck truck body style explain menu hom...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11963,1480,1219,City,6,https://www.britannica.com/place/New-York-City,"New York City | Layout, People, Economy, Cultu...","New York City, officially the City of New York...",Content Source Geo,City,-,-,New York City | L...,Alternative Titles: New Amsterdam New Orange...,new york city layout people economy culture hi...,alternative title new amsterdam new orange new...
11964,1480,1219,City,7,https://www.citytv.com/toronto/,City TV,Watch full episodes for free and see the TV sc...,Content Source Geo,City,-,-,City TV You need to enable JavaScript to run t...,-,city tv need enable javascript run app,
11965,1480,1219,City,8,https://www.theguardian.com/football/mancheste...,Manchester City | Football | The Guardian,Manchester City hit a record 15-game winning s...,Content Source Geo,City,-,-,Manchester City | Football | The Guardian...,-,manchester city football guardian skip main co...,
11966,1480,1219,City,9,https://www.hbo.com/sex-and-the-city,Sex and the City - Official Website for the HB...,The official website for Sex and the City on H...,Content Source Geo,City,-,-,Sex and the City - Official Website for the HB...,Sarah Jessica Parker stars as Carrie Bradsha...,sex city official website hbo serieslogoseries...,sarah jessica parker star carrie bradshaw new ...


In [28]:
for i in range(len(iab_text)):
    try:
        text = iab_text.iloc[i].with_justext
        lang = langdetect.detect(text)
        if lang != 'en':
            iab_text.loc[i, 'with_justext_clean'] = np.nan
    except Exception as e:
        iab_text.loc[i, 'with_justext_clean'] = np.nan

In [29]:
iab_text.dropna(subset = ['with_justext_clean'])

Unnamed: 0,Unique ID,Parent,Name,Position,URL,Title,Snippet,Tier 1,Tier 2,Tier 3,Tier 4,with_bs4,with_justext,with_bs4_clean,with_justext_clean
0,1,-,Automotive,1,https://www.merriam-webster.com/dictionary/aut...,Automotive | Definition of Automotive by …,Automotive definition is - self-propelled. How...,Automotive,-,-,-,Automotive | Defin...,These example sentences are selected automat...,automotive definition automotive merriamwebste...,example sentence select automatically various ...
2,1,-,Automotive,3,https://www.dictionary.com/browse/automotive,Automotive | Definition of Automotive at …,"Automotive definition, pertaining to the desig...",Automotive,-,-,-,Automotive | Definition of Automotive at ...,General Motors warned that a global semicond...,automotive definition automotive dictionarycom...,general motor warn global semiconductor shorta...
3,1,-,Automotive,4,https://en.wikipedia.org/wiki/Automotive_industry,Automotive industry - Wikipedia,The automotive industry comprises a wide range...,Automotive,-,-,-,Automotive industry - Wikipedia ...,The automotive industry began in the 1860s w...,automotive industry wikipedia automotive indus...,automotive industry begin s hundred manufactur...
12,2,1,Auto Body Styles,3,https://www.autoevolution.com/news/2021-dacia-...,"2021 Dacia Logan Reimagined With Coupe, Pickup, …","As for the Maximum Capacity Vehicle, cargo vol...",Automotive,Auto Body Styles,-,-,2021 Dacia Logan Reimagined With Coupe P...,2021 Dacia Logan Reimagined With Coupe Picku...,dacia logan reimagined coupe pickup mcv body s...,dacia logan reimagined coupe pickup mcv body s...
15,2,1,Auto Body Styles,6,https://autopartsfair.com/exterior_parts/,Auto Body Parts Store - Exterior Body Parts for …,"Choose, compare & buy from a wide range of aut...",Automotive,Auto Body Styles,-,-,Auto Body Parts Store - Exterior Body Parts ...,Discount Used Auto Parts Store This website...,auto body part store exterior body part car tr...,discount use auto part store website entry bes...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11958,1480,1219,City,1,https://www.dictionary.com/browse/city,City | Definition of City at Dictionary.com,A city is a place where a large number of peop...,Content Source Geo,City,-,-,City | Definition of City at Dictionary.c...,VOCAB BUILDER What is a basic definition of...,city definition city dictionarycom dictionaryc...,vocab builder basic definition city city place...
11959,1480,1219,City,2,https://indianexpress.com/section/cities/,"City News, Indian City Headlines, Latest City ...","Ahead of V-Day, city cops asked to increase pa...",Content Source Geo,City,-,-,City News Indian City Headlines Latest ...,Kachhadiya told The Indian Express “The moti...,city news indian city headline late city news ...,kachhadiya told indian express the motive behi...
11961,1480,1219,City,4,http://smartcitydehradun.uk.gov.in/,"SmartCity, Dehradun",GoI under Smart City Mission Phase-4 100 citie...,Content Source Geo,City,-,-,SmartCity Dehradun ...,CLOCK TOWER - Dehradun Clock tower also kno...,smartcity dehradun dscl dscl board director or...,clock tower dehradun clock tower also know gha...
11963,1480,1219,City,6,https://www.britannica.com/place/New-York-City,"New York City | Layout, People, Economy, Cultu...","New York City, officially the City of New York...",Content Source Geo,City,-,-,New York City | L...,Alternative Titles: New Amsterdam New Orange...,new york city layout people economy culture hi...,alternative title new amsterdam new orange new...


In [30]:
iab_text.to_csv("../Data/IAB_text_v3.csv", index = False)