# News Recommender

## Install requirements and import packages

In [57]:
pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [58]:
import os
import math
import time
import numpy as np
import pandas as pd
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

from tqdm import tqdm

## Load Data

In [59]:
news_data = pd.read_json("News_Category_Dataset_v2.json", lines=True)

In [60]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
authors              200853 non-null object
category             200853 non-null object
date                 200853 non-null datetime64[ns]
headline             200853 non-null object
link                 200853 non-null object
short_description    200853 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [61]:
news_data.head()

Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,"There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV",https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89,She left her husband. He killed their children. Just another day in America.
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song,https://www.huffingtonpost.com/entry/will-smith-joins-diplo-and-nicky-jam-for-the-official-2018-world-cup-song_us_5b09726fe4b0fdb2aa541201,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-grant-marries_us_5b09212ce4b0568a880b9a8c,The actor and his longtime girlfriend Anna Eberstein tied the knot in a civil ceremony.
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork,https://www.huffingtonpost.com/entry/jim-carrey-adam-schiff-democrats_us_5b0950e8e4b0fdb2aa53e675,The actor gives Dems an ass-kicking for not fighting hard enough against Donald Trump.
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog,https://www.huffingtonpost.com/entry/julianna-margulies-trump-poop-bag_us_5b093ec2e4b0fdb2aa53df70,"The ""Dietland"" actress said using the bags is a ""really cathartic, therapeutic moment."""


## Preprocessing

In [62]:
news_data.shape

(200853, 6)

### Checking article headline word lengths

In [63]:
hl_len = defaultdict(int)

for h in news_data['headline']:
    hl_len[len(h.split())] += 1

In [64]:
for k in sorted(hl_len):
    print('{}:{}'.format(k, hl_len[k]))

0:6
1:256
2:1428
3:3332
4:6068
5:9220
6:13183
7:17168
8:21721
9:25259
10:26682
11:24716
12:19607
13:13688
14:8415
15:4910
16:2631
17:1255
18:626
19:296
20:172
21:95
22:50
23:24
24:15
25:6
26:6
27:6
28:6
29:1
30:1
31:1
34:1
38:1
44:1


In [65]:
# Add Graph?

In [66]:
#Retaining articles with headline word lengths > 5
print('Total Articles before removal of short title articles:', news_data.shape[0])
news_data = news_data[news_data['headline'].apply(lambda x: len(x.split()) > 5)]
print('Total Articles after removal of short title articles:', news_data.shape[0])

Total Articles before removal of short title articles: 200853
Total Articles after removal of short title articles: 180543


### Check and remove duplicates

In [67]:
news_data

Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,"There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV",https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89,She left her husband. He killed their children. Just another day in America.
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song,https://www.huffingtonpost.com/entry/will-smith-joins-diplo-and-nicky-jam-for-the-official-2018-world-cup-song_us_5b09726fe4b0fdb2aa541201,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-grant-marries_us_5b09212ce4b0568a880b9a8c,The actor and his longtime girlfriend Anna Eberstein tied the knot in a civil ceremony.
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork,https://www.huffingtonpost.com/entry/jim-carrey-adam-schiff-democrats_us_5b0950e8e4b0fdb2aa53e675,The actor gives Dems an ass-kicking for not fighting hard enough against Donald Trump.
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog,https://www.huffingtonpost.com/entry/julianna-margulies-trump-poop-bag_us_5b093ec2e4b0fdb2aa53df70,"The ""Dietland"" actress said using the bags is a ""really cathartic, therapeutic moment."""
5,Ron Dicker,ENTERTAINMENT,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harassment Claims Could Undermine Legacy,https://www.huffingtonpost.com/entry/morgan-freeman-devastated-sexual-misconduct_us_5b096319e4b0802d69cba298,"""It is not right to equate horrific incidents of sexual assault with misplaced compliments or humor,"" he said in a statement."
6,Ron Dicker,ENTERTAINMENT,2018-05-26,Donald Trump Is Lovin' New McDonald's Jingle In 'Tonight Show' Bit,https://www.huffingtonpost.com/entry/donald-trump-mcondalds-tonight-show_us_5b093561e4b0fdb2aa53daba,"It's catchy, all right."
7,Todd Van Luling,ENTERTAINMENT,2018-05-26,What To Watch On Amazon Prime That’s New This Week,https://www.huffingtonpost.com/entry/amazon-prime-what-to-watch_us_5b044625e4b0c0b8b23ec14f,There's a great mini-series joining this week.
8,Andy McDonald,ENTERTAINMENT,2018-05-26,Mike Myers Reveals He'd 'Like To' Do A Fourth Austin Powers Film,https://www.huffingtonpost.com/entry/mike-myers-reveals-he-wants-to-do-a-fourth-austin-powers-film_us_5b096198e4b0802d69cb9f15,"Myer's kids may be pushing for a new ""Powers"" film more than anyone."
9,Todd Van Luling,ENTERTAINMENT,2018-05-26,What To Watch On Hulu That’s New This Week,https://www.huffingtonpost.com/entry/hulu-what-to-watch_us_5b0445bae4b0c0b8b23ec046,You're getting a recent Academy Award-winning movie.


In [68]:
news_data.sort_values('headline', inplace=True, ascending=False)
news_data

Unnamed: 0,authors,category,date,headline,link,short_description
36290,"Darin Graham, ContributorJournalist",WOMEN,2017-01-25,"“We Shall Overcomb!” Say The 100,000 Marching Against Trump In London",https://www.huffingtonpost.com/entry/we-shall-overcomb-say-the-100000-marching-against_us_5888bf29e4b05a82fd5b30b3,"Thousands of activists descended on London to march for women’s rights, diversity and equality last Saturday. The Women’s"
21194,"Mycah Hazel, Contributorblogger, equal opportunity enthusiast",HEALTHY LIVING,2017-07-18,“To The Bone” Didn’t Teach Me Glamour. It Taught Me Respect.,https://www.huffingtonpost.com/entry/to-the-bone-didnt-teach-me-glamour-it-taught-me_us_596d50afe4b05561da5a5a0c,"Oftentimes, films or TV shows about eating disorders try to convince victims to stop by showing them their sickly thin appearance"
29672,"Dana Brownlee, ContributorPresident of Professionalism Matters, a corporate training/key...",BUSINESS,2017-04-10,"“I’m Sorry""--The Two Tragically Forgotten Words In Customer Service",https://www.huffingtonpost.com/entry/im-sorrythe-two-tragically-forgotten-words-in-customer_us_58eadad7e4b00dd8e016ed62,"Unfortunately, I was one of those frustrated passengers caught in the midst of the recent spring break travel nightmare that"
199048,,DIVORCE,2012-02-16,‘Your Divorce Ruined My Life' What To Do When Your Child Blames You,https://www.huffingtonpost.comhttp://www.theglobeandmail.com/life/parenting/advice/anthony-e-wolf/your-divorce-ruined-my-life-what-to-do-when-your-child-blames-you/article2340805/?utm_medium=Feeds:%20RSS/Atom&utm_source=Home&utm_content=2340805,It was Sunday night and Lucas’s mother had had it with her 15-year-old son. “I am just sick of this. Look at this mess. I
193783,,DIVORCE,2012-04-13,"‘You Better Sit Down,' By The Civilians, At Flea Theater",https://www.huffingtonpost.comhttp://theater.nytimes.com/2012/04/13/theater/reviews/you-better-sit-down-by-the-civilians-at-flea-theater.html,"The Civilians, the enterprising troupe specializing in documentary theater drawn from interviews, takes its tape recorders"
112446,,WOMEN,2014-09-09,‘Yes' Is Better Than ‘No' When It Comes To Consensual Sex On Campus,https://www.huffingtonpost.com/entry/michael-kimmel-and-gloria_n_5790164.html,
123546,,WEIRD NEWS,2014-05-05,‘Worst Mom In The World' Selfies,https://www.huffingtonpost.com/entry/worst-mom-selfie_n_5268445.html,
2932,Elyse Wanshel,QUEER VOICES,2018-04-02,‘Will & Grace’ Creator To Donate Gay Bunny Book To Every Grade School In Indiana,https://www.huffingtonpost.com/entry/will-grace-creator-donate-john-olivers-gay-bunny-book-to-every-elementary-school-in-indiana_us_5ac28265e4b00fa46f854225,It's about to be a lot easier for kids in Mike Pence's home state to read “A Day in the Life of Marlon Bundo.”
67601,Nina Golgowski,WEIRD NEWS,2016-02-03,‘Wild Boar Curling’ Rescues Stranded Wild Boars From Frozen Lake,https://www.huffingtonpost.com/entry/wild-boar-curling-rescues-boars_us_56b25c45e4b04f9b57d82cd8,Get this pig in a blanket!
85380,Lilly Workneh,BLACK VOICES,2015-07-17,‘We’re Never Gonna Forget’: Eric Garner’s Family Reflects On His Death One Year Later,https://www.huffingtonpost.com/entry/were-never-gonna-forget-eric-garners-family-reflects-on-his-death-one-year-later_us_55a91617e4b04740a3dfb353,Eric Garner's family share memories of the family man one year after his death.


In [69]:
duplicates = news_data.duplicated('headline', keep=False)
duplicates

36290     False
21194     False
29672     False
199048    False
193783    False
112446    False
123546    False
2932      False
67601     False
85380     False
25186     False
26210     False
20973     False
26939     False
15705     False
83968     False
35588     False
16236     True 
14817     True 
74066     False
39284     False
10348     False
19088     False
10318     False
66764     False
85147     False
46537     False
9375      False
4487      False
12525     False
          ...  
79194     False
184553    False
127174    False
93360     False
170481    False
180013    False
25151     False
118713    False
137976    False
120489    False
106321    False
184863    False
60589     False
68627     False
91213     False
115463    False
135037    False
50799     False
147824    False
193086    False
40225     False
120801    False
57464     False
197915    False
151118    False
146670    False
110111    False
194610    False
130009    False
149150    False
Length: 180543, dtype: b

In [70]:
print('Total Articles before removal of duplicate title articles:', news_data.shape[0])
news_data = news_data[~duplicates]
print('Total Articles after removal of duplicate title articles:', news_data.shape[0])

Total Articles before removal of duplicate title articles: 180543
Total Articles after removal of duplicate title articles: 178760


### Checking for missing values

In [71]:
news_data.isna().sum()

authors              0
category             0
date                 0
headline             0
link                 0
short_description    0
dtype: int64

## Data Exploration

In [72]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178760 entries, 36290 to 149150
Data columns (total 6 columns):
authors              178760 non-null object
category             178760 non-null object
date                 178760 non-null datetime64[ns]
headline             178760 non-null object
link                 178760 non-null object
short_description    178760 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 9.5+ MB


In [73]:
news_data.shape

(178760, 6)

In [74]:
print('Total number of Articles:', news_data.shape[0])

Total number of Articles: 178760


In [75]:
print('Total number of unique authors:', news_data['authors'].nunique())

Total number of unique authors: 24589


In [76]:
print('Total number of unique categories:', news_data['category'].nunique())

Total number of unique categories: 41


### Grouping articles category-wise

In [77]:
# replace with Matplotlib?
fig = go.Figure([go.Bar(x=news_data["category"].value_counts().index, y=news_data["category"].value_counts().values)])
fig['layout'].update(title={"text" : 'Distribution of articles category-wise','y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'}, xaxis_title="Category name",yaxis_title="Number of articles")
fig.update_layout(width=800,height=700)
fig

From the bar chart, is is clear that **Politics** category has the highest nubmer of articles.

### No. of articles per month

In [78]:
# needed?

### Probability Distribution Function of length of headlines

In [79]:
fig = ff.create_distplot(
            [news_data['headline'].str.len()],
            ['ht'],
            show_hist=False,
            show_rug=False
        )
fig['layout'].update(
        title={
            'text': 'PDF',
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center', 
            'yanchor': 'top'
        },
        xaxis_title='Length of a headline',
        yaxis_title='probability'
        )
fig.update_layout(showlegend=False, width=500, height=500)
fig

In [80]:
# Making indices uniform ranging from 0
news_data.index = range(news_data.shape[0])

In [81]:
# Adding a new column containing both day of the week and month, it will be required later while recommending based on day of the week and month
news_data["day and month"] = news_data["date"].dt.strftime("%a") + "_" + news_data["date"].dt.strftime("%b")

In [82]:
# To retain original headlines, make a copy of data before preprocessing
news_data_orig = news_data.copy()

## Preprocessing

### Removing stopwords

In [83]:
# Using stop words from nltk package
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [84]:
stop_words = set(stopwords.words('english'))

In [85]:
news_data.head()

Unnamed: 0,authors,category,date,headline,link,short_description,day and month
0,"Darin Graham, ContributorJournalist",WOMEN,2017-01-25,"“We Shall Overcomb!” Say The 100,000 Marching Against Trump In London",https://www.huffingtonpost.com/entry/we-shall-overcomb-say-the-100000-marching-against_us_5888bf29e4b05a82fd5b30b3,"Thousands of activists descended on London to march for women’s rights, diversity and equality last Saturday. The Women’s",Wed_Jan
1,"Mycah Hazel, Contributorblogger, equal opportunity enthusiast",HEALTHY LIVING,2017-07-18,“To The Bone” Didn’t Teach Me Glamour. It Taught Me Respect.,https://www.huffingtonpost.com/entry/to-the-bone-didnt-teach-me-glamour-it-taught-me_us_596d50afe4b05561da5a5a0c,"Oftentimes, films or TV shows about eating disorders try to convince victims to stop by showing them their sickly thin appearance",Tue_Jul
2,"Dana Brownlee, ContributorPresident of Professionalism Matters, a corporate training/key...",BUSINESS,2017-04-10,"“I’m Sorry""--The Two Tragically Forgotten Words In Customer Service",https://www.huffingtonpost.com/entry/im-sorrythe-two-tragically-forgotten-words-in-customer_us_58eadad7e4b00dd8e016ed62,"Unfortunately, I was one of those frustrated passengers caught in the midst of the recent spring break travel nightmare that",Mon_Apr
3,,DIVORCE,2012-02-16,‘Your Divorce Ruined My Life' What To Do When Your Child Blames You,https://www.huffingtonpost.comhttp://www.theglobeandmail.com/life/parenting/advice/anthony-e-wolf/your-divorce-ruined-my-life-what-to-do-when-your-child-blames-you/article2340805/?utm_medium=Feeds:%20RSS/Atom&utm_source=Home&utm_content=2340805,It was Sunday night and Lucas’s mother had had it with her 15-year-old son. “I am just sick of this. Look at this mess. I,Thu_Feb
4,,DIVORCE,2012-04-13,"‘You Better Sit Down,' By The Civilians, At Flea Theater",https://www.huffingtonpost.comhttp://theater.nytimes.com/2012/04/13/theater/reviews/you-better-sit-down-by-the-civilians-at-flea-theater.html,"The Civilians, the enterprising troupe specializing in documentary theater drawn from interviews, takes its tape recorders",Fri_Apr


In [86]:
for i in tqdm(range(len(news_data['headline']))):
    buffer = ''
#     print(i, ":", news_data['headline'][i])
    for word in news_data['headline'][i].split():
        word = (''.join((c for c in word if c.isalnum())))
        word = word.lower()
        if word not in stop_words:
            buffer += word + ' '
    buffer = buffer.strip()
    news_data.at[i, 'headline'] = buffer

100%|██████████| 178760/178760 [00:06<00:00, 27462.78it/s]


### Lemmatization

In [87]:
lemmatizer = WordNetLemmatizer()

In [88]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [89]:
for i in tqdm(range(len(news_data['headline']))):
    buffer = ''
    for w in word_tokenize(news_data['headline'][i]):
        buffer += lemmatizer.lemmatize(w, pos = 'v') + ' '
    news_data.at[i, 'headline'] = buffer.strip()

100%|██████████| 178760/178760 [00:30<00:00, 5878.11it/s]


## Recommendation

In order to measure similarity, we need to convert text into a vector:

### 1. Vector using Bag of Words: 
A **Bag of Words (BoW)** method represents the occurence of words within a document. Here, each headline can be considered as a document and set of all headlines form a corpus. Using BoW approach, each document is represented by a _d-dimensional vector_, where d is total number of unique words in the corpus. The set of such unique words forms the Vocabulary.

In [90]:
headline_vectorizer = CountVectorizer()
headline_features = headline_vectorizer.fit_transform(news_data['headline'])

In [91]:
headline_features.get_shape()

(178760, 56596)

In [92]:
headline_features

<178760x56596 sparse matrix of type '<class 'numpy.int64'>'
	with 1252140 stored elements in Compressed Sparse Row format>

In [93]:
pd.set_option('display.max_colwidth', -1) # to display long headlines completely

In [94]:
def bow_model(row_index, num_similar_items):
    dist = pairwise_distances(headline_features, headline_features[row_index])
    indices = np.argsort(dist.ravel())[0:num_similar_items] # Flattening array
    df = pd.DataFrame(
        {'publish_date': news_data_orig['date'][indices].values,
         'headline' : news_data_orig['headline'][indices].values,
         'Similarity': dist[indices].ravel()
        }
    )
    print('Read article: \n\n{}\n'.format(news_data_orig['headline'][indices[0]]))
    print('Recommended Articles:')
    return df.iloc[1:,]

#### Results of Bag of Words

Below function recommends similar articles to the _read_ article based on the headline. It accepts two arguments _(i, n)_
- _i_: index of already _read_ article 
- _n_: number of articles to be recommended 

Based on the Euclidean distance it finds **n** nearest neighbors and recommends those.

In [95]:
bow_model(133, 11) # Change row for any other queried article

Read article: 

‘Game Of Thrones’ Character Might’ve Really Hinted At Highly Anticipated Grudge Match

Recommended Articles:


Unnamed: 0,publish_date,headline,Similarity
1,2015-05-04,Did 'Game Of Thrones' Just Kill Off Two Characters?,3.0
2,2015-06-15,Is That 'Game Of Thrones' Character Actually Dead?,3.0
3,2013-07-08,"It's Really Not You, It's Them",3.0
4,2016-04-03,"'Game Of Thrones' Is All About This Character, According To Math",3.0
5,2016-06-06,'Game Of Thrones' Might've Spoiled Its Own Cliffhanger,3.0
6,2014-06-16,The Tao of Game of Thrones,3.0
7,2012-04-01,To Be or Not to Be,3.162278
8,2015-01-12,What Being A Mother Is Really About,3.162278
9,2016-04-25,'Game Of Thrones' After Show Episode 1,3.162278
10,2015-06-15,Why Being Judgmental Is Really All About Me,3.162278


#### Disadvantages:

1. It gives very low importance to less frequently observed words in the corpus. Few words from the read article like _grudge_ appear less frequently in the entire corpus so **BoW** method does not recommend any article whose headline contains these words. Since _game of thrones_ is commonly observed in the corpus so it is recommending the articles with headline containing _game of thrones_.
2. BoW method doesn't preserve the order of words.

To overcome the first disadvantage we use **TF-IDF** method for feature representation.