# Load Data

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import json
import re

In [2]:
df = pd.read_json('C:\\Users\\user\\Downloads\\News_Category_Datasets.json', lines = True)
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [3]:
df.describe() 

Unnamed: 0,category,headline,authors,link,short_description,date
count,200853,200853,200853.0,200853,200853.0,200853
unique,41,199344,27993.0,200812,178353.0,2309
top,POLITICS,Sunday Roundup,,https://www.huffingtonpost.comhttp://www.purpo...,,2013-01-17 00:00:00
freq,32739,90,36620.0,2,19712.0,100
first,,,,,,2012-01-28 00:00:00
last,,,,,,2018-05-26 00:00:00


In [4]:
df['category'].unique()

array(['CRIME', 'ENTERTAINMENT', 'WORLD NEWS', 'IMPACT', 'POLITICS',
       'WEIRD NEWS', 'BLACK VOICES', 'WOMEN', 'COMEDY', 'QUEER VOICES',
       'SPORTS', 'BUSINESS', 'TRAVEL', 'MEDIA', 'TECH', 'RELIGION',
       'SCIENCE', 'LATINO VOICES', 'EDUCATION', 'COLLEGE', 'PARENTS',
       'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE', 'HEALTHY LIVING',
       'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST', 'FIFTY', 'ARTS',
       'WELLNESS', 'PARENTING', 'HOME & LIVING', 'STYLE & BEAUTY',
       'DIVORCE', 'WEDDINGS', 'FOOD & DRINK', 'MONEY', 'ENVIRONMENT',
       'CULTURE & ARTS'], dtype=object)

In [5]:
#Sorting data according to date in ascending order
sorted_data=df.sort_values('date', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [6]:
sorted_data

Unnamed: 0,category,headline,authors,link,short_description,date
200852,SPORTS,Dwight Howard Rips Teammates After Magic Loss ...,,https://www.huffingtonpost.com/entry/dwight-ho...,The five-time all-star center tore into his te...,2012-01-28
200816,SCIENCE,"Robots Play Catch, Starring Agile Justin And R...",Travis Korte,https://www.huffingtonpost.com/entry/robots-pl...,"image 1: throw As Hizook reports, DLR started ...",2012-01-28
200815,SCIENCE,Russian Cargo Ship Docks At International Spac...,,https://www.huffingtonpost.com/entry/russian-c...,Gallery: Space Station's Expedition 30 Mission...,2012-01-28
200814,COMEDY,7 Amazing Name Generators (PHOTOS),Seena Vali,https://www.huffingtonpost.com/entry/7-amazing...,Let's be honest: most of our names are pretty ...,2012-01-28
200813,COMEDY,Mitt Romney Madness: Florida Edition (VIDEO),Ben Craw,https://www.huffingtonpost.com/entry/mitt-romn...,The apparent madness that gripped Mitt Romney ...,2012-01-28
...,...,...,...,...,...,...
20,WEIRD NEWS,Weird Father's Day Gifts Your Dad Doesn't Know...,David Moye,https://www.huffingtonpost.com/entry/weird-fat...,Why buy a boring tie when you can give him tes...,2018-05-26
21,ENTERTAINMENT,Twitter #PutStarWarsInOtherFilms And It Was Un...,Andy McDonald,https://www.huffingtonpost.com/entry/twitter-p...,"There's no such thing as too much ""Star Wars.""",2018-05-26
22,WEIRD NEWS,Mystery 'Wolf-Like' Animal Reportedly Shot In ...,Hilary Hanson,https://www.huffingtonpost.com/entry/montana-w...,“We have no idea what this was until we get a ...,2018-05-26
11,WORLD NEWS,South Korean President Meets North Korea's Kim...,,https://www.huffingtonpost.com/entry/south-kor...,The two met to pave the way for a summit betwe...,2018-05-26


In [7]:
sorted_data.shape

(200853, 6)

#  Data Cleaning : Deduplication

In [8]:
#Deduplication of entries
final=df.drop_duplicates(subset={"headline","authors","date","short_description"}, keep='first', inplace=False)
final.shape

(200749, 6)

In [9]:
# printing some random headline
sent_150 = final['headline'].values[150]
print(sent_150)
print("="*50)
sent_15 = final['headline'].values[15]
print(sent_15)

6 Sandy Hook Families, FBI Agent Sue Alex Jones For Defamation
Edward Snowden: There's No One Trump Loves More Than Vladimir Putin


# Preprocessing data

In [10]:
# Combining all the stundents 
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from bs4 import BeautifulSoup

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

preprocessed_sentence = []
# tqdm is for printing the status bar
for sentance in tqdm(final['headline'].values):
    sentance = re.sub(r"http\S+", "", sentance)  # remove urls from text python
    sentance = BeautifulSoup(sentance, 'lxml').get_text()  # remove all tags
    sentance = decontracted(sentance)     # replace english contractions
    sentance = re.sub("\S*\d\S*", "", sentance).strip()  #remove words with numbers python
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)  #remove spacial character
    sentance = ' '.join(token.lower() for token in nltk.word_tokenize(sentance) if token.lower() not in stopwords.words('english'))
    preprocessed_sentence.append(sentance.strip())



100%|█████████████████████████████████████████████████████████████████████████| 200749/200749 [26:40<00:00, 125.39it/s]


In [11]:
# printing same random headline
sent_150 = final['headline'].values[150]
print(sent_150)
print("*"*50)
sent_15 = final['headline'].values[15]
print(sent_15)
print("*"*50)
sent_25 = final['headline'].values[25]
print(sent_25)


6 Sandy Hook Families, FBI Agent Sue Alex Jones For Defamation
**************************************************
Edward Snowden: There's No One Trump Loves More Than Vladimir Putin
**************************************************
Thousands Travel Home To Ireland To Vote On Abortion Access


In [12]:
preprocessed_sentence[150]

'sandy hook families fbi agent sue alex jones defamation'

In [13]:
preprocessed_sentence[15]

'edward snowden one trump loves vladimir putin'

In [14]:
preprocessed_sentence[25]

'thousands travel home ireland vote abortion access'

In [15]:
final.dtypes

category                     object
headline                     object
authors                      object
link                         object
short_description            object
date                 datetime64[ns]
dtype: object

# TF-IDF

In [16]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_vect.fit(preprocessed_sentence)
print("some sample features(unique words in the corpus)",tf_idf_vect.get_feature_names()[0:10])
print('*'*50)

final_tf_idf = tf_idf_vect.transform(preprocessed_sentence)[:20000]
print("the type of count vectorizer ",type(final_tf_idf))
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_tf_idf.get_shape()[1])

some sample features(unique words in the corpus) ['aap', 'aaron', 'aaron carter', 'aaron paul', 'aaron rodgers', 'aaron schock', 'ab', 'abandon', 'abandoned', 'abandoning']
**************************************************
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (20000, 20179)
the number of unique words including both unigrams and bigrams  20179


# Compute Cosine Similarity

In [17]:
# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
final_tf_idf.toarray()

cosine_sim = cosine_similarity(final_tf_idf, final_tf_idf)
print(cosine_sim)

[[1.         0.         0.         ... 0.12038087 0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.05430482]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.12038087 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.05430482 0.         ... 0.         0.         1.        ]]


In [18]:
indices = pd.Series(final['headline'])
print(indices)

0         There Were 2 Mass Shootings In Texas Last Week...
1         Will Smith Joins Diplo And Nicky Jam For The 2...
2           Hugh Grant Marries For The First Time At Age 57
3         Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4         Julianna Margulies Uses Donald Trump Poop Bags...
                                ...                        
200848    RIM CEO Thorsten Heins' 'Significant' Plans Fo...
200849    Maria Sharapova Stunned By Victoria Azarenka I...
200850    Giants Over Patriots, Jets Over Colts Among  M...
200851    Aldon Smith Arrested: 49ers Linebacker Busted ...
200852    Dwight Howard Rips Teammates After Magic Loss ...
Name: headline, Length: 200749, dtype: object


# Suggesting News headline based on User Choice

In [19]:
def choice(headline, cosine_sim = cosine_sim):
    recommended_news = []
    idx = indices[indices == headline].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_10_indices = list(score_series.iloc[1:16].index)
    
    for i in top_10_indices:
        recommended_news.append(list(final['headline'])[i])
        
    return recommended_news

In [21]:
choice("Hugh Grant Marries For The First Time At Age 57")

['Hugh Grant Reveals Which Of His Co-Stars Wanted To Kill Him',
 'These Are The 10 Most Popular Destinations For First Time Travelers',
 'In Virginia, Ex-Felons Voted For The First Time After Regaining Their Rights',
 "J.Lo Explains The Meaning Of Her Album Title 'For The First Time'",
 'Senate Judiciary Committee Has 2 Black Members For First Time In Its History',
 'Kylie Jenner Spotted Out In Public For The First Time In Months',
 'U.S. Strikes ISIS In Yemen For The First Time, Killing Dozens Of Militants',
 'Surprise! Kanye West Performs Live For First Time In Nearly A Year',
 'British Vogue Features An Openly Trans Woman For The First Time',
 'Dodgers Reach World Series For First Time Since 1988',
 "'Will & Grace' Stars Perform Show's Theme Tune With Lyrics For First Time",
 'No. 11 Loyola Chicago Reaches Final Four For First Time Since 1963',
 "Germany's Far-Right Party Set To Enter Parliament For The First Time",
 "Elusive Monkey With ‘Beatles-Style' Moptop Seen For The First Tim