In [1]:
# Silence every warning for the rest of the session. This also hides library
# deprecation warnings, so remove the filter when debugging.
import warnings
warnings.filterwarnings('ignore')

# Python 2/3 compatibility: true division, print() as a function, unicode string literals.
from __future__ import division, print_function, unicode_literals

import matplotlib.pyplot as plt
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

import os
import numpy as np
import pandas as pd
import math

# Rich-display helpers from IPython.
from IPython.display import display, HTML

Dataset: https://www.kaggle.com/snapcrack/all-the-news

In [2]:
import random

# Absolute location of articles1.csv from the dataset referenced above.
path = '/home/pratikshasahu/Documents/EIDETIA/env/data/articles1.csv'

# Read only the first 1000 rows and expose the unnamed leading CSV column as 'index'.
df = (
    pd.read_csv(path, skiprows=0, nrows=1000)
      .rename(columns={'Unnamed: 0': 'index'})
)

df.head(5)

Unnamed: 0,index,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


The English stop-word list used below was built by Gerard Salton and Chris Buckley for the experimental SMART information retrieval system at Cornell University. It is available here: http://www.lextek.com/manuals/onix/stopwords2.html

In [3]:
import nltk
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Load the SMART stop-word list; the context manager closes the file even on error.
with open('/home/pratikshasahu/Documents/EIDETIA/env/smart_stopwords.txt') as myfile:
    stop_words = myfile.read().split()

# `stop_words` stays a list because it is passed to TfidfVectorizer later on;
# the set exists only for fast membership tests in the loop below.
stop_word_set = set(stop_words)

# One shared lemmatiser instead of constructing a new one for every article.
lem = WordNetLemmatizer()

corpus = []

for raw_text in df['content']:
    # Strip markup first: once punctuation has been removed there is no '<' or '>'
    # left to match, which made the original (HTML-escaped) tag pattern a no-op.
    text = re.sub("</?.*?>", " ", raw_text)

    # Keep ASCII letters only; digits, punctuation and symbols become spaces.
    # NOTE: accented letters are treated as separators too, so a name such as
    # 'Muñiz' is split into 'mu' and 'iz'.
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()

    # split() also collapses the runs of spaces produced above.
    tokens = [lem.lemmatize(word) for word in text.split() if word not in stop_word_set]

    corpus.append(" ".join(tokens))

corpus[:5]

['washington congressional republican fear health care lawsuit obama administration win incoming trump administration choose longer defend executive branch suit challenge administration authority spend billion dollar health insurance subsidy american handing house republican big victory issue sudden loss disputed subsidy conceivably health care program implode leaving million people access health insurance republican prepared replacement lead chaos insurance market spur political backlash republican gain full control government stave outcome republican find awkward position appropriating huge sum temporarily prop obama health care law angering conservative voter demanding end law year twist donald trump administration worried preserving executive branch prerogative choose fight republican ally house central question dispute eager avoid ugly political pileup republican capitol hill trump transition team gaming handle lawsuit election put limbo late february united state court appeal dis

In [4]:
# Tokenise every cleaned article on whitespace: one list of words per article.
corpus_list = [article.split() for article in corpus]

corpus_list[:5]

[['washington',
  'congressional',
  'republican',
  'fear',
  'health',
  'care',
  'lawsuit',
  'obama',
  'administration',
  'win',
  'incoming',
  'trump',
  'administration',
  'choose',
  'longer',
  'defend',
  'executive',
  'branch',
  'suit',
  'challenge',
  'administration',
  'authority',
  'spend',
  'billion',
  'dollar',
  'health',
  'insurance',
  'subsidy',
  'american',
  'handing',
  'house',
  'republican',
  'big',
  'victory',
  'issue',
  'sudden',
  'loss',
  'disputed',
  'subsidy',
  'conceivably',
  'health',
  'care',
  'program',
  'implode',
  'leaving',
  'million',
  'people',
  'access',
  'health',
  'insurance',
  'republican',
  'prepared',
  'replacement',
  'lead',
  'chaos',
  'insurance',
  'market',
  'spur',
  'political',
  'backlash',
  'republican',
  'gain',
  'full',
  'control',
  'government',
  'stave',
  'outcome',
  'republican',
  'find',
  'awkward',
  'position',
  'appropriating',
  'huge',
  'sum',
  'temporarily',
  'prop',
 

# Automated Keyword Extraction

> Automatic identification of the terms that best describe the subject of a document. These terms are variously called key phrases, key terms, key segments, or simply keywords; all of these names refer to the terms that carry the most relevant information contained in the document.

Method used: TfidfVectorizer

Reference: http://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.XLX1EUPhXeQ

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix 

def sort_coo(coo_matrix):
    """Return the matrix entries as (column, value) pairs, largest value first.

    Pairs are ordered by descending value; equal values are ordered by
    descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` (index, score) pairs to {feature name: rounded score}.

    feature_names -- sequence indexed by the first element of each pair
    sorted_items  -- sequence of (index, score) pairs, already in the desired order
    topn          -- number of leading pairs to keep

    Scores are rounded to three decimals; the dict keeps the order of `sorted_items`.
    """
    return {
        feature_names[position]: round(weight, 3)
        for position, weight in sorted_items[:topn]
    }

# NOTE: the vectorizer is re-fitted on one article at a time (see the loop below),
# so all terms share the same document frequency and the resulting scores are
# driven by within-article term frequency only. max_df=1 is an absolute document
# count, which is what allows fitting on a single document.
tv = TfidfVectorizer(max_df=1, min_df=0.05, stop_words=stop_words, max_features=10000,
                     ngram_range=(1, 3), smooth_idf=True, use_idf=True)
kw = []

for index in df['index']:

    tfidf_matrix = tv.fit_transform([corpus[index]])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back on older installations.
    if hasattr(tv, 'get_feature_names_out'):
        feature_names = tv.get_feature_names_out().tolist()
    else:
        feature_names = tv.get_feature_names()

    sorted_items = sort_coo(tfidf_matrix.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, topn=100)
    # keywords = {'republican': 0.296, 'house republican': 0.276, ...} <- for index = 0
    #            {'precinct': 0.586, 'detective': 0.403, ...} <- for index = 1

    # Only the terms are kept (highest score first); the scores are discarded.
    kw.append(list(keywords.keys()))

df['keywords'] = kw
df.head(5)

Unnamed: 0,index,id,title,publication,author,date,year,month,url,content,keywords
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...,"[house, republican, administration, health, tr..."
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood...","[precinct, detective, police, officer, year, f..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri...","[wong, artist, disney, father, work, tyrus, ch..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t...","[death, died, year, time, people, star, music,..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ...","[north, korea, missile, ballistic, test, north..."


# Topic Modelling

> Statistical modeling for discovering the abstract “topics” that occur in a collection of documents

Method used: LDA (Latent Dirichlet Allocation)

Reference: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

no_features = 1000   # vocabulary cap for the count vectorizer
no_topic = 1         # topics fitted per article
no_top_words = 10    # words kept per topic

# Collected per article and assigned to the frame in one step. Writing through
# `df.lda_topics[index] = ...` is chained assignment, which pandas does not
# guarantee to propagate to `df` (and does not under copy-on-write).
lda_topics = []

for index in df['index']:

    # NOTE: `corpus_list[index]` is a list of tokens, so every token of the article
    # is treated as its own one-word "document" by the vectorizer and by LDA.
    tf_vec = CountVectorizer(max_df=0.95, min_df=1, max_features=no_features)
    tf_mat = tf_vec.fit_transform(corpus_list[index])

    # get_feature_names() was removed in scikit-learn 1.2; fall back on older versions.
    if hasattr(tf_vec, 'get_feature_names_out'):
        tf_feature_names = tf_vec.get_feature_names_out().tolist()
    else:
        tf_feature_names = tf_vec.get_feature_names()

    lda = LatentDirichletAllocation(n_components=no_topic,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0
                                   ).fit(tf_mat)

    # Highest-weighted words of the topic. With no_topic > 1 the last topic wins,
    # which is what the original per-topic overwrite did.
    top_words = []
    for topic in lda.components_:
        top_words = [tf_feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    lda_topics.append(top_words)

df['lda_topics'] = lda_topics

df.head(5)

Unnamed: 0,index,id,title,publication,author,date,year,month,url,content,keywords,lda_topics
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...,"[house, republican, administration, health, tr...","[house, republican, administration, health, tr..."
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood...","[precinct, detective, police, officer, year, f...","[precinct, detective, fernandez, officer, poli..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri...","[wong, artist, disney, father, work, tyrus, ch...","[wong, disney, artist, father, work, chinese, ..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t...","[death, died, year, time, people, star, music,...","[death, died, year, time, people, life, day, m..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ...","[north, korea, missile, ballistic, test, north...","[north, korea, missile, ballistic, test, kim, ..."


# Document Classification

> Assign a document to one or more classes or categories

Methods Used: Jaccard Similarity & Cosine Similarity

Reference: https://towardsdatascience.com/overview-of-text-similarity-metrics-3397c4601f50

In [7]:
def file_to_list(file, base_dir='/home/pratikshasahu/Documents/EIDETIA/env/data/'):
    """Read a text file and return its lines without the trailing newline.

    file     -- file name, resolved relative to `base_dir`
    base_dir -- directory that holds the category files; defaults to the
                location this notebook was written against

    Blank lines are kept as empty strings. The file is closed before returning.
    """
    with open(os.path.join(base_dir, file)) as handle:
        return [line.rstrip('\n') for line in handle]

# One list of lines per category file, loaded in the order shown.
p_list, b_list, e_list, s_list, t_list = map(
    file_to_list,
    ['politics', 'business', 'entertainment', 'sports', 'technology'],
)

# Placeholder column; the classification loop fills it in.
df['category_js'] = ''

def jaccard_sim(str1, str2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two iterables.

    Both arguments are converted to sets first, so duplicates are ignored.
    Two empty inputs have no elements to compare; 0.0 is returned in that
    case instead of dividing by zero.
    """
    a = set(str1)
    b = set(str2)
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return float(len(a & b)) / union_size
     
# Category vocabularies in tie-break order: when two scores are equal the earlier
# entry wins, matching the priority of the original if/elif chain.
js_categories = [
    ('politics', p_list),
    ('business', b_list),
    ('entertainment', e_list),
    ('sports', s_list),
    ('tech', t_list),
]

category_js = []

for x in df['index']:
    article_keywords = df['keywords'][x]

    js_scores = [jaccard_sim(article_keywords, entry[1]) for entry in js_categories]

    # list.index returns the first position holding the maximum, i.e. the
    # highest-priority category among ties.
    best_position = js_scores.index(max(js_scores))
    category_js.append(js_categories[best_position][0])

# Assign the column in one step. The previous `df['category_js'][x] = ...` writes
# were chained assignments, which pandas does not guarantee to propagate to `df`.
df['category_js'] = category_js

df.head(10)

Unnamed: 0,index,id,title,publication,author,date,year,month,url,content,keywords,lda_topics,category_js
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...,"[house, republican, administration, health, tr...","[house, republican, administration, health, tr...",politics
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood...","[precinct, detective, police, officer, year, f...","[precinct, detective, fernandez, officer, poli...",politics
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri...","[wong, artist, disney, father, work, tyrus, ch...","[wong, disney, artist, father, work, chinese, ...",entertainment
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t...","[death, died, year, time, people, star, music,...","[death, died, year, time, people, life, day, m...",entertainment
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ...","[north, korea, missile, ballistic, test, north...","[north, korea, missile, ballistic, test, kim, ...",politics
5,5,17288,"Sick With a Cold, Queen Elizabeth Misses New Y...",New York Times,Sewell Chan,2017-01-02,2017.0,1.0,,"LONDON — Queen Elizabeth II, who has been b...","[queen, church, service, year, week, sandringh...","[queen, church, service, day, missed, year, mo...",politics
6,6,17289,Taiwan’s President Accuses China of Renewed In...,New York Times,Javier C. Hernández,2017-01-02,2017.0,1.0,,BEIJING — President Tsai of Taiwan sharpl...,"[tsai, taiwan, china, beijing, trump, island, ...","[tsai, taiwan, china, visit, beijing, transit,...",politics
7,7,17290,"After ‘The Biggest Loser,’ Their Bodies Fought...",New York Times,Gina Kolata,2017-02-08,2017.0,2.0,,"Danny Cahill stood, slightly dazed, in a blizz...","[weight, calorie, pound, people, contestant, d...","[weight, calorie, people, pound, dr, contestan...",tech
8,8,17291,"First, a Mixtape. Then a Romance. - The New Yo...",New York Times,Katherine Rosman,2016-12-31,2016.0,12.0,,"Just how is Hillary Kerr, the founder of ...","[kerr, leahy, friend, music, song, made, met, ...","[kerr, leahy, friend, music, song, made, bride...",entertainment
9,9,17292,Calling on Angels While Enduring the Trials of...,New York Times,Andy Newman,2016-12-31,2016.0,12.0,,Angels are everywhere in the Muñiz family’s ap...,"[mu, mu iz, iz, family, angel, jos, time, chil...","[mu, iz, family, angel, jos, time, child, york...",business


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

def file_to_str(file, base_dir='/home/pratikshasahu/Documents/EIDETIA/env/data/'):
    """Read a text file and return its lines joined by single spaces.

    file     -- file name, resolved relative to `base_dir`
    base_dir -- directory that holds the category files; defaults to the
                location this notebook was written against

    Only the trailing newline of each line is removed before joining.
    The file is closed before returning.
    """
    with open(os.path.join(base_dir, file)) as handle:
        return ' '.join(line.rstrip('\n') for line in handle)

# One space-joined string per category file, loaded in the order shown.
b_str, p_str, e_str, s_str, t_str = map(
    file_to_str,
    ['business', 'politics', 'entertainment', 'sports', 'technology'],
)

# Placeholder column; the classification loop fills it in.
df['category_cs'] = ''

def cosine_sim(*strs):
    """Return the cosine similarity between the first two of the given texts.

    A fresh TfidfVectorizer is fitted on all texts passed in, each text is
    turned into a dense TF-IDF vector, and entry [0][1] of the pairwise
    similarity matrix is returned.
    """
    documents = list(strs)
    tfidf = TfidfVectorizer()
    tfidf.fit(documents)
    dense_rows = tfidf.transform(documents).toarray()

    similarity_matrix = cosine_similarity(list(dense_rows))
    return similarity_matrix[0][1]
    

# Category reference texts in tie-break order: when two scores are equal the earlier
# entry wins, matching the priority of the original if/elif chain.
cs_categories = [
    ('politics', p_str),
    ('business', b_str),
    ('entertainment', e_str),
    ('sports', s_str),
    ('tech', t_str),
]

category_cs = []

for x in df['index']:
    # Build the keyword text once per article rather than once per category.
    keyword_text = ' '.join(df['keywords'][x])

    cs_scores = [cosine_sim(keyword_text, entry[1]) for entry in cs_categories]

    # list.index returns the first position holding the maximum, i.e. the
    # highest-priority category among ties.
    best_position = cs_scores.index(max(cs_scores))
    category_cs.append(cs_categories[best_position][0])

# Assign the column in one step. The previous `df['category_cs'][x] = ...` writes
# were chained assignments, which pandas does not guarantee to propagate to `df`.
df['category_cs'] = category_cs

df.head(10)

Unnamed: 0,index,id,title,publication,author,date,year,month,url,content,keywords,lda_topics,category_js,category_cs
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...,"[house, republican, administration, health, tr...","[house, republican, administration, health, tr...",politics,politics
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood...","[precinct, detective, police, officer, year, f...","[precinct, detective, fernandez, officer, poli...",politics,business
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri...","[wong, artist, disney, father, work, tyrus, ch...","[wong, disney, artist, father, work, chinese, ...",entertainment,entertainment
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t...","[death, died, year, time, people, star, music,...","[death, died, year, time, people, life, day, m...",entertainment,entertainment
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ...","[north, korea, missile, ballistic, test, north...","[north, korea, missile, ballistic, test, kim, ...",politics,tech
5,5,17288,"Sick With a Cold, Queen Elizabeth Misses New Y...",New York Times,Sewell Chan,2017-01-02,2017.0,1.0,,"LONDON — Queen Elizabeth II, who has been b...","[queen, church, service, year, week, sandringh...","[queen, church, service, day, missed, year, mo...",politics,sports
6,6,17289,Taiwan’s President Accuses China of Renewed In...,New York Times,Javier C. Hernández,2017-01-02,2017.0,1.0,,BEIJING — President Tsai of Taiwan sharpl...,"[tsai, taiwan, china, beijing, trump, island, ...","[tsai, taiwan, china, visit, beijing, transit,...",politics,sports
7,7,17290,"After ‘The Biggest Loser,’ Their Bodies Fought...",New York Times,Gina Kolata,2017-02-08,2017.0,2.0,,"Danny Cahill stood, slightly dazed, in a blizz...","[weight, calorie, pound, people, contestant, d...","[weight, calorie, people, pound, dr, contestan...",tech,tech
8,8,17291,"First, a Mixtape. Then a Romance. - The New Yo...",New York Times,Katherine Rosman,2016-12-31,2016.0,12.0,,"Just how is Hillary Kerr, the founder of ...","[kerr, leahy, friend, music, song, made, met, ...","[kerr, leahy, friend, music, song, made, bride...",entertainment,entertainment
9,9,17292,Calling on Angels While Enduring the Trials of...,New York Times,Andy Newman,2016-12-31,2016.0,12.0,,Angels are everywhere in the Muñiz family’s ap...,"[mu, mu iz, iz, family, angel, jos, time, chil...","[mu, iz, family, angel, jos, time, child, york...",business,entertainment


In [10]:
# Rows on which the Jaccard-based and cosine-based labels differ.
df[df['category_js'] != df['category_cs']]

Unnamed: 0,index,id,title,publication,author,date,year,month,url,content,keywords,lda_topics,category_js,category_cs
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood...","[precinct, detective, police, officer, year, f...","[precinct, detective, fernandez, officer, poli...",politics,business
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ...","[north, korea, missile, ballistic, test, north...","[north, korea, missile, ballistic, test, kim, ...",politics,tech
5,5,17288,"Sick With a Cold, Queen Elizabeth Misses New Y...",New York Times,Sewell Chan,2017-01-02,2017.0,1.0,,"LONDON — Queen Elizabeth II, who has been b...","[queen, church, service, year, week, sandringh...","[queen, church, service, day, missed, year, mo...",politics,sports
6,6,17289,Taiwan’s President Accuses China of Renewed In...,New York Times,Javier C. Hernández,2017-01-02,2017.0,1.0,,BEIJING — President Tsai of Taiwan sharpl...,"[tsai, taiwan, china, beijing, trump, island, ...","[tsai, taiwan, china, visit, beijing, transit,...",politics,sports
9,9,17292,Calling on Angels While Enduring the Trials of...,New York Times,Andy Newman,2016-12-31,2016.0,12.0,,Angels are everywhere in the Muñiz family’s ap...,"[mu, mu iz, iz, family, angel, jos, time, chil...","[mu, iz, family, angel, jos, time, child, york...",business,entertainment
10,10,17293,Weak Federal Powers Could Limit Trump’s Climat...,New York Times,Justin Gillis,2017-01-03,2017.0,1.0,,With Donald J. Trump about to take control of ...,"[trump, energy, global, wind, state, climate, ...","[year, trump, politics, reality, divorced, his...",politics,business
11,11,17294,Can Carbon Capture Technology Prosper Under Tr...,New York Times,John Schwartz,2017-01-05,2017.0,1.0,,"THOMPSONS, Tex. — Can one of the most promi...","[carbon, capture, carbon capture, plant, techn...","[carbon, capture, plant, technology, power, co...",business,tech
12,12,17295,"Mar-a-Lago, the Future Winter White House and ...",New York Times,Maggie Haberman,2017-01-02,2017.0,1.0,,"WEST PALM BEACH, Fla. — When Donald J. Tr...","[trump, club, member, guest, year, club member...","[trump, club, member, guest, year, property, n...",politics,sports
13,13,17296,How to form healthy habits in your 20s - The N...,New York Times,Charles Duhigg,2017-01-02,2017.0,1.0,,This article is part of a series aimed at help...,"[habit, reward, time, exercise, cue, start, in...","[plan, insurance, habit, reward, time, cue, ex...",business,tech
14,14,17297,Turning Your Vacation Photos Into Works of Art...,New York Times,Stephanie Rosenbloom,2017-04-14,2017.0,4.0,,It’s the season for family travel and photos ...,"[photo, print, site, scan, time, wood, fabric,...","[photo, print, site, scan, time, company, wood...",tech,entertainment
