<a href="https://colab.research.google.com/github/s-miramontes/News_Filter/blob/master/scripts/clustering_tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation of Clustering with tf-idf

In [1]:
# import statements 

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans

from sklearn.metrics.pairwise import cosine_similarity

from joblib import Parallel, delayed

import heapq
import operator

from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/erusson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import Eval Data

In [2]:
# read the annotated evaluation articles (path is relative to the working directory)
eval_csv_path = "news_filter/data/eval_data.csv"
eval_data = pd.read_csv(eval_csv_path)

# (rows, columns) of the loaded frame
eval_data.shape

(200, 13)

In [3]:
# preview the first rows of the evaluation data
eval_data.head()

Unnamed: 0.1,index,Unnamed: 0,id,title,publication,author,date,year,month,url,content,label,text
0,74496,77946,118473,"Chaos in the Family, Chaos in the State: The W...",National Review,Kevin D. Williamson,2016-03-17,2016.0,3.0,http://www.nationalreview.com/article/432876/d...,Michael Brendan Dougherty is bitter. I think t...,0,"Chaos in the Family, Chaos in the State: The W..."
1,71184,74592,113594,US Civil Rights Commission Will Observe Standi...,Buzzfeed News,Nidhi Subbaraman,2016-12-08,2016.0,12.0,https://web.archive.org/web/20161208153906/htt...,WASHINGTON — The US Commission on Civil Ri...,0,US Civil Rights Commission Will Observe Standi...
2,120205,123668,184574,"Venezuela hunts rogue helicopter attackers, Ma...",Reuters,Andrew Cawthorne and Victoria Ramirez,2017-06-29,2017.0,6.0,http://www.reuters.com/article/us-venezuela-po...,The Venezuelan government hunted on Wednesday...,0,"Venezuela hunts rogue helicopter attackers, Ma..."
3,128977,132440,199665,Fruit juice isn’t much better for you than sod...,Vox,Julia Belluz,2016/3/25,2016.0,3.0,http://www.vox.com/2016/3/25/11305614/soda-jui...,One of the biggest public health wins of rece...,0,Fruit juice isn’t much better for you than sod...
4,134837,138300,208223,Sessions won’t testify at congressional budget...,Washington Post,Sari Horwitz,2017-06-10,2017.0,6.0,https://web.archive.org/web/20170611000758/htt...,"Attorney General Jeff Sessions, who had agree...",1,Sessions won’t testify at congressional budget...


## Create tf-idf Matrix of Eval Data

In [0]:
# define tokenizer for tf-idf vectorizer

def preprocess_text(text):
  """Lower-case, tokenize, stop-word-filter and stem one document.

  Parameters
  ----------
  text : str
      Raw document text.

  Returns
  -------
  list of str
      Snowball-stemmed tokens in document order, with the NLTK English
      stop words removed.
  """
  # every non-word character (and underscore) becomes a space, so a plain
  # whitespace split yields the tokens
  tokens = re.sub(r'[\W_]', ' ', text.lower()).split()

  # remove stop words and stem
  stop_words = set(stopwords.words('english'))
  stemmer = SnowballStemmer("english")

  # stem in-process: dispatching single tokens to a joblib worker pool costs
  # far more than the stemming itself and produces the same list
  return [stemmer.stem(t) for t in tokens if t not in stop_words]

In [0]:
# set number of features to be proportional to 5k features for 13k samples;
# the sample count is read from the data instead of being hard-coded
REFERENCE_FEATURES = 5000
REFERENCE_SAMPLES = 13000
num_features = int(len(eval_data) * REFERENCE_FEATURES / REFERENCE_SAMPLES)

# instantiate tfidf vectorizer
# NOTE(review): preprocess_text already drops NLTK stop words and stems the
# tokens, so stop_words='english' is matched against stemmed tokens; this is
# presumably what triggers the stop-word consistency warning printed when the
# vectorizer is fitted -- kept unchanged so the results stay the same, TODO confirm
tfidf_vectorizer = TfidfVectorizer(
    max_features=num_features,
    stop_words='english',
    use_idf=True,
    tokenizer=preprocess_text,
    ngram_range=(1, 3),
)

In [6]:
# learn the vocabulary and idf weights from the article text, then encode
# every article as one tf-idf row (fit and transform)
article_text = eval_data["text"]
eval_matrix = tfidf_vectorizer.fit_transform(article_text)
print(eval_matrix.shape)

  'stop_words.' % sorted(inconsistent))


(200, 76)


In [7]:
# sanity check: pairwise cosine similarity between all articles
# (with a single argument the matrix is compared against itself)
cosine_similarity(eval_matrix)

array([[1.        , 0.27673884, 0.22421466, ..., 0.07289879, 0.38299339,
        0.28004707],
       [0.27673884, 1.        , 0.34701032, ..., 0.33303765, 0.31704066,
        0.2346992 ],
       [0.22421466, 0.34701032, 1.        , ..., 0.30553886, 0.32502581,
        0.20244243],
       ...,
       [0.07289879, 0.33303765, 0.30553886, ..., 1.        , 0.19593706,
        0.0555759 ],
       [0.38299339, 0.31704066, 0.32502581, ..., 0.19593706, 1.        ,
        0.17704205],
       [0.28004707, 0.2346992 , 0.20244243, ..., 0.0555759 , 0.17704205,
        1.        ]])

## Assign Cluster to Input 

In [0]:
# example user inputs -- one free-text topic per entry
input_topics = [
    "Russian interference with election",
    "Wikileaks hacking Hillary Clinton's email",
    "democratic campaigns",
    "Trump against Obamacare",
    "Trump's Campaign",
]

In [9]:
# encode the user topics with the already-fitted vectorizer (transform only)
input_matrix = tfidf_vectorizer.transform(input_topics)
input_shape = input_matrix.shape
print(input_shape)

(5, 76)


In [10]:
# data frame of tf-idf cosine similarities: one row per user topic,
# one column per article title
topic_article_sim = cosine_similarity(input_matrix, eval_matrix)
cos_df = pd.DataFrame(
    topic_article_sim,
    index=input_topics,
    columns=eval_data.title,
)

cos_df.shape

(5, 200)

In [11]:
# function to return the column index of the top n values in a row of a dataframe
def find_topind(df, i, n):
  """Return the column positions of the n largest values in row i of df.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose row is ranked.
  i : int
      Positional index of the row to inspect.
  n : int
      Maximum number of positions to return.

  Returns
  -------
  list of int
      Column positions ordered from largest to smallest value. Shorter than
      n when the row has fewer than n columns; empty when n is 0 or the row
      is empty (the zip-unpacking form raised IndexError in that case).
  """
  row = df.iloc[i, :]
  # pair each value with its position so the position survives the ranking
  ranked = heapq.nlargest(n, enumerate(row), key=operator.itemgetter(1))
  return [position for position, _ in ranked]

# function to look up the entries of a list at the given indices
def find_top(lst, ind):
  """Return the elements of lst selected by the indices in ind, in that order."""
  picked = []
  for position in ind:
    picked.append(lst[position])
  return picked

# how many articles/cluster
n = 10

# find index of n most similar titles, one list per input topic
# (plain comprehension: the handful of tiny lookups does not justify a
# worker pool, and the resulting lists are the same)
top_ind = [find_topind(cos_df, i, n) for i in range(len(cos_df))]

# show most similar titles -- predicted clusters
top = [find_top(eval_data.title, ind) for ind in top_ind]

top[:5]
# topics: "Russian interference with election", "Wikileaks hacking Hillary Clinton's email", "democratic campaigns", "Trump against Obamacare", "Trump's Campaign"

[['From disputes to a breakup: wounds still raw after U.S. election',
  'Emmanuel Macron Declared Next French President',
  'Jill Stein files for recount in\xa0Wisconsin',
  'Use the Electoral College Properly or Lose It',
  'The Election Highlighted a Growing Rural-Urban Split ',
  'North Carolina Democrats win extension of voter registration deadline',
  'Joss Whedon: ’I Want a Rhino to F*ck Paul Ryan to Death’',
  'Sessions won’t testify at congressional budget hearings but at Senate intelligence hearing instead',
  'Donald Trump is making the peso great\xa0again',
  'Let’s Agree Not To Lie About GOPCare'],
 ['Bill Clinton’s birthday present from granddaughter Charlotte',
  'Hillary Campaign Denies Report of Campaign Shake-up After New Hampshire ',
  'TRUMP: Adversaries of US ’almost certainly have a blackmail file’ on Clinton',
  'AP: Gov’t Declares 22 Clinton Emails ’Top Secret’',
  'FBI: Clinton ’extremely careless’ but no charges recommended',
  'BIAS ALERT: CNN reporter says Ha

In [12]:
# original annotations of articles in each cluster (true and false positives)
# (computed in-process; a worker pool adds only overhead for these lookups)
top_lab = [find_top(eval_data.label, ind) for ind in top_ind]

top_lab[:5]

[[0, 0, 0, 3, 0, 0, 0, 1, 0, 0],
 [0, 3, 2, 2, 2, 0, 2, 5, 3, 5],
 [0, 0, 3, 5, 0, 3, 3, 3, 0, 0],
 [0, 5, 0, 0, 5, 1, 0, 5, 5, 0],
 [5, 5, 5, 0, 0, 3, 0, 5, 5, 0]]

In [0]:
# how many articles annotated to each label (ground truth):
# maps label value -> number of articles carrying it
label_counts = Counter(eval_data["label"].tolist())
num_lab = dict(label_counts)

In [14]:
# how many of articles land in correct cluster (true positives)
# the k-th cluster (1-based) is scored against label value k
true_pos = [
  sum([lab == label_id for lab in cluster])
  for label_id, cluster in enumerate(top_lab, start=1)
]
true_pos

[1, 4, 4, 0, 5]

In [15]:
# percent of correct predictions out of original assignments (recall)
# true_pos is ordered by label, starting at label 1
recall = [hits / num_lab[label_id] for label_id, hits in enumerate(true_pos, start=1)]

# average recall
np.mean(recall)

0.4083333333333333

In [16]:
# percent of correct predictions out of all predictions (precision);
# every cluster holds n predictions
precision = [hits / n for hits in true_pos]

# average precision
np.mean(precision)

0.27999999999999997