# Thoughts on building a recommender system to find users

In [1]:
# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pickle 

# nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline

#sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# spacy
import spacy

from preprocessing_funcs import clean_tweet, get_hashtags, get_emojis

In [2]:
# Load the pre-cleaned tweets frame from disk.
# NOTE: unpickling executes whatever the file encodes, so only load pickles
# that this project produced itself.
with open('../data_files/processed_tweets.pickle', mode='rb') as read_file:
    df = pickle.load(read_file)

In [3]:
# Peek at the first rows: raw tweet `text` next to its cleaned `processed` form.
df.head()

Unnamed: 0,text,processed
0,#IslamKills Are you trying to say that there w...,islam kill try say terrorist attack europe ref...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trump apologize attack little
2,RT @ltapoll: Who was/is the best president of ...,well president past retweet
3,RT @jww372: I don't have to guess your religio...,guess religion christmas aftermath
4,RT @Shareblue: Pence and his lawyers decided w...,pence lawyer decide official email public can see


In [4]:
# Raw tweet export with per-tweet metadata (user, timestamps, hashtags, ...).
# pandas infers the zip compression from the file extension.
text = pd.read_csv(filepath_or_buffer='../data_files/tweets.csv.zip')
text.head(n=3)

Unnamed: 0,user_id,user_key,created_at,created_str,retweet_count,retweeted,favorite_count,text,tweet_id,source,hashtags,expanded_urls,posted,mentions,retweeted_status_id,in_reply_to_status_id
0,1868981000.0,ryanmaxwell_1,1458672000000.0,2016-03-22 18:31:42,,,,#IslamKills Are you trying to say that there w...,7.12346e+17,,"[""IslamKills""]",[],POSTED,[],,
1,2571870000.0,detroitdailynew,1476133000000.0,2016-10-10 20:57:00,0.0,False,0.0,"Clinton: Trump should’ve apologized more, atta...",7.855849e+17,"<a href=""http://twitterfeed.com"" rel=""nofollow...",[],"[""http://detne.ws/2e172jF""]",POSTED,[],,
2,1710805000.0,cookncooks,1487767000000.0,2017-02-22 12:43:43,,,,RT @ltapoll: Who was/is the best president of ...,8.343832e+17,,[],[],POSTED,[],,


In [5]:
# Keep only the author handle and the creation timestamp string of each tweet.
tweets_details = text.loc[:, ['user_key', 'created_str']]
tweets_details.head(n=3)

Unnamed: 0,user_key,created_str
0,ryanmaxwell_1,2016-03-22 18:31:42
1,detroitdailynew,2016-10-10 20:57:00
2,cookncooks,2017-02-22 12:43:43


In [8]:
# Cleaned tweet text: the corpus fed to every vectorizer below.
proc_tweets = df['processed']

## NMF Topic Modeling

### CountVectorizer

In [None]:
# Binary bag-of-words: records whether a term is present rather than how often,
# with English stop words removed.
# Bigrams can be enabled by adding ngram_range=(1, 2).
vectorizer = CountVectorizer(binary=True,
                             stop_words='english')

In [None]:
%%time

# Seeded so the extracted topics are reproducible across runs
# (same seed as the TF-IDF model further down).
nmf_cv = NMF(n_components=12, random_state=42)

# Pipeline of cv and nmf, fit and applied to docs.
nfm_transformer = Pipeline([('cv', vectorizer),
                            ('nmf', nmf_cv)])

# fit_transform (not fit): fit alone returns the fitted Pipeline object,
# whereas nmf_matrix is meant to hold the document-topic weights.
nmf_matrix = nfm_transformer.fit_transform(proc_tweets)

In [11]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``, iterable of per-topic weight
        arrays (each must support ``argsort``).
    feature_names : sequence of str indexed by feature position.
    no_top_words : int, how many top terms to print per topic.
    topic_names : optional sequence of labels; a missing or falsy entry
        falls back to printing the topic's numeric index.
    """
    for ix, weights in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        print(", ".join(feature_names[i] for i in top_indices))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on older installs that do not provide it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Top 10 terms for each of the count-vectorizer NMF topics.
display_topics(nmf_cv, terms, 10)

### TF-IDF

In [14]:
# TF-IDF over unigrams through trigrams, with smoothed IDF weighting and
# English stop words removed.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
)

In [21]:
%%time

# 16-topic factorization, seeded for reproducibility.
nmf_tfidf = NMF(n_components=16, random_state=42)

# Chain the TF-IDF vectorizer and NMF so the processed text maps straight
# to per-document topic weights.
nfm_transformer = Pipeline(steps=[
    ('tfidf', tfidf),
    ('nmf', nmf_tfidf),
])

nmf_matrix = nfm_transformer.fit_transform(proc_tweets)



CPU times: user 4min 39s, sys: 17.9 s, total: 4min 57s
Wall time: 3min 48s


In [22]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on older installs that do not provide it yet.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = tfidf.get_feature_names_out()
else:
    terms = tfidf.get_feature_names()

# Top 20 n-grams for each of the TF-IDF NMF topics.
display_topics(nmf_tfidf, terms, 20)


Topic  0
donald, donald trump, real, real donald trump, real donald, trump, donald trump hillary, donald trump trump, trump real, trump real donald, trump hillary, donald trump make, maga, trump hillary clinton, donald trump president, trump trump, donald trump say, country, clinton real, support

Topic  1
twitter, conservative, conservative twitter, christian conservative, christian, christian conservative twitter, conservative twitter christian, twitter christian, twitter christian conservative, tea, party, tea party, twitter conservative, conservative twitter conservative, twitter conservative twitter, gop, twitter patriot, conservative twitter patriot, twitter patriot journalist, conservative twitter tea

Topic  2
hillary, clinton, hillary clinton, email, campaign, politic, crook, crook hillary, lie, trump hillary, clinton campaign, prison, foundation, obama, trust, trust hillary, clinton foundation, fbi, thing trust, thing trust hillary

Topic  3
isis, ice isis, ice, target, op i

In [25]:
# Column count of the document-topic matrix: one per NMF component (16 here).
nmf_matrix.shape[1]

16

In [27]:
# One column label per NMF component: topic_0, topic_1, ...
topic_cols = ['topic_{}'.format(idx) for idx in range(nmf_matrix.shape[1])]
topic_cols

['topic_0',
 'topic_1',
 'topic_2',
 'topic_3',
 'topic_4',
 'topic_5',
 'topic_6',
 'topic_7',
 'topic_8',
 'topic_9',
 'topic_10',
 'topic_11',
 'topic_12',
 'topic_13',
 'topic_14',
 'topic_15']

In [28]:
# Wrap the raw NMF output in a labeled frame: one row per tweet, one column per topic.
doc_topic_matrix = pd.DataFrame(data=nmf_matrix, columns=topic_cols)
doc_topic_matrix.head(n=3)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15
0,0.0,0.000564,0.000688,0.001131815,0.003101,0.000887,0.001131,0.000427,0.0,0.000216,0.000374,0.015512,0.000233,0.0,0.002951,0.0
1,0.0,0.0,0.01524,0.0,0.017806,0.0,0.0,0.0,3.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000315,0.0,2.068372e-07,0.0,0.0,0.0,0.0,7.2e-05,0.0,0.000289,0.0,7.1e-05,7.5e-05,0.000352,0.038215
