In [546]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime

import nltk
import spacy
from textblob import TextBlob

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import words
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer, MWETokenizer

from nltk.sentiment import SentimentIntensityAnalyzer, vader


from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation, NMF

from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, MeanShift

from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import cosine_distance

from sklearn import preprocessing

from sklearn import metrics

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import joblib
pd.options.display.colheader_justify = 'right'
pd.options.display.column_space = 1
pd.options.display.expand_frame_repr = True
pd.options.display.max_colwidth = 120

In [548]:
savedir = './plots'

In [38]:
df = joblib.load('data/clean/clean_df.joblib')
df = df.loc[df.body != 'full quote', :] # drop the 'full quote' only comment
df = df[~df.body.str.contains("streamable mirror")] # Drop twitter bot comments

In [3]:
# don't use stemming with Word2Vec

In [4]:
# TODO: Properly do entire data set and a train_test_split later on
# df_small = df.sample(30000, random_state=0)

# df_small_train = df_small[:int(0.7 * df_small.shape[0])]
# df_small_test = df_small[int(0.7 * df_small.shape[0]):]

# TF

### Next steps:
1. I want to find the X topics/clusters about these texts. Perhaps it's fitness, politics, training, performance, GOAT.
2. Figure out which topic each document best corresponds to, and place the document within that topic/cluster
3. Chart a stacked-bar or a multi-line chart for each year (or month, or something else) from 2011-2018, and show
   the PERCENTAGE (Or COUNT) of each topic for that period of time.
4. Look at the insights! What does that mean?
5. Perhaps do sentiment analysis. See if I can find a well-trained model. Or train my own based on upvote? score? (Probably not doing this)

**NOTE**: Simply do LDA first, and make this work. Get a chart. Get some insights.
**THEN:** Make it repeatable. Get it in a function somehow. Try other Dim Red and Unclassifying and Topic modeling techniques. See what works best.

In [39]:
def see_lda_topics(vectorizer, n_topics, fit_lda, n_examples):
    '''
    Prints top 10 names assigned to the topics, and the topics themselves
    '''
    vocab = vectorizer.get_feature_names()
    for topic in range(n_topics):
        print(f"TOPIC {topic}")
        for j in np.argsort(-fit_lda.components_,1)[topic,:n_examples]:
            print(vocab[j])
        print()

In [105]:
words_corpus = set(words.words())
analyzer = CountVectorizer().build_analyzer()
stem = SnowballStemmer('english')

stops = stopwords.words('english')
stops += ['lebron', 'james', 'game', 'us', 'need', 'let', 'well', 'year', 'dude', 'could', 'couldnt',
          'gon', 'gonna', 'na', 'ca', 'really', 'man', 'much',# 'last', 'next',
          'just', 'lol', 'like', 'im', 'he', 'hes', 'would', 'get', 'going', 'got', 'every', 'shit',
          'doesnt', 'th', 'fuck', 'think', 'even', 'dont', 'even', 'pretty', 'really', 'one',
          'didnt', 'cant', 'say', 'see', 'look', 'go', 'said', 'also', 'still', 'good'
         ]
stops = set(stops)
acceptable_words = words_corpus - stops

# create a custom vectorizer class that inherits from base class
# add a few more custom preprocessing and tokenization steps
class CustomVectorizer(CountVectorizer):  
    def build_analyzer(self):
        stop_words = stops
        
        def analyzer(doc):
            
            # preprocess: do any further cleaning here if needed
            # example: remove everything but letters
#             cleaned_doc = re.sub(r"[^A-Za-z]", " ", doc)
            cleaned_doc = doc
    
            # instantiate snowball stemmer
            stemmer = SnowballStemmer("english")
            # create tokens
            tokens = [stemmer.stem(d) for d in word_tokenize(cleaned_doc) if d in acceptable_words]
            tokens = [token for token in tokens if len(token) > 3]
            
            # multi-word expression tokenizer
            tokenizer = MWETokenizer()
            tokenizer.add_mwe(('k', 'love'))
            tokenizer.add_mwe(('kevin', 'love'))
            tokenizer.add_mwe(('dwayne', 'wade'))
            tokenizer.add_mwe(('d', 'wade'))
            tokenizer.add_mwe(('jr', 'smith'))
            tokenizer.add_mwe(('j', 'r', 'smith'))
            tokenizer.add_mwe(('j', 'r'))
            # tokenize the tokens
            tokens = tokenizer.tokenize(tokens)
            
            return(self._word_ngrams(tokens, stop_words))
        return analyzer

In [50]:
# def english_corpus(doc, stemmer=stem):
#     clean_words = [stemmer.stem(w) for w in analyzer(doc) if w in acceptable_words]   
#     return [stemmer.stem(w) for w in analyzer(doc) if w in acceptable_words]

Current isues:
* can't use stemmer and n_grams at same time.
* still too many junk words, even with growing stop_list
* **don't have a lemmatizer in there. How can i do lemmatize, n_grams, stop words, and stemmer all in analyer?**

In [51]:
# BI GRAMS

In [52]:
# cVectorizer = CustomVectorizer(# strip_accents='ascii',
#                                max_df=0.70,
#                                min_df=4,
#                                stop_words = stops,
#                                ngram_range=(2,2)
#                               )

# dtm_tf_custom = cVectorizer.fit_transform(df_small_train.body)
# #dtm_tf_custom.shape

# for n_topic in range(4,10):
#     loop_lda_tf = LatentDirichletAllocation(n_components=n_topic, random_state=0)
#     loop_lda_tf.fit(dtm_tf_custom)
#     print(':::::')
#     print(f'Number of topics = {n_topic}')
#     print(':::::')
#     see_lda_topics(cVectorizer, n_topic, loop_lda_tf, 10)

In [53]:
# MOSTLY UNI GRAMS. ALLOWS BI AND TRI GRAMS

In [67]:
cVectorizer = CustomVectorizer(# strip_accents='ascii',
                               max_df=0.7,
                               min_df=4,
                               stop_words = stops,
                               ngram_range=(1,3)
                              )

dtm_tf_custom = cVectorizer.fit_transform(df.body)
#dtm_tf_custom.shape

for n_topic in range(7,10):
    loop_lda_tf = LatentDirichletAllocation(n_components=n_topic, random_state=0, n_jobs=-1)
    loop_lda_tf.fit(dtm_tf_custom)
    print(':::::')
    print(f'Number of topics = {n_topic}')
    print(':::::')
    see_lda_topics(cVectorizer, n_topic, loop_lda_tf, 15)

:::::
Number of topics = 7
:::::
TOPIC 0
one
season
better
player
best
leagu
point
ever
shoot
year
curri
make
per
defens
time

TOPIC 1
love
time
year
play
great
coach
defens
day
one
mani
last
actual
feel
post
back

TOPIC 2
win
team
could
top
basketbal
right
time
peopl
goat
take
player
make
last
probabl
one

TOPIC 3
team
play
pick
pass
best
take
player
anyon
way
better
bad
someon
els
big
everyon

TOPIC 4
wade
peopl
guy
come
heat
want
talk
new
know
confirm
give
nice
fan
team
hate

TOPIC 5
team
year
back
first
win
season
know
one
best
come
tripl
doubl
next
trade
make

TOPIC 6
watch
damn
dude
ball
high
play
jordan
call
know
actual
time
never
k
great
foul

:::::
Number of topics = 8
:::::
TOPIC 0
one
player
best
leagu
make
ever
shoot
season
point
seen
better
defens
curri
team
time

TOPIC 1
love
time
great
year
play
coach
defens
actual
day
one
post
way
lue
last
feel

TOPIC 2
win
basketbal
top
right
could
time
team
peopl
goat
take
player
make
beat
probabl
last

TOPIC 3
best
team
play
player
p

In [55]:
# BI and TRI-GRAMS

In [68]:
cVectorizer = CustomVectorizer(# strip_accents='ascii',
                               max_df=0.9,
                               min_df=2,
                               stop_words = stops,
                               ngram_range=(2,4)
                              )

dtm_tf_custom = cVectorizer.fit_transform(df_small.body)
#dtm_tf_custom.shape

for n_topic in range(2,6):
    loop_lda_tf = LatentDirichletAllocation(n_components=n_topic, random_state=0, n_jobs=-1)
    loop_lda_tf.fit(dtm_tf_custom)
    print(':::::')
    print(f'Number of topics = {n_topic}')
    print(':::::')
    see_lda_topics(cVectorizer, n_topic, loop_lda_tf, 15)

:::::
Number of topics = 2
:::::
TOPIC 0
last year
high school
tripl doubl
ever seen
regular season
next year
golden state
eastern confer
first round
dan gilbert
free throw
never seen
best ever
win championship
last season

TOPIC 1
best player
regular season
year old
player leagu
come back
best player leagu
god damn
one best
basketbal player
player world
wade bosh
last year
locker room
first time
watch play

:::::
Number of topics = 3
:::::
TOPIC 0
high school
ever seen
last year
eastern confer
one time
anyon els
derrick rose
point guard
never seen
win championship
top player
come back
role model
best ever
space jam

TOPIC 1
best player
player leagu
come back
best player leagu
locker room
year old
god damn
wade bosh
first time
make sure
big deal
back back
player world
three point
player team

TOPIC 2
last year
tripl doubl
regular season
next year
year old
basketbal player
one best
best player
golden state
free throw
dan gilbert
last season
player ever
everyon els
first round

:::::
Num

# Run chosen LDA model, cluster, then plot

In [4]:
cVectorizer = CustomVectorizer(# strip_accents='ascii',
                               max_df=0.7,
                               min_df=4,
                               stop_words = stops,
                               ngram_range=(1,3)
                              )

dtm_tf_custom = cVectorizer.fit_transform(df.body)
#dtm_tf_custom.shape

lda = LatentDirichletAllocation(n_components=9, random_state=0, n_jobs=-1)
#loop_lda_tf.fit(dtm_tf_custom)

transformed_lda = lda.fit_transform(dtm_tf_custom)

In [71]:
pyLDAvis.sklearn.prepare(lda, dtm_tf_custom, cVectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [202]:
# cossim = cosine_similarity(transformed_lda[:1000]) USE THIS ONE
#eucdist = euclidean_distances(transformed_lda[:20000])

# # generate cosine_similarity and euclidian distance scores. They seem quite similar. For now i'll choose cosine similarity
# print(df.body.iloc[6])
# print(df.body.iloc[7])
# print(cossim[6,7])
# print(eucdist[6,7])

# Use Clustering Algo to group documents into X num_topics by most similar topics

In [145]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=700, ngram_range=(1,3), stop_words=stops)
tfidf = tfidf_vectorizer.fit_transform(df.reset_index().body)
lda_model = LatentDirichletAllocation(n_components=16, max_iter=10, n_jobs=-1, learning_method='batch', learning_offset=50.,random_state=0).fit(tf)

# pyLDAvis.sklearn.prepare(lda_model, tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
# PICK A CLUSTERING ALGO THAT USES COSINE SIMILARITY
### PERHAPS DBSCAN
##### create clusters (start with a small number, cluster_num DOES NOT need to equal the number of topics)

#----------- Chad recommended this:
# plot by argmax (basically saying which topic is most dominant in each document)
# plot by a groupby(time) and then mean the values of each topic. so then i'll have "documents for jan 2011 have [t1, t2, t3, t4, t5] and plot that.
# look into a hard clusting like Kmeans (right now, let's put this behind us)

In [248]:
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

In [413]:
lda_W.shape

(45657, 16)

In [409]:
lda_W[:2]

array([[0.02083333, 0.35416664, 0.02083333, 0.02083333, 0.02083333,
        0.02083333, 0.35416667, 0.02083333, 0.02083333, 0.02083333,
        0.02083333, 0.02083333, 0.02083336, 0.02083333, 0.02083333,
        0.02083333],
       [0.25881531, 0.00892858, 0.15211729, 0.00892857, 0.00892857,
        0.16842655, 0.00892859, 0.3134979 , 0.00892857, 0.00892859,
        0.00892858, 0.0089286 , 0.00892857, 0.00892857, 0.00892858,
        0.00892857]])

In [412]:
lda_topic_index = []
for i in lda_W:
    lda_topic_index.append(np.argmax(i))
len(lda_topic_index)

45657

In [415]:
lda_topic_index[0:13]

[6, 7, 13, 7, 0, 5, 2, 2, 7, 5, 4, 1, 5]

In [422]:
lda_topics_df = pd.DataFrame(lda_topic_index, copy=True, columns=['topics'])
lda_topics_df['time'] = df.reset_index()['created_utc']

# Pickle Models for plots in other notebook

In [795]:
model_dir = './pickled_models'
joblib.dump(lda_W, f'{model_dir}/lda_W.joblib')
joblib.dump(lda_topics_df, f'{model_dir}/lda_topics_df.joblib')