In [1]:
# Imports would go here

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime

import nltk
import spacy
from textblob import TextBlob

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import words
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer, MWETokenizer

from nltk.sentiment import SentimentIntensityAnalyzer, vader


from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation, NMF

from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, MeanShift

from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import cosine_distance

from sklearn import preprocessing

from sklearn import metrics

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import joblib
pd.options.display.colheader_justify = 'right'
pd.options.display.column_space = 1
pd.options.display.expand_frame_repr = True
pd.options.display.max_colwidth = 120

In [2]:
df = joblib.load('data/clean/clean_df.joblib')
df = df.loc[df.body != 'full quote', :] # drop the 'full quote' only comment
df = df[~df.body.str.contains("streamable mirror")] # Drop twitter bot comments

In [3]:
words_corpus = set(words.words())
analyzer = CountVectorizer().build_analyzer()
stem = SnowballStemmer('english')

stops = stopwords.words('english')
stops += ['lebron', 'james', 'game', 'us', 'need', 'let', 'well', 'year', 'dude', 'could', 'couldnt',
          'gon', 'gonna', 'na', 'ca', 'really', 'man', 'much',# 'last', 'next',
          'just', 'lol', 'like', 'im', 'he', 'hes', 'would', 'get', 'going', 'got', 'every', 'shit',
          'doesnt', 'th', 'fuck', 'think', 'even', 'dont', 'even', 'pretty', 'really', 'one',
          'didnt', 'cant', 'say', 'see', 'look', 'go', 'said', 'also', 'still', 'good'
         ]
stops = set(stops)
acceptable_words = words_corpus - stops

# create a custom vectorizer class that inherits from base class
# add a few more custom preprocessing and tokenization steps
class CustomVectorizer(CountVectorizer):  
    def build_analyzer(self):
        stop_words = stops
        
        def analyzer(doc):
            
            # preprocess: do any further cleaning here if needed
            # example: remove everything but letters
#             cleaned_doc = re.sub(r"[^A-Za-z]", " ", doc)
            cleaned_doc = doc
    
            # instantiate snowball stemmer
            stemmer = SnowballStemmer("english")
            # create tokens
            tokens = [stemmer.stem(d) for d in word_tokenize(cleaned_doc) if d in acceptable_words]
            tokens = [token for token in tokens if len(token) > 3]
            
            # multi-word expression tokenizer
            tokenizer = MWETokenizer()
            tokenizer.add_mwe(('k', 'love'))
            tokenizer.add_mwe(('kevin', 'love'))
            tokenizer.add_mwe(('dwayne', 'wade'))
            tokenizer.add_mwe(('d', 'wade'))
            tokenizer.add_mwe(('jr', 'smith'))
            tokenizer.add_mwe(('j', 'r', 'smith'))
            tokenizer.add_mwe(('j', 'r'))
            # tokenize the tokens
            tokens = tokenizer.tokenize(tokens)
            
            return(self._word_ngrams(tokens, stop_words))
        return analyzer

# Test NMF and Display topics better

In [4]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print("")
        print(f"Topic {topic_idx}")
        print("TOP WORDS:", " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort(W[:,topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print("DOC:", documents[doc_index][:100])

dataset = df.reset_index()
documents = dataset.body

no_features = 700

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, ngram_range=(1,3), stop_words=stops)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
# tf_vectorizer = CustomVectorizer(# strip_accents='ascii',
#                                max_df=0.95,
#                                min_df=2,
#                                stop_words = stops,
#                                max_features=no_features,
#                                ngram_range=(1,3)
#                               )
# tf = tf_vectorizer.fit_transform(documents)
# tf_feature_names = tf_vectorizer.get_feature_names()

In [7]:
# no_topics = 30

for no_topics in range(25, 31, 1):

    # Run NMF
    nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
    nmf_W = nmf_model.transform(tfidf)
    nmf_H = nmf_model.components_

#     # Run LDA
#     lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=10, n_jobs=-1, learning_method='batch', learning_offset=50.,random_state=0).fit(tf)
#     lda_W = lda_model.transform(tf)
#     lda_H = lda_model.components_

    no_top_words = 14
    no_top_documents = 8
    print("")
    print(f"{no_topics} TOPICS for TF-IDF and NMF RESULTS")
    print("")
    display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
#     print("")
#     print(f"{no_topics} TOPICS for LDA and DTM RESULTS")
#     print("")
#     display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)


25 TOPICS for TF-IDF and NMF RESULTS


Topic 0
TOP WORDS: know years way make players never take first point getting guys want ball games
DOC: it's hard to take the long view on james simple as that when i read bill simmons' book something cli
DOC: from this article some excerpts james sometimes wonders what would happen if he took as many shots a
DOC: i don't know what happened i just know the shots wouldn't stop going in after about the fifth dagger
DOC: for anyone who doesn't want to have to leave reddit q first the obvious how are you feeling lebron j
DOC: is lebron james coasting is the heat star going easy during this stretch of the long game slog by to
DOC: i don't know if the ncaa is corrupt but i do know that it cares way too much about the amateur statu
DOC: i played against lebron a handful of times when he was a freshman so i have a little experience in t
DOC: it makes me so sick seeing all of you r nba psychics saying that lebron leaving is a foregone conclu

Topic 1
TOP 

In [231]:
ms = MeanShift(bin_seeding=True, cluster_all=False, n_jobs=-1)
ms_model = ms.fit(lda_W)

In [233]:
ms_labels = ms.labels_
# Number of clusters in labels, ignoring noise if present.
ms_n_clusters_ = len(set(ms_labels)) - (1 if -1 in ms_labels else 0)
print('Estimated number of clusters: %d' % ms_n_clusters_)

unique, counts = np.unique(ms_labels, return_counts=True)
print('counts')
print(np.asarray((unique, counts)).T)

Estimated number of clusters: 1
counts
[[   -1  6335]
 [    0 39322]]


In [9]:
# DBSCAN MODEL
db = None
db = DBSCAN(eps=0.01, min_samples=25, metric='cosine', leaf_size=10, n_jobs=-1).fit(lda_model)

labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)

unique, counts = np.unique(labels, return_counts=True)
print('counts')
print(np.asarray((unique, counts)).T)

Estimated number of clusters: 2
counts
[[   -1  6116]
 [    0 36081]
 [    1  3474]]
