In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load the input DataFrame from a pickle in the working directory. The
# 'nltk_stemmed.pkl' file is the active input; the commented-out line below
# loads 'lemmatized.pkl' instead.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that were produced by a trusted step of this pipeline.
#df = pd.read_pickle('lemmatized.pkl')
df = pd.read_pickle('nltk_stemmed.pkl')
# Last expression of the cell: display the column labels that are available.
df.columns

Index(['Text', 'Text_Processed', 'REFERENCE_ID', 'YEAR', 'TITLE'], dtype='object')

In [3]:
# Bag-of-words vectorization of the processed text column
from sklearn.feature_extraction.text import CountVectorizer

# Join the elements of every 'Text_Processed' entry with single spaces so that
# each row becomes one plain string (presumably the entries are token lists —
# verify against the preprocessing step).
df['Text_Processed_str'] = df['Text_Processed'].str.join(' ')

# Word-level counts, with the vocabulary capped at 5000 terms.
vectorizer = CountVectorizer(
    max_features=5000,
    analyzer='word',
)
x_counts = vectorizer.fit_transform(df['Text_Processed_str'])

In [5]:
#Tfidf encoding
# Re-weight the raw term counts in x_counts with TF-IDF, using the default
# TfidfTransformer settings. The fitted transformer is kept in tfidf_transformer.
# NOTE(review): the result is named X_train_tfidf, but no train/test split is
# visible in this notebook — the whole corpus is transformed here.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(x_counts)

In [7]:
#NMF topic modeling
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 20

# Configure the estimator first and fit it exactly once below. Previously the
# estimator was fitted at construction time and then fitted again on the same
# matrix, which doubled the cost of the most expensive step for an identical
# result (fit() re-initialises from scratch and is deterministic here because of
# init='nndsvd' and the fixed random_state).
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favour of
# alpha_W / alpha_H and removed in 1.2; this call needs updating on newer versions.
model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')

# The tf-idf matrix is transposed before fitting, so the rows NMF sees are terms
# and the columns are documents; components_ therefore has one column per document.
# fit() returns the estimator itself, so `nmf` and `model` are the SAME object:
# any later refit through either name changes both.
nmf = model.fit(X_train_tfidf.T)

# Alternative model kept for reference:
#nmf = LatentDirichletAllocation(n_components=no_topics, random_state=100).fit(X_train_tfidf.T)

In [None]:
#Dimensionality reduction
from sklearn.manifold import TSNE

# Embed the transposed NMF components into 3 dimensions. With the model fitted
# on the transposed tf-idf matrix, components_.T has one row per document.
# NOTE(review): this cell must run before any cell that refits the shared
# model/nmf estimator on the untransposed matrix, otherwise components_.T has
# one row per vocabulary term instead.
# t-SNE is stochastic, so the seed is pinned to make the embedding reproducible
# across kernel restarts (same seed value as the NMF model).
nmf_embedded = TSNE(n_components=3, perplexity=40, random_state=1).fit_transform(nmf.components_.T)

In [None]:
#join all data back together
# The embedding columns are labelled 0, 1, 2 and its rows carry a fresh 0..n-1 index.
reduced = pd.DataFrame(nmf_embedded)
# Embedding rows are in the same positional order as the rows of df, so df is
# given the same 0..n-1 index before joining. Joining on df's own index labels
# would misalign rows (or introduce NaNs) whenever df does not already have a
# default RangeIndex.
jdf = reduced.join(df.reset_index(drop=True))

#Match categories to original titles
# fit_transform REFITS the shared estimator (model and nmf are the same object),
# this time on the untransposed documents-by-terms matrix. After this line
# components_ has one column per vocabulary term, and topic_values has one row
# per document with one weight per topic.
topic_values = model.fit_transform(X_train_tfidf)
# Each document is assigned the topic with the largest weight.
jdf['category'] = topic_values.argmax(axis=1)

In [None]:
# Build a label for every topic from its highest-weighted vocabulary terms.
def get_nmf_topics(model, n_top_words, feature_names=None):
    """Return the top words of every topic of a fitted decomposition model.

    Parameters
    ----------
    model : object with a 2-D ``components_`` array
        One row per topic, one column per vocabulary term.
    n_top_words : int
        Number of highest-weighted terms to keep for each topic.
    feature_names : sequence of str, optional
        Term for each column of ``model.components_``. When omitted, the names
        are read from the notebook-level ``vectorizer``.

    Returns
    -------
    pandas.Series
        Indexed by topic number. Each value is the topic's top words in
        descending weight order, each followed by a single space (the trailing
        space is kept so the output format is unchanged).

    Raises
    ------
    ValueError
        If the number of columns in ``model.components_`` differs from the
        number of feature names.
    """
    if feature_names is None:
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement when it exists and fall back for older installations.
        getter = getattr(vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = vectorizer.get_feature_names
        feature_names = getter()

    components = model.components_
    if components.shape[1] != len(feature_names):
        # Without this guard a mismatch either raises an opaque IndexError or
        # silently maps term ids to the wrong words.
        raise ValueError(
            "model.components_ has %d columns but there are %d feature names; "
            "was the model fitted on the transposed matrix?"
            % (components.shape[1], len(feature_names))
        )

    word_dict = {}
    # Iterate over the model's own topics instead of a global topic count.
    for topic_idx, weights in enumerate(components):
        # argsort is ascending, so the reversed tail slice yields the ids of the
        # n_top_words largest weights, largest first.
        top_ids = weights.argsort()[:-n_top_words - 1:-1]
        word_dict[topic_idx] = "".join(feature_names[j] + " " for j in top_ids)

    return pd.Series(word_dict)

# One-column frame of topic labels (five strongest terms per topic), indexed by
# topic number.
top_words = pd.DataFrame(
    get_nmf_topics(nmf, 5),
    columns=['top words'],
)
# Attach the matching topic label to every row of jdf by pairing the label
# frame's index with jdf's 'category' column.
export = pd.merge(
    left=top_words,
    right=jdf,
    left_index=True,
    right_on='category',
)

In [None]:
# Persist the merged topic/document table as a pickle in the working directory.
export.to_pickle('clustered.pkl')

In [None]:
# Also write the same table as CSV in the working directory; with the default
# arguments the index is written as the first column.
export.to_csv('clustered.csv')

In [None]:
# Show the documentation of the fit_transform method itself. Passing the method
# (not its call result) avoids refitting the model and printing the generic
# help of the returned array.
help(nmf.fit_transform)

In [None]:
# NMF is a decomposition rather than a classifier, so it has no `classes_`
# attribute and accessing it raises AttributeError. The fitted number of
# topics is exposed as `n_components_`.
nmf.n_components_

In [17]:
# Print the TSNE class documentation. Relies on TSNE having been imported by
# the dimensionality-reduction cell earlier in the notebook.
help(TSNE)

Help on class TSNE in module sklearn.manifold._t_sne:

class TSNE(sklearn.base.BaseEstimator)
 |  t-distributed Stochastic Neighbor Embedding.
 |  
 |  t-SNE [1] is a tool to visualize high-dimensional data. It converts
 |  similarities between data points to joint probabilities and tries
 |  to minimize the Kullback-Leibler divergence between the joint
 |  probabilities of the low-dimensional embedding and the
 |  high-dimensional data. t-SNE has a cost function that is not convex,
 |  i.e. with different initializations we can get different results.
 |  
 |  It is highly recommended to use another dimensionality reduction
 |  method (e.g. PCA for dense data or TruncatedSVD for sparse data)
 |  to reduce the number of dimensions to a reasonable amount (e.g. 50)
 |  if the number of features is very high. This will suppress some
 |  noise and speed up the computation of pairwise distances between
 |  samples. For more tips see Laurens van der Maaten's FAQ [2].
 |  
 |  Read more in the