# Import necessary dependencies

In [9]:
import warnings
import numpy as np
import pyLDAvis.sklearn
from sklearn.decomposition import NMF
import utils
from sklearn.feature_extraction.text import TfidfVectorizer

# Number of topics requested from each topic model in this notebook.
total_topics = 10

# Silence every warning category for the rest of the session
# (inserted before pyLDAvis is configured so it applies there as well).
warnings.simplefilter("ignore")

# Have pyLDAvis render its visualizations inline in the notebook.
pyLDAvis.enable_notebook()


# Load and normalize data

In [2]:
# Load both review sets through the project helper, positive first.
# NOTE(review): what `utils.readFromDisk` returns is not visible here --
# presumably an iterable of review texts, since both results are passed to
# TfidfVectorizer.fit_transform further down; confirm against utils.
positive_reviews, negative_reviews = (
    utils.readFromDisk(dataset_name)
    for dataset_name in ('reviews_positive', 'reviews_negative')
)



                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


# Topic modeling of positive reviews

In [None]:
# Both sentiment subsets use the same TF-IDF settings, but each gets its own
# vectorizer instance so that the two vocabularies are fitted independently.
# Alternative configuration left in by the author (no max_features / stop_words):
#   TfidfVectorizer(use_idf=True, min_df=2, max_df=0.95, ngram_range=(1,2), sublinear_tf=True)
tfidf_params = dict(
    max_df=0.95,
    min_df=2,
    max_features=100,
    stop_words='english',
    use_idf=True,
    ngram_range=(1, 2),
    sublinear_tf=True,
)

# tf-idf features for the positive reviews only
ptvf = TfidfVectorizer(**tfidf_params)
ptvf_features = ptvf.fit_transform(positive_reviews)

# tf-idf features for the negative reviews only
ntvf = TfidfVectorizer(**tfidf_params)
ntvf_features = ntvf.fit_transform(negative_reviews)

# show the dimensions of both feature matrices
print(ptvf_features.shape, ntvf_features.shape)


In [None]:
# Fit an NMF topic model with `total_topics` components on the
# positive-review TF-IDF features.
# Alternative settings left in by the author:
#   NMF(n_components=total_topics, solver="cd", max_iter=500, random_state=42, alpha=.1, l1_ratio=.85)
# NOTE(review): the `alpha` argument and `get_feature_names()` are deprecated
# in scikit-learn 1.0 and removed in 1.2 -- confirm the environment pins an
# older release before re-running.
nmf_settings = {
    'n_components': total_topics,
    'init': 'nndsvd',
    'random_state': 1,
    'alpha': .1,
    'l1_ratio': .5,
}
pos_nmf = NMF(**nmf_settings)
pos_nmf.fit(ptvf_features)

# Keep the vectorizer's feature names (as a numpy array, so they can be
# indexed with index arrays) and the fitted component weights.
pos_weights = pos_nmf.components_
pos_feature_names = np.array(ptvf.get_feature_names())

In [None]:
# Print the 15 highest-weighted features of every topic.
pos_feature_names = np.array(ptvf.get_feature_names())

# Negating the weights makes the ascending argsort rank features from the
# highest weight to the lowest; the slice keeps the first 15 per row.
feature_idxs = (-pos_weights).argsort(axis=-1)[:, :15]

# One array of feature names per topic, in ranked order.
topics = list(pos_feature_names[feature_idxs])

for topic_number, topic_terms in enumerate(topics, start=1):
    print(' Topic #{}:'.format(topic_number))
    print(', '.join(topic_terms))
    print()

In [None]:
# Prepare the pyLDAvis visualization for the positive-review model from the
# fitted model, its TF-IDF matrix and the vectorizer; `mds='mmds'` selects the
# metric-MDS option for laying out the topics.
pos_vis = pyLDAvis.sklearn.prepare(pos_nmf, ptvf_features, ptvf, mds='mmds')
# Last expression of the cell, so the notebook displays it.
pos_vis
