In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
# nltk.download('wordnet')
import scipy as sp
import math
import numpy as np
import csv
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

%matplotlib inline

In [None]:
netflix_df = pd.read_pickle("Netflix_Data")
netflix_df.shape

In [None]:
netflix_df.head()

In [None]:
data_pros = netflix_df['pros'].values
data_pros[0:4]

In [163]:
def root_of_word(docs, root_word_method='lemma'):
    
    porter_stemmer = nltk.stem.PorterStemmer()
    snowball_stemmer = nltk.stem.SnowballStemmer('english')
    lemma = nltk.wordnet.WordNetLemmatizer()
    
    tokens = nltk.word_tokenize(docs)
    
    if root_word_method == 'lemma':
        doc = ' '.join([lemma.lemmatize(w) for w in tokens])
    elif root_word_method == 'stemm':
        doc = ' '.join([porter_stemmer.stem(w) for w in tokens])
    elif root_word_method == 'snowball':
        doc = ' '.join([snowball_stemmer.stem(w) for w in tokens])
        
    return doc

In [164]:
from functools import partial

In [165]:
stemming = partial(root_of_word, root_word_method='stemm')
snowball = partial(root_of_word, root_word_method='snowball')

In [166]:
def jsd(p, q, base=np.e): # JS distance between probability vectors, used to compute compH
    '''
        Implementation of pairwise `jsd` based on  
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
    '''
    p = np.asarray(p)
    q = np.asarray(q)
    m = 1./2*(p + q)
    return sp.stats.entropy(p, m, base=base) / 2. +  sp.stats.entropy(q, m, base=base) / 2.

In [167]:
def conth(prob_matrix_df): # function to measure content heterogeneity given a topic (prob) matrix
    N = prob_matrix_df.shape[0]
    probMatrix = prob_matrix_df.values
    conth = 1/((sum(map(sum, np.square(probMatrix))))/N)
    return conth

Needs to have an additional argument. Or be completely redisigned

In [168]:
def comph(probMatrix_df, arr_or_df='df'): 
    #Transform probMatrix_df into 2D array
    
    if arr_or_df == 'df':
        probMatrix = probMatrix_df
    else:
        probMatrix = probMatrix_df.values

    x = 0
    y = 0
    
    df = pd.DataFrame()
    for x in range(0, len(probMatrix)): 
        jsd_list = []
        for y in range(0, len(probMatrix)): 
            jsd_list.append(jsd(probMatrix[x], probMatrix[y]))
            y = y+1
        df[str(x)] = jsd_list


    #Get df lower diagonal
    mask = np.ones(df.shape, dtype='bool')
    mask[np.triu_indices(len(df))] = False
    df_lower_diagonal = df[(df>-1) & mask]
    
    distance_list = []
    k = 0 
    for k in range(0, len(df)): 
    #Transform each column of df_lower_diagonal into list
        column_list = df_lower_diagonal[str(k)].values.tolist()
        #Drop nan values from column_list - to retain only actual values from lower diagonal 
        column_lower_diagonal_list = [l for l in column_list if (math.isnan(l) == False)]
        for d in column_lower_diagonal_list: 
            distance_list.append(d)
        k = k + 1
    comph = sum(distance_list) / float(len(distance_list))
    return comph

In [200]:
def get_pros_clean(item):
    if item != "":
        item = item.lower().replace("netflix", " ")
        item = item.replace("show less", "")
        item = item.replace("show more", "")
        item = item.replace("\n", "")
        item_modified =  ''.join([i for i in item if not i.isdigit()])
    return item_modified

In [201]:
import concurrent.futures as cf

In [202]:
with cf.ProcessPoolExecutor() as executor:
    data_pros_cleaned = executor.map(get_pros_clean, data_pros)
    data_pros_cleaned = list(data_pros_cleaned)

In [203]:
TotalWords_vectorizer = CountVectorizer(stop_words='english')
TotalWords_tf = TotalWords_vectorizer.fit_transform(data_pros_cleaned)
totWords = len(TotalWords_vectorizer.get_feature_names())
totWords

2492

In [204]:
with cf.ProcessPoolExecutor() as executor:
    snowball_pros_cleaned = executor.map(snowball, data_pros_cleaned)
    snowball_pros_cleaned = list(snowball_pros_cleaned)

In [None]:
tf_vectorizer = CountVectorizer(max_df = 0.90, min_df=0.01, stop_words='english')

tf = tf_vectorizer.fit_transform(snowball_pros_cleaned)

tf_feature_names = tf_vectorizer.get_feature_names()

In [211]:
percVoc = len(tf_feature_names) / totWords * 100
percVoc

13.603531300160512

In [143]:
%%time


i = 0    
output=np.zeros((60,3))

for topics in range(2, 300, 5): 
    
    lda = LatentDirichletAllocation(n_components=topics, 
                                    max_iter=200, 
                                    evaluate_every=2,
                                    random_state=1234,
                                    n_jobs=-1).
    lda_fit = lda.fit(tf)
    #output normalized matrix with distributions of topics over words
    #normalized
    topicsOverWords = lda_fit.components_ / lda_fit.components_.sum(axis=1)[:, np.newaxis]
    topicsDissim_avg = comph(topicsOverWords)

    #store results per firm   
    output[i,0] = topics
    output[i,1] = topicsDissim_avg 
    output[i,2] = percVoc
  
    i += 1

CPU times: user 4min 14s, sys: 28.3 s, total: 4min 43s
Wall time: 6min 27s


In [212]:
filename_save = ("TopicInterpretation_Netflix_Pros_OptimalTopics_Coherence.csv")

out_df = pd.DataFrame(output, columns=['topics', 'coherence', 'voc%'])
out_df.to_csv(filename_save, index=False)
out_df.head()

Unnamed: 0,topics,coherence,voc%
0,2.0,0.332338,17.955508
1,7.0,0.37894,17.955508
2,12.0,0.399296,17.955508
3,17.0,0.420066,17.955508
4,22.0,0.428053,17.955508


In [None]:
topics_coherence_df = pd.read_csv(filename_save)

In [215]:
optimal_topics = int(topics_coherence_df.loc[topics_coherence_df['coherence'].idxmax(), 'topics'])
optimal_topics

192

In [216]:
lda = LatentDirichletAllocation(n_components=int(optimal_topics),
                                max_iter=200, 
                                learning_method='batch', 
                                learning_offset=10.,
                                evaluate_every=2,
                                random_state=1234,
                                n_jobs=-1)

In [217]:
#generate matrix summarizing distribution of docs (reviews) over topics
probMatrix = lda.fit_transform(tf)

In [218]:
probMatrix

array([[1.73611111e-03, 1.73611111e-03, 1.73611111e-03, ...,
        1.73611111e-03, 1.73611111e-03, 1.73611111e-03],
       [6.20039683e-05, 6.20039683e-05, 6.20039683e-05, ...,
        6.20039683e-05, 6.20039683e-05, 6.20039683e-05],
       [1.04166667e-03, 1.04166667e-03, 1.04166667e-03, ...,
        1.04166667e-03, 1.04166667e-03, 1.04166667e-03],
       ...,
       [1.30208333e-03, 1.30208333e-03, 1.30208333e-03, ...,
        1.30208333e-03, 1.30208333e-03, 1.30208333e-03],
       [4.73484848e-04, 4.73484848e-04, 4.73484848e-04, ...,
        4.73484848e-04, 4.73484848e-04, 4.73484848e-04],
       [5.78703704e-04, 5.78703704e-04, 5.78703704e-04, ...,
        2.14779009e-01, 5.78703704e-04, 5.78703704e-04]])

In [219]:
docs_topics_df = pd.DataFrame(data = probMatrix)
docs_topics_df.head()

In [223]:
%%time

comp_H = comph(docs_topics_df, arr_or_df='arr')
comp_H

0.5969588015298183

In [224]:
%%time

conT_H = conth(docs_topics_df)
conT_H

CPU times: user 23.7 ms, sys: 477 µs, total: 24.1 ms
Wall time: 24.5 ms


2.269603512937508