In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
# nltk.download('wordnet')
import scipy as sp
import math
import numpy as np
import csv
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

%matplotlib inline

In [2]:
# Load the pickled review table; the relative path is resolved against the
# current working directory.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
netflix_df = pd.read_pickle("Netflix_Data")
# Show (rows, columns) as a quick sanity check on the load.
netflix_df.shape

(693, 39)

In [3]:
# Preview the first rows to see which columns are available.
netflix_df.head()

Unnamed: 0,reviewID,employerID,userID,gender,birthYear,highestEducation,metroID,metroName,stateID,stateName,countryID,jobTitleID,JobTitle,GOC,GOCconfidence,MGOC,MGOCconfidence,reviewDateTime,isCurrentJobFlag,jobEndingYear,OverallRating,CareerOpps,CompensationBenefits,SeniorLeadership,Worklife,CultureValues,RecommendFriend,BusinessOutlook,CEO,employerName,stockTicker,employerTypeCode,numberEmployees,annualRevenue,industry,sector,pros,cons,feedback
5564,4151950,11891,24353329,FEMALE,1984.0,BACHELORS,0,,0,,1,0,,,,,,2014-04-30 23:52:26.027,1,,4.0,3.0,5.0,3.0,2.0,3.0,YES,Same,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,You will be working with the most talented ppl...,Little bit politics in some teams.,
5841,1863,11891,-1,,,,761,San Jose,2280,CA,1,35739,"Director, Product Management",product manager,0.913,product manager,0.913,2008-04-23 23:42:17.157,1,,5.0,4.0,4.5,5.0,4.5,,YES,,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,Freedom and responsibility. You're treated lik...,"Netflix is not for everyone. You don't get ""di...",I have none. Senior management is fantastic. s...
6452,4991,11891,2076,,,,761,San Jose,2280,CA,1,13321,Marketing Manager,marketing manager,1.0,marketing manager,1.0,2008-06-11 00:03:28.907,1,,5.0,5.0,5.0,5.0,4.5,,YES,,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,Great colleagues -- incredible really,Domestic not global business -- wish we did eu...,"Focus on the customer, not on Apple"
14574,53799,11891,68043,,,,700,Portland,3163,OR,1,64668,Support Staff,support staff,1.0,retail representative,1.0,2008-08-07 23:30:14.267,0,2008.0,2.0,1.0,4.5,4.0,5.0,,NO,,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,The upper management of Netflix really does se...,"Specific to the Hillsboro location, the middle...","To the senior-most management in Los Gatos, I ..."
14584,53937,11891,68207,,,,0,,0,,1,36451,Does IT Matter?,,0.0,,0.0,2008-08-08 09:12:42.493,0,2008.0,2.0,2.0,2.5,3.5,1.0,,NO,,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,"The people there are fantastic, the service is...",It's frustrating to work for direct management...,"Stop being so secretive, just be upfront and h..."


In [4]:
# Replace every missing value with an empty string so the review text columns
# hold str values only.
# NOTE(review): this applies to all columns (numeric NaN become '' as well) and
# rebinds netflix_df, so the frame as loaded is no longer available afterwards.
netflix_df = netflix_df.fillna('')

In [5]:
# Extract the "pros" review text as a plain Python list (one entry per review).
data_pros = netflix_df['pros'].tolist()
# Peek at the first review.
data_pros[0]

'You will be working with the most talented ppl around.'

In [6]:
porter_stemmer = nltk.stem.PorterStemmer()


class PStemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose tokens are reduced with the Porter stemmer."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so that each token it emits is stemmed."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            # Stemming is applied to the tokens produced by the parent analyzer.
            return [porter_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens

In [7]:
snowball_stemmer = nltk.stem.SnowballStemmer('english')


class SBStemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose tokens are reduced with the English Snowball stemmer."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so that each token it emits is stemmed."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            # Stemming is applied to the tokens produced by the parent analyzer.
            return [snowball_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens

In [8]:
lemma = nltk.wordnet.WordNetLemmatizer()


class LemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose tokens are lemmatized with WordNet."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so that each token it emits is lemmatized."""
        base_analyzer = super().build_analyzer()

        def lemmatized_tokens(doc):
            # Lemmatization is applied to the tokens produced by the parent analyzer.
            return [lemma.lemmatize(token) for token in base_analyzer(doc)]

        return lemmatized_tokens

In [9]:
def read_path(path):
    """Return the names of the entries found in the directory ``path``."""
    from os import listdir
    return listdir(path)

In [10]:
def jsd(p, q, base=np.e):
    """Jensen-Shannon divergence between two probability vectors.

    Follows https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence :
    the result is the average of the two Kullback-Leibler divergences of ``p``
    and ``q`` from their midpoint. Used to compute compH.

    Parameters
    ----------
    p, q : array-like
        The two vectors to compare (same length).
    base : float, optional
        Logarithm base forwarded to ``scipy.stats.entropy`` (natural log by default).
    """
    left = np.asarray(p)
    right = np.asarray(q)
    # Midpoint of the two vectors.
    mixture = (left + right) * 0.5
    kl_left = sp.stats.entropy(left, mixture, base=base)
    kl_right = sp.stats.entropy(right, mixture, base=base)
    return kl_left / 2. + kl_right / 2.

In [11]:
def conth(prob_matrix_df):
    """Content heterogeneity of a topic (probability) matrix.

    Computed as the inverse of the sum of all squared entries divided by the
    number of rows.

    Parameters
    ----------
    prob_matrix_df : pandas.DataFrame
        Matrix whose ``.values`` are the probabilities.
    """
    n_rows = prob_matrix_df.shape[0]
    squared = np.square(prob_matrix_df.values)
    # Add the squared entries up row by row, then across the rows.
    total_squared = sum(sum(row) for row in squared)
    return 1 / (total_squared / n_rows)

In [29]:
def comph(probMatrix_df):
    """Average pairwise Jensen-Shannon divergence between the rows of a matrix.

    Each row of ``probMatrix_df`` is treated as one probability vector. The
    divergence of every pair of distinct rows is computed with ``jsd`` and the
    mean over those pairs is returned. NaN divergences are left out of the mean.

    Parameters
    ----------
    probMatrix_df : pandas.DataFrame or 2-D array-like
        One probability vector per row.

    Returns
    -------
    float
        Mean of the pairwise divergences.

    Raises
    ------
    ValueError
        If there are fewer than two rows, or no pair yields a non-NaN
        divergence, so that no average can be formed.
    """
    # Convert to a plain array first: with a DataFrame, ``probMatrix_df[x]``
    # would look up a COLUMN label instead of selecting row ``x``.
    probMatrix = np.asarray(probMatrix_df)
    n_rows = len(probMatrix)

    distance_list = []
    for x in range(n_rows):
        # Only pairs with y > x are visited: each pair of rows is counted once
        # and a row is never compared with itself.
        for y in range(x + 1, n_rows):
            distance = jsd(probMatrix[x], probMatrix[y])
            if not math.isnan(distance):
                distance_list.append(distance)

    if not distance_list:
        raise ValueError("comph needs at least two rows with a defined pairwise divergence")
    return sum(distance_list) / float(len(distance_list))

In [30]:
def get_pros_clean(item):
    """Normalise one review string for vectorisation.

    The text is lower-cased, the word "netflix" is replaced by a space, the
    "show less" / "show more" markers and newlines are removed, and all digit
    characters are dropped.

    Parameters
    ----------
    item : str
        Raw review text; may be empty.

    Returns
    -------
    str
        The cleaned text. An empty input yields an empty string (previously
        this case raised UnboundLocalError).
    """
    if item == "":
        return ""
    text = item.lower().replace("netflix", " ")
    # Same removal order as before: UI markers first, then newlines.
    for noise in ("show less", "show more", "\n"):
        text = text.replace(noise, "")
    return ''.join(ch for ch in text if not ch.isdigit())

In [31]:
import concurrent.futures as cf

In [32]:
# Clean the reviews serially. A ProcessPoolExecutor needs the mapped function
# to be importable by the worker processes, which is not guaranteed for a
# function defined in a notebook cell, and this per-string cleaning is too
# cheap to benefit from process-level parallelism anyway.
data_pros_cleaned = list(map(get_pros_clean, data_pros))

In [33]:
# Materialise the cleaned reviews as a list so they can be iterated more than
# once (each vectorizer below makes its own pass over them).
data_pros_cleaned = list(data_pros_cleaned)

In [34]:
# Vectorizer with no document-frequency cut-offs: used only to count the full
# stemmed vocabulary (English stop words excluded).
TotalWords_vectorizer = SBStemmedCountVectorizer(analyzer="word", stop_words='english')

In [35]:
# Learn the full vocabulary and build the corresponding count matrix.
TotalWords_tf = TotalWords_vectorizer.fit_transform(data_pros_cleaned)

In [36]:
# Size of the full vocabulary. ``vocabulary_`` (term -> column index) is read
# directly because ``get_feature_names()`` is not available in recent
# scikit-learn releases.
totWords = len(TotalWords_vectorizer.vocabulary_)
totWords

1801

In [37]:
# Vectorizer used for topic modelling: stemmed tokens, English stop words
# removed, and document-frequency cut-offs (max_df / min_df given as
# proportions of documents) to trim very common and very rare terms.
tf_vectorizer = SBStemmedCountVectorizer(max_df = 0.90, 
                                         min_df=0.01, 
                                         analyzer="word", 
                                         stop_words='english')

In [38]:
# Learn the trimmed vocabulary and build the document-term count matrix.
tf = tf_vectorizer.fit_transform(data_pros_cleaned)

In [39]:
# Feature names in column order, as a list. Built from ``vocabulary_``
# (term -> column index) because ``get_feature_names()`` is not available in
# recent scikit-learn releases.
tf_feature_names = sorted(tf_vectorizer.vocabulary_, key=tf_vectorizer.vocabulary_.get)
tf_feature_names[:10]

['abil',
 'abl',
 'account',
 'actual',
 'adult',
 'allow',
 'amaz',
 'appreci',
 'approv',
 'area']

In [40]:
# Percentage of the full vocabulary that survives the max_df / min_df cut-offs.
percVoc = len(tf_feature_names)/float(totWords)*100

In [41]:
# Row counter and result buffer for the topic-count search below.
# 60 rows = number of values in range(2, 300, 5); 3 columns are filled per row.
# NOTE(review): the search loop advances `i` without resetting it, so this cell
# must be re-run before the loop is run again.
i = 0    
output=np.zeros((60,3))

In [42]:
# Display the sparse document-term matrix summary (shape, stored elements).
tf

<693x340 sparse matrix of type '<class 'numpy.int64'>'
	with 7641 stored elements in Compressed Sparse Row format>

In [None]:
%%time

for topics in range(2, 300, 5): 
    
    lda = LatentDirichletAllocation(n_components=topics, 
                                    max_iter=200, 
                                    learning_method='batch', 
                                    learning_offset=10.,
                                    evaluate_every=2,
                                    random_state=1234,
                                    n_jobs=-1)
    lda_fit = lda.fit(tf)
    #output normalized matrix with distributions of topics over words
    #normalized
    topicsOverWords = lda_fit.components_ / lda_fit.components_.sum(axis=1)[:, np.newaxis]
    topicsDissim_avg = comph(topicsOverWords)

    #store results per firm   
    output[i,0] = topics
    output[i,1] = topicsDissim_avg 
    output[i,2] = percVoc
  
    i = i+1

In [201]:
# Base name (without extension) of the CSV that stores the search results.
filename_save = ("TopicInterpretation_Netflix_Pros_OptimalTopics_Coherence")

In [202]:
# Open the results file for writing. newline='' is what the csv module expects
# for files it writes to; without it extra blank rows can appear on platforms
# that translate line endings.
results = open(filename_save + '.csv', 'w', newline='')

In [203]:
# Inspect the search results. Rows that are still all zeros have not been
# filled in by the search loop.
output

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0

In [204]:
# Write the header and one row per evaluated topic count. Using the file object
# as a context manager closes it even if a write fails, so the handle is never
# left open and the data is flushed before the file is read back.
with results:
    writer = csv.writer(results)
    writer.writerow(['topics', 'coherence', 'voc%'])
    writer.writerows(output)

In [205]:
# Read the saved search results back as a DataFrame
# (columns: topics, coherence, voc%).
topics_coherence_df = pd.read_csv(filename_save + ".csv")

In [206]:
# Preview the first rows of the search results.
topics_coherence_df.head()

Unnamed: 0,topics,coherence,voc%
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0


In [207]:
# Select the topic count with the highest value in the "coherence" column
# (the average pairwise topic dissimilarity recorded by the search loop).
optimal_topics = topics_coherence_df.loc[topics_coherence_df.coherence.idxmax(), "topics"]
# A results table that was never filled in contains only zeros and would
# silently select 0 topics, which is not a valid model size - fail here instead.
if optimal_topics < 1:
    raise ValueError("No valid topic count found: run the topic search before selecting optimal_topics.")

In [208]:
# Show the selected number of topics (a float, since it was read from the CSV).
optimal_topics

0.0

In [209]:
# Re-create the topic-modelling vectorizer for the final model, with the same
# settings as the one used during the topic-count search.
tf_vectorizer = SBStemmedCountVectorizer(max_df = 0.90, 
                                         min_df=0.01, 
                                         analyzer="word", 
                                         stop_words='english')

In [210]:
# Vectorize data (learn the vocabulary dictionary and return the document-term matrix).
tf = tf_vectorizer.fit_transform(data_pros_cleaned)
#tf = tf_vectorizer.fit_transform(data_cons_cleaned)
# Extract the feature names in column order, as a list. Built from
# ``vocabulary_`` (term -> column index) because ``get_feature_names()`` is not
# available in recent scikit-learn releases.
tf_feature_names = sorted(tf_vectorizer.vocabulary_, key=tf_vectorizer.vocabulary_.get)

In [211]:
# Final LDA model with the selected number of topics. optimal_topics comes back
# from the CSV as a float, hence the int() cast. The remaining settings match
# the ones used in the topic-count search.
lda = LatentDirichletAllocation(n_components=int(optimal_topics),
                                max_iter=200, 
                                learning_method='batch', 
                                learning_offset=10.,
                                evaluate_every=2,
                                random_state=1234,
                                n_jobs=-1)

In [142]:
# Fit the LDA model on the document-term matrix. Parameter reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
lda_fit = lda.fit(tf)

In [144]:
# Display the sparse document-term matrix summary (shape, stored elements).
tf

<693x340 sparse matrix of type '<class 'numpy.int64'>'
	with 7641 stored elements in Compressed Sparse Row format>

In [145]:
#generate matrix summarizing distribution of docs (reviews) over topics
probMatrix = lda.transform(tf)
docs_topics_df = pd.DataFrame(data = probMatrix, 
                              index=None, 
                              columns=None, 
                              dtype=None, 
                              copy=False)

In [147]:
# Preview the topic distribution of the first reviews.
docs_topics_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
0,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.676768,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101
1,0.000357,0.000357,0.988592,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357
2,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.367356,0.006061,0.006061,0.006061,0.006061,0.006061,0.444765,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061
3,0.000777,0.000777,0.000777,0.185923,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.124921,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.182382,0.000777,0.000777,0.000777,0.000777,0.48424,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777
4,0.001166,0.001166,0.32398,0.001166,0.001166,0.001166,0.199497,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.176313,0.001166,0.138484,0.001166,0.08446,0.001166,0.001166,0.001166,0.001166,0.001166,0.045797,0.001166,0.001166


In [148]:
def topicModel(corpus, topics):
    """Fit an LDA topic model on ``corpus`` and return its document-topic matrix.

    The corpus is vectorised with ``SBStemmedCountVectorizer`` (English stop
    words, ``max_df=0.90``, ``min_df=0.01``); an LDA model with ``topics``
    components is then fitted on the resulting counts using batch learning and
    a fixed random seed.

    Parameters
    ----------
    corpus : iterable of str
        The documents to model.
    topics : int
        Number of LDA components.

    Returns
    -------
    pandas.DataFrame
        Output of ``lda.transform`` on the same documents, with default row
        and column labels.
    """
    vectorizer = SBStemmedCountVectorizer(max_df=0.90,
                                          min_df=0.01,
                                          analyzer="word",
                                          stop_words='english')
    # Learn the vocabulary dictionary and build the document-term matrix.
    doc_term_counts = vectorizer.fit_transform(corpus)

    # Parameter reference:
    # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
    model = LatentDirichletAllocation(n_components=topics,
                                      max_iter=200,
                                      learning_method='batch',
                                      learning_offset=10.,
                                      evaluate_every=2,
                                      random_state=1234,
                                      n_jobs=-1)

    # fit() returns the fitted estimator, so the distribution of topics per
    # document can be obtained in the same expression.
    doc_topic = model.fit(doc_term_counts).transform(doc_term_counts)

    return pd.DataFrame(doc_topic, copy=False)

In [168]:
# Document-topic matrix for the "pros" reviews at the selected topic count
# (cast to int because optimal_topics is stored as a float).
docs_over_topics = topicModel(data_pros_cleaned, int(optimal_topics))

In [169]:
# Preview the topic distribution of the first reviews.
docs_over_topics.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
0,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.676768,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101
1,0.000357,0.000357,0.988592,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357,0.000357
2,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.367356,0.006061,0.006061,0.006061,0.006061,0.006061,0.444765,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061,0.006061
3,0.000777,0.000777,0.000777,0.185923,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.124921,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.182382,0.000777,0.000777,0.000777,0.000777,0.48424,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777,0.000777
4,0.001166,0.001166,0.32398,0.001166,0.001166,0.001166,0.199497,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.001166,0.176313,0.001166,0.138484,0.001166,0.08446,0.001166,0.001166,0.001166,0.001166,0.001166,0.045797,0.001166,0.001166


In [170]:
# compH of the document-topic matrix (see comph). The displayed name must match
# the assigned one: the previous `compH` was never defined.
comp_H = comph(docs_over_topics)
comp_H

In [172]:
%%time

# Content heterogeneity of the document-topic matrix (see conth), timed.
conT_H = conth(docs_over_topics)
conT_H

CPU times: user 5.66 ms, sys: 824 µs, total: 6.48 ms
Wall time: 5.91 ms
