# Text Analysis

## Goal: Create a model to find similar users based on each user's review text.



-----------

In [4]:
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import statsmodels.formula.api as sm
import pandas as pd
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.decomposition import PCA
import json
import gzip
from pandas import Series
from six.moves.html_parser import HTMLParser
from scipy import spatial
import heapq


%matplotlib inline

## Import data

## Text analysis - Class to get Similarity from Text Reviews

In [18]:
class TextSimilarity:
    def __init__(self, path): #user_rand_sample_1000.csv
        "Assigns data to data_df property"
        data_df = pd.read_csv(path)
        years = []
        months = []
        days = []
        for date in data_df['reviewTime']:
            splitted = date.split(' ')
            years.append(splitted[2])
            months.append(splitted[0])
            day = splitted[1]
            days.append(day[:-1])
        date_df = pd.DataFrame({'year': years,
                               'month': months,
                               'day': days})
        final_dates = pd.to_datetime(date_df)
        data_df['datetimes'] = final_dates
        print "Shape of our dataset: ", data_df.shape
        print "Number of unique users in the dataset: ",len(np.unique(data_df['reviewerID'].values))
        self.data_df = data_df
        
    def groupByUser(self):
        "Creates a new property data_combined that groups the dataframe by user"
        print "Grouping data by user...."
        data_sample = self.data_df[['reviewText','reviewerID','overall']]
        co2 = self._percentage_coroutine(len(data_sample.groupby('reviewerID')))
        co2.next()
        data_combined = data_sample.groupby('reviewerID').apply(self._trace_progress(self._groupByUser,progress=co2))
        h = HTMLParser()
        remove_html = lambda x: h.unescape(x)
        data_combined['reviewText'] = data_combined['reviewText'].map(remove_html)
        self.data_combined = data_combined
        
    def countWords(self):
        "Applies a count vectorizer and stores result to vectorized"
        vectorizer = CountVectorizer(stop_words = 'english',min_df=4, decode_error="replace")
        reviews = self.data_combined['reviewText'].values
        x = vectorizer.fit_transform(reviews)
        self.vectorized = x.toarray()
        self.all_feature_names = vectorizer.get_feature_names() 
        print len(self.all_feature_names), " unique words found in reviews"
        
    def calculateScores(self):
        "Calculates the TF-IDF based on the vectorized data"
        # Get IDF 
        document_frequency = np.sum(self.vectorized,axis=0)
        print "Calculated Document frequency"
        idf_raw = np.divide(float(self.vectorized.shape[0]),document_frequency)
        idf = np.log10(idf_raw)
        print "Calculated IDF"
        tf = 1 + np.log10(self.vectorized)
        tf[tf<0]=0
        self.tf_idf = np.multiply(tf,idf)


    def _groupByUser(self,x):
        return pd.Series(dict(reviewText = "%s" % ''.join(str(x['reviewText'].values))))
    
    def _trace_progress(self, func, progress = None):
        def callf(*args, **kwargs):
            if (progress is not None):
                progress.send(None)

            return func(*args, **kwargs)

        return callf
    
    def _percentage_coroutine(self, to_process, print_on_percent = 0.25):
        print "Starting progress percentage monitor"
        processed = 0
        count = 0
        print_count = to_process*print_on_percent
        while True:
            yield
            processed += 1
            count += 1
            if (count >= print_count):
                count = 0
                pct = (float(processed)/float(to_process))*100
                print "{}% finished".format(pct)
                
    def _getCosineMatrix(self):
        tf_idf = self.tf_idf
        cosine_sim_results = np.zeros((tf_idf.shape[0],tf_idf.shape[0]))
        for user1 in range(tf_idf.shape[0]):
            for user2 in range(tf_idf.shape[0]):
                cosine_sim_results[user1,user2] = spatial.distance.cosine(tf_idf[user1],tf_idf[user2])
            if user1%20==0: 
                print "Completed ",user1, " users"
                #print float(user1)/tf_idf.shape[0], cosine_sim_results[user1,user2]
        
        self.cosine_sim_results = cosine_sim_results
        
    def getCosineMatrix(self):
        "Returns a matrix of cosine similarity that is N-users by N-users"
        tf_idf = self.tf_idf
        magnitude =  np.linalg.norm(tf_idf, axis=1)
        tf_idf_norm = tf_idf/magnitude[:,None]
        self.cosine_sim_results = tf_idf_norm.dot(tf_idf_norm.T)
        return self.cosine_sim_results                            
                
    def getSimilarityMatrix(self):
        "Returns sorted array of most similar users for each user"
        neighbors = np.zeros((self.cosine_sim_results.shape[0],10))
        count=0
        for row in self.cosine_sim_results:
            # get the indices that would sort the row
            # replace values with the index
            # sort the rows
            closest10 = heapq.nsmallest(10, range(len(row)), row.take)
            neighbors[count] = closest10
            count+=1
        self.neighbors = neighbors
        print "Calculated Similarity Matrix"
        return self.neighbors
    
    def fit(self):
        "Perform all calculations"
        self.groupByUser()
        self.countWords()
        self.calculateScores()
        text_cosine_matrix = self.getCosineMatrix()
        self.getSimilarityMatrix()
        
    def searchReviews(self,search_term): 
        index = self.all_feature_names.index(search_term)
        first_user_with_term = np.argmax(self.vectorized[:,index]>0)
        self.printNeighbors(first_user_with_term)
        #print("We haven't calculated that one yet")
            
    def printNeighbors(self,user):
        for each in self.neighbors[user,0:6]:
            print "\nUser: ", each, "Cosine: ", self.cosine_sim_results[user,each]
            print self.data_combined['reviewText'][int(each)]
            print "--------"


In [19]:
textSimilarity = TextSimilarity('user_rand_sample_1000.csv')
textSimilarity.fit()

Shape of our dataset:  (12471, 12)
Number of unique users in the dataset:  1000
Grouping data by user....
Starting progress percentage monitor
25.0% finished
50.0% finished
75.0% finished
100.0% finished
9401  unique words found in reviews
Calculated Document frequency
Calculated IDF
Calculated Similarity Matrix


In [20]:
# Get cosine similarity matrix 
textSimilarity.cosine_sim_results

array([[ 1.        ,  0.20449872,  0.17545035, ...,  0.17896626,
         0.09904113,  0.0926708 ],
       [ 0.20449872,  1.        ,  0.25197203, ...,  0.21635667,
         0.09100937,  0.10615786],
       [ 0.17545035,  0.25197203,  1.        , ...,  0.20284616,
         0.075706  ,  0.08639352],
       ..., 
       [ 0.17896626,  0.21635667,  0.20284616, ...,  1.        ,
         0.10148614,  0.06044012],
       [ 0.09904113,  0.09100937,  0.075706  , ...,  0.10148614,
         1.        ,  0.0481088 ],
       [ 0.0926708 ,  0.10615786,  0.08639352, ...,  0.06044012,
         0.0481088 ,  1.        ]])

In [21]:
# Get similarity matrix (for each user, shows sorted list of most similar users)
textSimilarity.neighbors

array([[ 512.,  309.,  450., ...,  617.,  994.,  991.],
       [ 512.,  788.,  309., ...,  245.,  941.,  991.],
       [ 512.,  788.,  861., ...,  309.,  617.,  245.],
       ..., 
       [ 512.,  309.,  861., ...,  110.,  245.,  450.],
       [ 512.,  450.,   46., ...,  552.,  472.,  406.],
       [ 512.,  763.,  919., ...,  758.,  299.,   46.]])