In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, f1_score, hamming_loss, mean_absolute_error
from sklearn.metrics import multilabel_confusion_matrix, precision_score, recall_score, accuracy_score
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import itertools
import en_core_web_sm
import string
from nltk.tag import pos_tag
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
from gensim.models import Word2Vec
import random
from scipy import stats

### word embedding

In [None]:
# Fetch the NLTK 'punkt' data package (the log below shows it is a no-op when already present).
# Presumably needed by sent_tokenize / word_tokenize, which are used when building the corpus — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guandajiang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Shared English Snowball stemmer; read as a global by lemmatize_stemming().
stemmer = SnowballStemmer('english')
# Fetch the NLTK 'wordnet' corpus; presumably required by WordNetLemmatizer — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/guandajiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a fresh
    ``WordNetLemmatizer`` and the resulting lemma is then passed through the
    module-level Snowball ``stemmer``.

    Parameters
    ----------
    text : str
        One token.

    Returns
    -------
    The value returned by ``stemmer.stem`` for the lemma.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and return the stems of its non-stopword tokens.

    Tokens come from ``gensim.utils.simple_preprocess``; any token found in
    gensim's ``STOPWORDS`` is skipped, and every remaining token is passed
    through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw text (a word, phrase or document).

    Returns
    -------
    list
        Stemmed tokens in their original order; empty when nothing survives.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords
    ]

In [None]:
# Load the review table. `y` is its 'overall' column; `X` is the positional slice of
# columns 1-6, which the evaluation cells later index by the six aspect names in `category`.
# NOTE(review): this path uses 'Data/' while the lexicon files are opened from 'data/';
# on a case-sensitive filesystem those are different directories — confirm the real name.
df_1200 = pd.read_csv('Data/1200.csv')
y = df_1200['overall']
X = df_1200.iloc[:,1:7]

In [None]:
# Build the Word2Vec training corpus: one list of lower-cased stems per sentence.
data = []
for review in df_1200['reviewText']:
    # newlines inside a review are treated as plain spaces before sentence splitting
    flattened = review.replace("\n", " ")
    for sentence in sent_tokenize(flattened):
        data.append(
            [lemmatize_stemming(word).lower() for word in word_tokenize(sentence)]
        )

In [None]:
# Flatten the per-sentence token lists into one list of token occurrences.
words = [token for sentence_tokens in data for token in sentence_tokens]

In [None]:
# Number of token occurrences (repeats included) collected from all tokenised sentences.
len(words)

367781

In [None]:
# Train a Word2Vec embedding on the tokenised sentences in `data`
# (100-dimensional vectors, context window of 5, min_count = 1).
# NOTE(review): `size` is presumably the pre-4.0 gensim keyword (newer releases call it
# `vector_size`) — confirm against the installed gensim version.
# NOTE(review): no seed is passed, so the learned vectors may differ between runs.
model1 = gensim.models.Word2Vec(data,min_count = 1,size = 100, window = 5)

In [None]:
# Stemmed names of the six product aspects (they must match the stems produced by
# lemmatize_stemming, hence 'qualiti' and 'materi').
category = ['color','size', 'qualiti','comfort','price','materi']

# similarity[aspect] holds one score per entry of `words`, in the same order, so it can be
# indexed by `words` afterwards.
# `words` contains every token occurrence, so each distinct token is scored once per
# aspect and the result is reused for its repeats.
similarity = defaultdict(list)
distinct_words = set(words)
for aspect in category:
    score_by_word = {
        word: model1.wv.similarity(aspect, word) for word in distinct_words
    }
    similarity[aspect] = [score_by_word[word] for word in words]

In [None]:
# One row per token occurrence (labelled by the token), one column per aspect.
df_sim = pd.DataFrame(similarity, index=words)

In [None]:
# Show the similarity table: rows are token occurrences from `words` (repeats included),
# columns are the aspects in `category`.
df_sim

Unnamed: 0,color,size,qualiti,comfort,price,materi
good,0.130726,-0.021617,0.601772,0.666703,0.558682,0.599255
price,0.417415,0.114226,0.829234,0.381946,1.000000,0.568117
.,0.142069,0.141743,0.233047,0.223263,0.136451,0.306145
comfort,0.335236,0.063729,0.543135,1.000000,0.381946,0.692009
.,0.142069,0.141743,0.233047,0.223263,0.136451,0.306145
...,...,...,...,...,...,...
all,0.178531,-0.092414,0.237093,0.220900,0.381846,0.236102
around,-0.126808,-0.010855,-0.060926,0.023871,-0.026496,0.168089
great,0.238396,-0.080696,0.526650,0.633136,0.482651,0.482689
hoodi,0.415866,0.244018,0.455760,0.229640,0.459177,0.405164


In [None]:
# For every aspect (in `category` order) keep the distinct tokens whose similarity to the
# aspect word exceeds 0.6.
list_words = [
    np.unique(df_sim.index[(df_sim[aspect] > 0.6).values])
    for aspect in category
]

In [None]:
# Unpack the per-aspect vocabularies; positions follow the order of `category`.
(color_words,
 size_words,
 quality_words,
 comfort_words,
 price_words,
 material_words) = list_words[:6]

In [None]:
# The following sentiment word lists were collected by Minqing Hu and Bing Liu; taken from
# https://github.com/shekhargulati/sentiment-analysis-python/blob/master/opinion-lexicon-English/positive-words.txt
# Each file is read whole and split on newlines. The `with` blocks close the handles
# as soon as the content has been read.
with open("data/positive_words.txt", "r") as f:
    pos = f.read().split('\n')
with open("data/negative_words.txt", "r") as f:
    neg = f.read().split('\n')

In [None]:
# The following code was adapted from a tutorial by
#     Intellica.AI, "Aspect-based Sentiment Analysis - Everything You Wanted to Know"
#     https://medium.com/@Intellica.AI/aspect-based-sentiment-analysis-everything-you-wanted-to-know-1be41572e238

# spaCy pipeline shared by all feature_sentiment() calls; loaded lazily on first use.
_SPACY_PIPELINE = None


def _get_spacy_pipeline():
    """Return the shared ``en_core_web_sm`` pipeline, loading it only on the first call."""
    global _SPACY_PIPELINE
    if _SPACY_PIPELINE is None:
        _SPACY_PIPELINE = en_core_web_sm.load()
    return _SPACY_PIPELINE


def feature_sentiment(sentence, pos, neg):
    '''
    Extract the features (targets) mentioned in a text and the polarity expressed
    about each of them, using the dependency parse and an opinion lexicon.

    input:
        sentence: text to analyse (a sentence or a whole review)
        pos:      iterable of positive opinion words
        neg:      iterable of negative opinion words
    function: for every token found in the lexicon, start from +1 (positive) or
              -1 (negative), flip the sign for each negation attached to the
              token or to its head, and assign the result to the related
              target(s): the head of an adjectival modifier, the direct object
              of an opinion verb (plus a term conjoined with "and"), and the
              noun children of the token's head (with compound modifiers
              prepended)
    output: dict mapping target text -> sentiment (+1 / -1); empty when no
            opinion word is linked to a target
    '''
    sent_dict = dict()
    # Loading the model is expensive, so it is done once and reused across calls.
    doc = _get_spacy_pipeline()(sentence)
    # Sets give constant-time membership tests; a word present in both lists
    # counts as positive.
    positive_words = set(pos)
    opinion_words = positive_words | set(neg)

    for token in doc:
        # only opinion words carry a sentiment
        if token.text not in opinion_words:
            continue
        sentiment = 1 if token.text in positive_words else -1

        # if target is an adverb modifier (i.e. pretty, highly, etc.)
        # but happens to be an opinion word, ignore and pass
        if token.dep_ == "advmod":
            continue
        # adjectival modifier: the sentiment belongs to the word it modifies
        if token.dep_ == "amod":
            sent_dict[token.head.text] = sentiment
            continue

        # for opinion words that are adjectives, adverbs, verbs...
        # negation attached directly to the opinion word flips the sign
        for child in token.children:
            if child.dep_ == "neg":
                sentiment *= -1

        for child in token.children:
            # if verb, check if there's a direct object
            if token.pos_ == "VERB" and child.dep_ == "dobj":
                sent_dict[child.text] = sentiment
                # check for conjugates (a AND b), then add both to dictionary
                subchildren = []
                conj = 0
                for subchild in child.children:
                    if subchild.text == "and":
                        conj = 1
                    if (conj == 1) and (subchild.text != "and"):
                        subchildren.append(subchild.text)
                        conj = 0
                for subchild in subchildren:
                    sent_dict[subchild] = sentiment

        # Negation attached to the head also flips the sign. A root token is its
        # own head, so its negation children were already counted above; counting
        # them again would flip the sign back.
        if token.head.i != token.i:
            for child in token.head.children:
                if child.dep_ == "neg":
                    sentiment *= -1

        # nouns governed by the same head become targets unless already recorded
        for child in token.head.children:
            if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                noun = child.text
                # Check for compound nouns
                for subchild in child.children:
                    if subchild.dep_ == "compound":
                        noun = subchild.text + " " + noun
                sent_dict[noun] = sentiment
    return sent_dict

In [None]:
# Lower-case the review text in place before sentiment extraction; feature_sentiment matches
# token.text against the lexicon verbatim, so this presumably exists to make those lookups hit.
# NOTE(review): this overwrites the column the Word2Vec corpus was built from, so re-running
# the earlier corpus cell after this one sees the lower-cased text.
df_1200['reviewText'] = df_1200['reviewText'].str.lower()

In [None]:
# One {target: polarity} dict per review, in row order.
sentiment = [
    feature_sentiment(review_text, pos, neg)
    for review_text in df_1200['reviewText']
]

In [None]:
# Aspect label -> set of stems considered related to that aspect. The order is the order
# in which the aspects are checked for every extracted target.
aspect_vocabulary = [
    ('color', set(color_words)),
    ('size', set(size_words)),
    ('qualiti', set(quality_words)),
    ('comfort', set(comfort_words)),
    ('price', set(price_words)),
    ('materi', set(material_words)),
]

# One dict per review: aspect label -> sentiment of the last extracted target that maps
# to that aspect.
overall_rating = []

for i in range(df_1200.shape[0]):
    sentiment_dict = {}

    for key, score in sentiment[i].items():
        # preprocess() returns a list of stems: it is empty for stopwords and has several
        # entries for compound targets such as "cotton fabric". A target belongs to an
        # aspect when any of its stems is in that aspect's vocabulary.
        roots = preprocess(key)
        for aspect, vocabulary in aspect_vocabulary:
            if any(root in vocabulary for root in roots):
                sentiment_dict[aspect] = score

    # Placeholder key on every row, so the 'NA' column always exists when it is dropped
    # while building rate_by_machine.
    sentiment_dict['NA'] = 0

    overall_rating.append(sentiment_dict)

In [None]:
# Tabulate the per-review aspect sentiments: remove the 'NA' placeholder column and
# score every aspect that was not mentioned in a review as 0.
rate_by_machine = (
    pd.DataFrame(overall_rating)
    .drop(columns = 'NA')
    .fillna(0)
)
rate_by_machine

Unnamed: 0,color,comfort,materi,qualiti,price,size
0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1186,0.0,0.0,0.0,1.0,1.0,0.0
1187,0.0,0.0,0.0,1.0,1.0,0.0
1188,0.0,0.0,0.0,0.0,0.0,0.0
1189,0.0,0.0,0.0,0.0,0.0,0.0


### Evaluation

In [None]:
def evaluate(y_true, y_pred, metric, average):
    """Score the predictions aspect by aspect with one metric.

    Parameters
    ----------
    y_true, y_pred : mapping-like (e.g. DataFrame)
        Indexed by each aspect name in the module-level ``category`` list to
        obtain the true and predicted labels for that aspect.
    metric : callable
        Called as ``metric(y_true[aspect], y_pred[aspect], average=average)``.
    average : str
        Forwarded to ``metric`` as its ``average`` keyword.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per aspect, in ``category`` order.
    """
    scores = {
        aspect: [metric(y_true[aspect], y_pred[aspect], average = average)]
        for aspect in category
    }
    return pd.DataFrame(scores)

In [None]:
# Random baseline: for every aspect, draw one label per review at random, with each label's
# probability equal to its observed relative frequency in the hand-labelled column.
random.seed(42)  # fixed seed so the baseline scores are reproducible across runs

random_dict = {}
for i in category:
    # value_counts() is ordered by frequency, not by label, so labels and weights are
    # taken from the same Series to keep each weight attached to its own label. This
    # also works when a column has fewer than three distinct labels.
    weight = df_1200[i].value_counts(normalize = True)
    random_dict[i] = random.choices(weight.index.tolist(),
                                    weights = weight.tolist(),
                                    k = df_1200.shape[0])

df_random = pd.DataFrame(random_dict)

**Precision**

In [None]:
# Weighted precision per aspect: model predictions vs. the random baseline.
prec_prediction = evaluate(X, rate_by_machine, precision_score, 'weighted')
prec_baseline = evaluate(X, df_random, precision_score, 'weighted')

prec = pd.concat([prec_prediction, prec_baseline]).T
prec.columns = ['Prediction','Random']
prec['difference'] = prec['Prediction'] - prec['Random']
prec

Unnamed: 0,Prediction,Random,difference
color,0.89108,0.717179,0.173901
size,0.543663,0.362626,0.181037
qualiti,0.752166,0.63316,0.119007
comfort,0.658023,0.546517,0.111506
price,0.795687,0.668947,0.12674
materi,0.680011,0.585982,0.094029


**Recall**

In [None]:
# Weighted recall per aspect: model predictions vs. the random baseline.
# Both columns are scored with recall_score so that the 'difference' column and the paired
# t-test on this table compare the same metric.
recall = pd.concat([evaluate(X,rate_by_machine,recall_score, 'weighted'),
           evaluate(X,df_random,recall_score, 'weighted')]).T
recall.columns = ['Prediction','Random']
recall['difference'] = recall['Prediction'] - recall['Random']
recall

Unnamed: 0,Prediction,Random,difference
color,0.901763,0.717179,0.184585
size,0.473552,0.362626,0.110926
qualiti,0.731318,0.63316,0.098159
comfort,0.68094,0.546517,0.134423
price,0.790092,0.668947,0.121145
materi,0.671704,0.585982,0.085722


**F1 score**

In [None]:
# Weighted F1 per aspect: model predictions vs. the random baseline.
# Both columns are scored with f1_score so that the 'difference' column and the paired
# t-test on this table compare the same metric.
f1 = pd.concat([evaluate(X,rate_by_machine,f1_score, 'weighted'),
           evaluate(X,df_random,f1_score, 'weighted')]).T
f1.columns = ['Prediction','Random']
f1['difference'] = f1['Prediction'] - f1['Random']
f1

Unnamed: 0,Prediction,Random,difference
color,0.890934,0.717179,0.173755
size,0.349154,0.362626,-0.013472
qualiti,0.733985,0.63316,0.100825
comfort,0.643395,0.546517,0.096878
price,0.792785,0.668947,0.123838
materi,0.669538,0.585982,0.083556


**Accuracy Score**

In [None]:
# Per-aspect accuracy of the random baseline and of the model predictions, each stored as a
# one-element list so the dicts convert directly into single-row DataFrames.
accuracy_random = defaultdict(list)
accuracy_machine = defaultdict(list)

for aspect in category:
    truth = df_1200[aspect]
    accuracy_random[aspect] += [accuracy_score(truth, df_random[aspect])]
    accuracy_machine[aspect] += [accuracy_score(truth, rate_by_machine[aspect])]

In [None]:
# Accuracy per aspect: model predictions vs. the random baseline.
accuracy_frames = [pd.DataFrame(scores) for scores in (accuracy_machine, accuracy_random)]
acc = pd.concat(accuracy_frames).T
acc.columns = ['Prediction','Random']
acc['difference'] = acc['Prediction'] - acc['Random']
acc

Unnamed: 0,Prediction,Random,difference
color,0.901763,0.70361,0.198153
size,0.473552,0.36272,0.110831
qualiti,0.731318,0.628883,0.102435
comfort,0.68094,0.539043,0.141898
price,0.790092,0.652393,0.137699
materi,0.671704,0.593619,0.078086


### Significance Test

In [None]:
# using accuracy score as the skill estimate
# Paired t-test (scipy.stats.ttest_rel) over the rows of `acc`: one Prediction/Random pair
# per aspect in `category`.
stats.ttest_rel(acc['Prediction'],acc['Random'])

Ttest_relResult(statistic=7.549701268243345, pvalue=0.0006460959427428329)

In [None]:
# using recall score as the skill estimate
# Paired t-test (scipy.stats.ttest_rel) over the rows of `recall`: one Prediction/Random pair
# per aspect in `category`.
stats.ttest_rel(recall['Prediction'],recall['Random'])

Ttest_relResult(statistic=8.60613949743275, pvalue=0.0003494649511813355)

In [None]:
# using f1 score as the skill estimate
# Paired t-test (scipy.stats.ttest_rel) over the rows of `f1`: one Prediction/Random pair
# per aspect in `category`.
stats.ttest_rel(f1['Prediction'],f1['Random'])

Ttest_relResult(statistic=3.7477640831721537, pvalue=0.013324396849565445)

**The p-values are very small for the accuracy and recall scores and somewhat larger for the F1 score,\
but all of them are below the 0.05 threshold;\
therefore, we reject the null hypothesis that there is no significant difference between the trained model and the random model.**