In [1]:
# import libraries

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# These are all the imports needed for the assignment
%matplotlib inline

# Import nltk package (Natural Language Toolkit)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [2]:
# Download the NLTK English tokenizer and the stopwords of all languages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
files = ['amazon_reviews_us_Electronics_v1_00.tsv', \
         'amazon_reviews_us_Gift_Card_v1_00.tsv', \
         'amazon_reviews_us_Major_Appliances_v1_00.tsv', \
         'amazon_reviews_us_Office_Products_v1_00.tsv', \
         'amazon_reviews_us_Shoes_v1_00.tsv', \
         'amazon_reviews_us_Toys_v1_00.tsv', \
         'amazon_reviews_us_Watches_v1_00.tsv',
         'amazon_reviews_us_Apparel_v1_00.tsv']

In [5]:
columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', \
           'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
elec_df = pd.read_csv(files[-1], names =  columns, sep = '\t').iloc[1:,:]

In [6]:
elec_df.shape

(5881951, 15)

In [7]:
elec_df = elec_df.sample(n = 1_000_000)

In [8]:
one_file = elec_df.copy()

In [9]:
one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
1856531,US,51467496,R2LOY5YE18L97V,B00GY77CMW,368001328,Ripple Junction Attack On Titan Jr. Character Montage Juniors T-Shirt,Apparel,5,0,0,N,Y,This is my niece's favorite anime of all times,This is my niece's favorite anime of all times. She LOVED this shirt. It fit perfect and put a smile on her face. I do NOT regret this purchase at all,2015-03-05
586636,US,18812429,RA4VQR26CTUGR,B00O0MQNRS,516974930,Ninimour- Animal Pyjamas Halloween Costume Onesie-Pikachu,Apparel,5,0,0,N,Y,The ladies love the thundershock they receive from this pikachu,"So cozy. I'm wearing it now, and it's scored me tons of dates already! The ladies love the thundershock they receive from this pikachu!",2015-02-01
1203547,US,48944408,RHE8IAMX3HFRE,B00KCF52JM,725287769,Calvin Klein Little Boys' Ck Recon Plaid Long-Sleeve Button-Front Shirt,Apparel,5,0,0,N,Y,Bellísima esta camisa la tela excelente me,Bellísima esta camisa la tela excelente me encanto,2014-11-02
3898188,US,45124252,RQ36TWXAIFK3X,B007G4WLBY,502039026,Timberland Men's Overdyed Cotton Web Belt,Apparel,4,1,1,N,Y,top notch,good quality. Not for use when working and bending over. It slips and will have to be tightened constantly. great for casual use.,2013-04-03
5763164,US,49227794,R31AQW9QMEMKH4,B000AJJDQS,23488898,Jockey Women's Underwear Plus Size Elance Brief - 3 Pack,Apparel,5,0,0,N,Y,Great Quality.,I am always looking for the perfect cotton panty. I'm very pleased with the quality construction and fit of these. They fit best after a wash and dry cycle. Nice neutral colors too.,2013-08-30


## Data Sampling

In [10]:
def df_sampling(df):
    # Since we know that there are more unverified than verified --> we sample based on that
    
    # Since there are no data values in 'verified_purchase' columns that deviate from 'Y' or 'N' we proceed
    verified_count_df = df[df['verified_purchase'] == 'Y']
    unverified_count_df = df[df['verified_purchase'] == 'N']
    
    print("Number of verified purchases:", len(verified_count_df))
    print("Number of unverified purchases:", len(unverified_count_df))
    
    sample_len = len(unverified_count_df)
    
    verified_sample_df = verified_count_df.sample(n = sample_len)
    unified_df = pd.concat([unverified_count_df, verified_sample_df])
    
    print("Number of verified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'Y']))
    print("Number of unverified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'N']))
    
    return unified_df

In [11]:
balanced_elec = df_sampling(one_file)
display(balanced_elec.head(2))

Number of verified purchases: 899241
Number of unverified purchases: 100759
Number of verified purchases (balanced dataset): 100759
Number of unverified purchases (balanced dataset): 100759


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
1102939,US,21211758,RBZUAVZAIQ640,B00KSKBVUU,314092988,Harley-Davidson Men's Black Label Dark Stitch Short Sleeve T-Shirt Gray 30291526,Apparel,1,1,1,N,N,do not waste your money,"piss poor quality, definitely not a Harley shirt... do not waste your time and money",2015-08-28
1139122,US,40879549,R316M4VFYMRJ3E,B00KMFITK6,216527545,Womens Coffee Keeps Me Going T-Shirt Large,Apparel,1,0,0,N,N,One Star,Shirt was waay too small and returning it turned out to be a hassle.,2014-09-14


In [12]:
print("The Office Products sample will have size:", balanced_elec.shape)

The Office Products sample will have size: (201518, 15)


## Data Cleaning & Type Conversion

In [13]:
#one_file = balanced_elec.copy()

In [14]:
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 899241
Number of unverified purchases (balanced dataset): 100759


In [15]:
# Star rating of the format '2015-04-02' exists! In addition, np.nan exists in the helpful and total votes
one_file['star_rating'].value_counts()

5    556742
4    191752
3    104751
1     74055
2     61567
5      6140
4      1915
1      1264
3      1123
2       691
Name: star_rating, dtype: int64

In [16]:
# Convert the type of relevant columns to int
one_file['star_rating'] = (one_file['star_rating']).apply(lambda star: np.NaN if (len(str(star)) > 3) else star)
one_file['star_rating'] = one_file['star_rating'].apply(lambda star: star if (pd.isna(star)) else int(star))
one_file['helpful_votes'] = one_file['helpful_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(vote))
one_file['total_votes'] = one_file['total_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(int(vote)))

In [17]:
import re

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
def df_cleaning(df, col):
    # Drop rows with na values
    df.dropna(inplace = True)
    
    new_col_name = 'new_' + col
    
    df[new_col_name] = df[col].copy() 
    
    # Remove unwanted formatting characters
    format_strs = dict.fromkeys(['<br /><br />', '&#34', 'br', '&quot', '<br />'], ' ')
    
    for key in format_strs:
        df[new_col_name] = df[new_col_name].apply(lambda review: review.replace(key, format_strs[key]))
    # removing quotes produces smthg like this --> 'The product has great ;sound; --> we must remove punctuation

    
    # Case normalization (lower case)
    df[new_col_name] = df[new_col_name].str.lower()
    
    remove_dict = {"0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "",
                   "(": "", ")":""}
    for key, val in remove_dict.items():
        df[new_col_name] = df[new_col_name].apply(
            lambda x: x.replace(key, val))
        
    # Remove stopwords
    stop_lst = stopwords.words('english')
    #stop_lst += (["can't", "i'm" "i'd", "i've", "i'll", "that's", "there's", "they're"])
    # ****Do we not have to take stopwords out BEFORE removing punctuation? Otherwise words with punct like “cant” remains there
    df[new_col_name] = df[new_col_name].apply(lambda text_body: " ".join([word for word in text_body.split() if word not in (stop_lst)]))
    
    # Removing Unicode Chars (punctuation, URL, @)
    df[new_col_name] = df[new_col_name].apply(lambda rev: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", rev))
    
    # Lemmatization
    word_lemmatizer = WordNetLemmatizer()
    df[new_col_name] = df[new_col_name].apply(lambda txt: " ".join([(word_lemmatizer.lemmatize(word)) for word in txt.split()]))
    
    return df

In [19]:
cleaned2 = df_cleaning(one_file, 'review_body')
cleaned2.get(['review_body', 'new_review_body']).head()

Unnamed: 0,review_body,new_review_body
1856531,This is my niece's favorite anime of all times. She LOVED this shirt. It fit perfect and put a smile on her face. I do NOT regret this purchase at all,niece favorite anime time loved shirt fit perfect put smile face regret purchase
586636,"So cozy. I'm wearing it now, and it's scored me tons of dates already! The ladies love the thundershock they receive from this pikachu!",cozy im wearing now scored ton date already lady love thundershock receive pikachu
1203547,Bellísima esta camisa la tela excelente me encanto,bellsima esta camisa la tela excelente encanto
3898188,good quality. Not for use when working and bending over. It slips and will have to be tightened constantly. great for casual use.,good quality use working bending over slip tightened constantly great casual use
5763164,I am always looking for the perfect cotton panty. I'm very pleased with the quality construction and fit of these. They fit best after a wash and dry cycle. Nice neutral colors too.,always looking perfect cotton panty im pleased quality construction fit these fit best wash dry cycle nice neutral color too


In [20]:
cleaned2 = df_cleaning(one_file, 'review_headline')
cleaned2.get(['review_headline', 'new_review_headline']).head()

Unnamed: 0,review_headline,new_review_headline
1856531,This is my niece's favorite anime of all times,niece favorite anime time
586636,The ladies love the thundershock they receive from this pikachu,lady love thundershock receive pikachu
1203547,Bellísima esta camisa la tela excelente me,bellsima esta camisa la tela excelente
3898188,top notch,top notch
5763164,Great Quality.,great quality


In [21]:
cleaned2 = df_cleaning(one_file, 'product_title')
cleaned2.get(['product_title', 'new_product_title']).head()

Unnamed: 0,product_title,new_product_title
1856531,Ripple Junction Attack On Titan Jr. Character Montage Juniors T-Shirt,ripple junction attack titan jr character montage junior tshirt
586636,Ninimour- Animal Pyjamas Halloween Costume Onesie-Pikachu,ninimour animal pyjama halloween costume onesiepikachu
1203547,Calvin Klein Little Boys' Ck Recon Plaid Long-Sleeve Button-Front Shirt,calvin klein little boy ck recon plaid longsleeve buttonfront shirt
3898188,Timberland Men's Overdyed Cotton Web Belt,timberland men overdyed cotton web belt
5763164,Jockey Women's Underwear Plus Size Elance Brief - 3 Pack,jockey woman underwear plus size elance brief pack


In [22]:
print("Number of verified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 899099
Number of unverified purchases (balanced dataset): 100748


## Vader Sentiment Analysis

In [23]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def get_sentiment_scores(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    return snt

In [24]:
one_file = cleaned2.copy()

In [25]:
# Add 4 new columns to cleaned_elec df: neg_prop, neu_prop, pos_prop, compound_prop
one_file['rev_dict'] = one_file['new_review_body'].apply(get_sentiment_scores)

def get_neg(review_dict):
    return review_dict['neg']

def get_neu(review_dict):
    return review_dict['neu']

def get_pos(review_dict):
    return review_dict['pos']

def get_compound(review_dict):
    return review_dict['compound']

def only_compound(x):
    dct = get_sentiment_scores(x)
    return dct['compound']

In [26]:
#get neg prop
one_file['neg_prop'] = one_file['rev_dict'].apply(get_neg)
#get neu prop
one_file['neu_prop'] = one_file['rev_dict'].apply(get_neu)
#get pos prop
one_file['pos_prop'] = one_file['rev_dict'].apply(get_pos)
#get compound prop
one_file['compound_prop'] = one_file['rev_dict'].apply(get_compound)

In [27]:
one_file.head()
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 899099
Number of unverified purchases (balanced dataset): 100748


## Confusion Matrix

In [28]:
def plot_confusion_matrix(cm, target_names,
                          fname, epoch,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True, target=None):
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools
    plt.style.use('default')

    # # only true if it weren't normalized:
    # accuracy = np.trace(cm) / float(np.sum(cm))
    # misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm[np.isnan(cm)] = 0.0

    fig = plt.figure(figsize=(5, 4))
    ax = plt.axes()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    if target == "rule-based":
        plt.title(title + ' for rule-based PF')
    else:
        plt.title(title + ' for MLPF at epoch ' + str(epoch))

    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlim(-1, len(target_names))
    plt.ylim(-1, len(target_names))
    plt.xlabel('Predicted label')
    # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.tight_layout()
    plt.savefig(fname + '.png')
    plt.savefig(fname + '.pdf')
    #plt.close(fig)

    return fig, ax

## KNN

In [29]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [30]:
one_file.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'new_review_body',
       'new_review_headline', 'new_product_title', 'rev_dict', 'neg_prop',
       'neu_prop', 'pos_prop', 'compound_prop'],
      dtype='object')

In [31]:
one_file['prod_title_comp'] = one_file.get("new_product_title").apply(only_compound)
one_file['rev_title_comp'] = one_file.get("new_review_headline").apply(only_compound)

In [32]:
def helpful_prop(df):  
    vote_series = pd.Series(df['helpful_votes'] / df['total_votes'])
    # !! All nan values in votes_prop should be changed to zero: this means that 0/0 occurred !!
    vote_series = vote_series.fillna(0)
    return vote_series

In [33]:
# Helper function for id'ing the review body sentiments into -1, 0, 1 depending on majority sentiment
def id_for_dictionary(dic):
    if len(dic) == 4:
        ind = list(dic.values()).index(max(list(dic.values())[0:-1])) #remove the compound
    else:
        ind = list(dic.values()).index(max(list(dic.values())))
    
    # If at idx 1, neutral sent
    if ind == 1:
        return 0
    # If at idx 0, negative sent
    elif ind == 0:
        return -1
    else:
        return 1

In [34]:
# Helper function for id'ing the helper vote proportion
def id_for_prop(prop):
    if prop < 0.45:
        return -1
    elif prop > 0.55:
        return 1
    else:
        return 0

In [35]:
one_file['help_prop'] = helpful_prop(one_file)
one_file['rev_bod_id'] = one_file['rev_dict'].apply(id_for_dictionary)
one_file['help_prop_id'] = one_file["help_prop"].apply(id_for_prop)
one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,new_review_body,new_review_headline,new_product_title,rev_dict,neg_prop,neu_prop,pos_prop,compound_prop,prod_title_comp,rev_title_comp,help_prop,rev_bod_id,help_prop_id
1856531,US,51467496,R2LOY5YE18L97V,B00GY77CMW,368001328,Ripple Junction Attack On Titan Jr. Character Montage Juniors T-Shirt,Apparel,5,0,0,N,Y,This is my niece's favorite anime of all times,This is my niece's favorite anime of all times. She LOVED this shirt. It fit perfect and put a smile on her face. I do NOT regret this purchase at all,2015-03-05,niece favorite anime time loved shirt fit perfect put smile face regret purchase,niece favorite anime time,ripple junction attack titan jr character montage junior tshirt,"{'neg': 0.11, 'neu': 0.276, 'pos': 0.614, 'compound': 0.9153}",0.11,0.276,0.614,0.9153,-0.4767,0.4588,0.0,1,-1
586636,US,18812429,RA4VQR26CTUGR,B00O0MQNRS,516974930,Ninimour- Animal Pyjamas Halloween Costume Onesie-Pikachu,Apparel,5,0,0,N,Y,The ladies love the thundershock they receive from this pikachu,"So cozy. I'm wearing it now, and it's scored me tons of dates already! The ladies love the thundershock they receive from this pikachu!",2015-02-01,cozy im wearing now scored ton date already lady love thundershock receive pikachu,lady love thundershock receive pikachu,ninimour animal pyjama halloween costume onesiepikachu,"{'neg': 0.0, 'neu': 0.741, 'pos': 0.259, 'compound': 0.6369}",0.0,0.741,0.259,0.6369,0.0,0.6369,0.0,0,-1
1203547,US,48944408,RHE8IAMX3HFRE,B00KCF52JM,725287769,Calvin Klein Little Boys' Ck Recon Plaid Long-Sleeve Button-Front Shirt,Apparel,5,0,0,N,Y,Bellísima esta camisa la tela excelente me,Bellísima esta camisa la tela excelente me encanto,2014-11-02,bellsima esta camisa la tela excelente encanto,bellsima esta camisa la tela excelente,calvin klein little boy ck recon plaid longsleeve buttonfront shirt,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,-1
3898188,US,45124252,RQ36TWXAIFK3X,B007G4WLBY,502039026,Timberland Men's Overdyed Cotton Web Belt,Apparel,4,1,1,N,Y,top notch,good quality. Not for use when working and bending over. It slips and will have to be tightened constantly. great for casual use.,2013-04-03,good quality use working bending over slip tightened constantly great casual use,top notch,timberland men overdyed cotton web belt,"{'neg': 0.0, 'neu': 0.506, 'pos': 0.494, 'compound': 0.8316}",0.0,0.506,0.494,0.8316,0.0,0.2023,1.0,0,1
5763164,US,49227794,R31AQW9QMEMKH4,B000AJJDQS,23488898,Jockey Women's Underwear Plus Size Elance Brief - 3 Pack,Apparel,5,0,0,N,Y,Great Quality.,I am always looking for the perfect cotton panty. I'm very pleased with the quality construction and fit of these. They fit best after a wash and dry cycle. Nice neutral colors too.,2013-08-30,always looking perfect cotton panty im pleased quality construction fit these fit best wash dry cycle nice neutral color too,great quality,jockey woman underwear plus size elance brief pack,"{'neg': 0.0, 'neu': 0.429, 'pos': 0.571, 'compound': 0.9559}",0.0,0.429,0.571,0.9559,0.0,0.6249,0.0,1,-1


In [36]:
imp_col = one_file[['verified_purchase', 'prod_title_comp', 'star_rating', 'rev_title_comp', 'rev_bod_id', 'help_prop_id']]
imp_col.dtypes

verified_purchase     object
prod_title_comp      float64
star_rating            int64
rev_title_comp       float64
rev_bod_id             int64
help_prop_id           int64
dtype: object

In [37]:
imp_col = imp_col.dropna()

In [38]:
X = imp_col.iloc[:, [1,2,3,4,5]].values #only taking in the categories that will be used as a dataframe
y = imp_col.iloc[:, 0].values

In [39]:
X

array([[-0.4767,  5.    ,  0.4588,  1.    , -1.    ],
       [ 0.    ,  5.    ,  0.6369,  0.    , -1.    ],
       [ 0.    ,  5.    ,  0.    ,  0.    , -1.    ],
       ...,
       [ 0.    ,  4.    , -0.128 ,  0.    , -1.    ],
       [-0.2732,  5.    ,  0.8555,  1.    , -1.    ],
       [ 0.296 ,  5.    ,  0.4939,  0.    ,  1.    ]])

In [40]:
print("Number of verified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 899099
Number of unverified purchases (balanced dataset): 100748


In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [42]:
from sklearn.neighbors import KNeighborsClassifier

#we are using 
#5 neighborhood points are required for classifying a given point -- distance metric is using the minkonowski equation
knn_classifier = KNeighborsClassifier(n_neighbors = 20, metric = 'euclidean', p = 1)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=20, p=1)

In [43]:
y_pred = knn_classifier.predict(X_test)

In [44]:
#We can evaluate our model using the confusion matrix and accuracy score by comparing the predicted and actual test values

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [45]:
print(cm)
print(ac)

[[    13  20324]
 [    26 179607]]
0.8982347352102815


In [46]:
from joblib import dump, load

In [47]:
name = 'knn_apparel_model.joblib'
dump(knn_classifier, name)

['knn_apparel_model.joblib']

In [48]:
knn_classifier = load(name)

## Getting Reviews

In [98]:
# Getting the indices of the reviews that were predicted correctly
ind1 = list((y_test == y_pred) & (y_pred == 'Y'))#.index(True)
# & (y_test == 'N')
indexes = [i for i, x in enumerate(ind1) if x]
indexes

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 52,
 53,
 54,
 55,
 56,
 57,
 59,
 60,
 61,
 62,
 63,
 65,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 75,
 76,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 93,
 94,
 95,
 96,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 183,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 

In [99]:
95596*2

191192

In [100]:
len(indexes)

179607

In [107]:
import random
random_num = random.choice(indexes)
print(random_num)
ind1 = random_num
#1280
X_test[ind1]

6754


array([ 0.6249,  4.    ,  0.4215,  0.    , -1.    ])

In [108]:
test = list(X)
test;

In [109]:
test2 = (test == X_test[ind1]) 
test2

array([[False, False, False, False,  True],
       [False, False, False,  True,  True],
       [False, False, False,  True,  True],
       ...,
       [False,  True, False,  True,  True],
       [False, False, False, False,  True],
       [False, False, False,  True, False]])

def which_switch(ind2):
    x = [ind2.index(i) for i in ind2 if i == True]
    return x
which_switch(ind2)

In [110]:
ind2 = list(np.all(test2, axis = 1))#.index(True)
ind2
n_lst = [i for i, x in enumerate(ind2) if x]
(n_lst)

[61919,
 193208,
 205060,
 229340,
 237416,
 243413,
 364084,
 442123,
 462873,
 467127,
 579238,
 582321,
 610835,
 617528,
 703084,
 745057,
 771478,
 907780]

In [106]:
#23 Y; 88 Y; 
test_1 = one_file.iloc[130587].to_frame().loc[['review_body', 'review_headline', 'product_title', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase']]
test_1

Unnamed: 0,3342041
review_body,"This was intended as a Christmas gift and I was not at all impressed with the condition of the item. The design was somewhat faded and it had such an ordor, I had to wash it before wrapping. Definitely wont be ordering again from this seller."
review_headline,Not happy
product_title,Juniors: Batman - Action Duo Juniors (Slim) T-Shirt Size M
star_rating,1
helpful_votes,0
total_votes,0
verified_purchase,Y


In [114]:
test_2 = one_file.iloc[442123].to_frame().loc[['review_body', 'review_headline', 'product_title', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase']]
test_2

Unnamed: 0,1556954
review_body,I ordered a 34X30. They were a little long and darker then the picture shown? Excellent price too
review_headline,Nice Jeans
product_title,"Calvin Klein Jeans Men's Relaxed Straight Leg Jean, Osaka Blue, 34x30"
star_rating,4
helpful_votes,0
total_votes,0
verified_purchase,Y


## Getting Predictions

In [58]:
print('''Disney Animators' Collection Elsa Doll - 16"''')

Disney Animators' Collection Elsa Doll - 16"


In [59]:
review_body = 'Arrived very quickly, well packaged. Followed the provided instructions for installation and the emailed instructions for reset. Ran the internal print test and font list and the image looks good. Happy so far.'
review_title = 'Great Initial Results'
product_title = 'EPS Replacement Brother TN450 Toner Cartridge, High Yield (2,600 Yield) - Black'
star_rating = 5
helpful_votes = 0
total_votes = 0

In [60]:
test = pd.DataFrame()
test['review_body'] = np.array([review_body])
test['review_title'] = np.array([review_title])
test['product_title'] = np.array([product_title])
test

Unnamed: 0,review_body,review_title,product_title
0,"Arrived very quickly, well packaged. Followed the provided instructions for installation and the emailed instructions for reset. Ran the internal print test and font list and the image looks good. Happy so far.",Great Initial Results,"EPS Replacement Brother TN450 Toner Cartridge, High Yield (2,600 Yield) - Black"


In [61]:
out = df_cleaning(test, 'review_body')
out = df_cleaning(out, 'review_title')
out = df_cleaning(out, 'product_title')
out

Unnamed: 0,review_body,review_title,product_title,new_review_body,new_review_title,new_product_title
0,"Arrived very quickly, well packaged. Followed the provided instructions for installation and the emailed instructions for reset. Ran the internal print test and font list and the image looks good. Happy so far.",Great Initial Results,"EPS Replacement Brother TN450 Toner Cartridge, High Yield (2,600 Yield) - Black",arrived quickly well packaged followed provided instruction installation emailed instruction reset ran internal print test font list image look good happy far,great initial result,eps replacement brother tn toner cartridge high yield yield black


In [62]:
rev_bod_id = id_for_dictionary(analyser.polarity_scores(out['new_review_body'][0]))
prop = 1
help_prop_id = id_for_prop(prop)
prod_title_comp = only_compound(out['new_product_title'][0])
rev_title_comp = only_compound(out['new_review_title'][0])

In [63]:
# The one we had originally, and is producing the wrong order of proportions
rev_input_test = np.array([[prod_title_comp, star_rating, rev_title_comp, rev_bod_id, help_prop_id]])
rev_input_test

array([[0.    , 5.    , 0.6249, 0.    , 1.    ]])

In [64]:
# The right one !!
#rev_input_test = np.array([[rev_title_comp, star_rating, prod_title_comp, rev_bod_id, help_prop_id]])
#rev_input_test

In [65]:
prediction, probabilities = knn_classifier.predict(rev_input_test), knn_classifier.predict_proba(rev_input_test)[0]

prediction

prediction[0]
#probabilities

'Y'

In [115]:
test_2.to_dict()

{1556954: {'review_body': 'I ordered a 34X30.  They were a little long and darker then the picture shown?  Excellent price too',
  'review_headline': 'Nice Jeans',
  'product_title': "Calvin Klein Jeans Men's Relaxed Straight Leg Jean, Osaka Blue, 34x30",
  'star_rating': 4,
  'helpful_votes': 0,
  'total_votes': 0,
  'verified_purchase': 'Y'}}

In [116]:
test_1.to_dict()

{3342041: {'review_body': 'This was intended as a Christmas gift and I was not at all impressed with the condition of the item. The design was somewhat faded and it had such an ordor, I had to wash it before wrapping. Definitely wont be ordering again from this seller.',
  'review_headline': 'Not happy',
  'product_title': 'Juniors: Batman - Action Duo Juniors (Slim) T-Shirt Size M',
  'star_rating': 1,
  'helpful_votes': 0,
  'total_votes': 0,
  'verified_purchase': 'Y'}}