In [1]:
# import libraries

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# These are all the imports needed for the assignment
%matplotlib inline

# Import nltk package (Natural Language Toolkit)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [2]:
# Download the NLTK English tokenizer and the stopwords of all languages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
files = ['amazon_reviews_us_Electronics_v1_00.tsv', \
         'amazon_reviews_us_Gift_Card_v1_00.tsv', \
         'amazon_reviews_us_Major_Appliances_v1_00.tsv', \
         'amazon_reviews_us_Office_Products_v1_00.tsv', \
         'amazon_reviews_us_Shoes_v1_00.tsv', \
         'amazon_reviews_us_Toys_v1_00.tsv', \
         'amazon_reviews_us_Watches_v1_00.tsv']

In [5]:
columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', \
           'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
elec_df = pd.read_csv('fars_data/' + files[3], names =  columns, sep = '\t').iloc[1:,:]

In [6]:
elec_df.shape

(1559427, 15)

In [7]:
elec_df = elec_df.sample(n = 1_000_000)

In [8]:
one_file = elec_df.copy()

In [9]:
one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
53780,US,20658634,R2MXMLRI2E4XD8,B003QIMPV0,638568010,Hexomatic Ballpoint Silver Pen,Office Products,5,0.0,0.0,N,Y,Five Stars,Love the pen works great use it every day.,2015-08-17
497417,US,30807357,R2IUSSM1XJWKD5,B00112BS34,331943404,"Sentry Tilt-Display Desktop Calculator, Black (CA270)",Office Products,5,0.0,0.0,N,Y,As advertised,So far so good,2015-03-29
1366242,US,3124060,RDNIZWARDVHHS,B009MSFV0A,393196302,Generic ML-D101S Variation-,Office Products,1,0.0,0.0,N,Y,PREMIUM REPLACEMENT TONER CARTRIDGES MARKED A1-mlt-D101 do not work in samsung scx3405FW printers as Adv.,I bought two of these ink cartridges and they did not fit the Samsung scx3405fw. My machine kept saying they were incompatible w/ my machine. I finally got pissed off and went to office depot and bought one and it fit perfect and worked perfect. I would not buy this garbage again if they were 25 cents each. It did not work it my laser printer as the box even showed it would.,2014-05-25
278215,US,21963669,R17BVYKLV04QOT,B001B39PFG,643713377,VELCRO Brand - Adhesive - for Plastics 1 Oz Tube Adhesive,Office Products,5,0.0,0.0,N,N,It worked great!,"I used this glue to adhere velcro to a blood pressure cuff that I bought on ebay. It was a brand new cuff, just didn't have the velcro glued on sufficiently. It worked great!",2015-06-08
284378,US,4777894,R1Y8TL7KKS51K4,B0055T3HOW,904254287,"3dRose Mexicana colorful skulls woman women fine art mexico - Mouse Pad, 8 by 8 inches (mp_21205_1)",Office Products,5,0.0,0.0,N,Y,Five Stars,Love it! Fast shipping!,2015-06-06


## Data Sampling

In [10]:
def df_sampling(df):
    # Since we know that there are more unverified than verified --> we sample based on that
    
    # Since there are no data values in 'verified_purchase' columns that deviate from 'Y' or 'N' we proceed
    verified_count_df = df[df['verified_purchase'] == 'Y']
    unverified_count_df = df[df['verified_purchase'] == 'N']
    
    print("Number of verified purchases:", len(verified_count_df))
    print("Number of unverified purchases:", len(unverified_count_df))
    
    sample_len = len(unverified_count_df)
    
    verified_sample_df = verified_count_df.sample(n = sample_len)
    unified_df = pd.concat([unverified_count_df, verified_sample_df])
    
    print("Number of verified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'Y']))
    print("Number of unverified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'N']))
    
    return unified_df

In [11]:
balanced_elec = df_sampling(one_file)
display(balanced_elec.head(2))

Number of verified purchases: 904402
Number of unverified purchases: 95596
Number of verified purchases (balanced dataset): 95596
Number of unverified purchases (balanced dataset): 95596


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
278215,US,21963669,R17BVYKLV04QOT,B001B39PFG,643713377,VELCRO Brand - Adhesive - for Plastics 1 Oz Tube Adhesive,Office Products,5.0,0.0,0.0,N,N,It worked great!,"I used this glue to adhere velcro to a blood pressure cuff that I bought on ebay. It was a brand new cuff, just didn't have the velcro glued on sufficiently. It worked great!",2015-06-08
1097166,US,49536101,R2MK8PJSAD2FHM,B0002NYPGU,873001592,Reusable Rubber Finger Gloves(tm) for Durable and Versatile Finger Only Coverage ~ 12 Duet Mixed Finger Gloves(tm) Packet,Office Products,3.0,0.0,1.0,N,N,Sizes run small and will cut off your circulation if you are larger than a woman's size 10.,"These worked fine, but I wish they came in larger sizes. There are similar products, like finger cots that are just as sturdy, but fit better and cost less.",2014-09-25


In [15]:
print("The Office Products sample will have size:", balanced_elec.shape)

The Office Products sample will have size: (191192, 15)


## Data Cleaning & Type Conversion

In [16]:
#one_file = balanced_elec.copy()

In [17]:
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 904402
Number of unverified purchases (balanced dataset): 95596


In [18]:
# Star rating of the format '2015-04-02' exists! In addition, np.nan exists in the helpful and total votes
one_file['star_rating'].value_counts()

5             538138
4             122879
5             101794
1              83116
3.0            57961
2              38560
4              22862
1              16433
3              10958
2               7297
2015-06-05         1
Name: star_rating, dtype: int64

In [19]:
# Convert the type of relevant columns to int
one_file['star_rating'] = (one_file['star_rating']).apply(lambda star: np.NaN if (len(str(star)) > 3) else star)
one_file['star_rating'] = one_file['star_rating'].apply(lambda star: star if (pd.isna(star)) else int(star))
one_file['helpful_votes'] = one_file['helpful_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(vote))
one_file['total_votes'] = one_file['total_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(int(vote)))

In [20]:
import re

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
def df_cleaning(df, col):
    # Drop rows with na values
    df.dropna(inplace = True)
    
    new_col_name = 'new_' + col
    
    df[new_col_name] = df[col].copy() 
    
    # Remove unwanted formatting characters
    format_strs = dict.fromkeys(['<br /><br />', '&#34', 'br', '&quot', '<br />'], ' ')
    
    for key in format_strs:
        df[new_col_name] = df[new_col_name].apply(lambda review: review.replace(key, format_strs[key]))
    # removing quotes produces smthg like this --> 'The product has great ;sound; --> we must remove punctuation

    
    # Case normalization (lower case)
    df[new_col_name] = df[new_col_name].str.lower()
    
    remove_dict = {"0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "",
                   "(": "", ")":""}
    for key, val in remove_dict.items():
        df[new_col_name] = df[new_col_name].apply(
            lambda x: x.replace(key, val))
        
    # Remove stopwords
    stop_lst = stopwords.words('english')
    #stop_lst += (["can't", "i'm" "i'd", "i've", "i'll", "that's", "there's", "they're"])
    # ****Do we not have to take stopwords out BEFORE removing punctuation? Otherwise words with punct like “cant” remains there
    df[new_col_name] = df[new_col_name].apply(lambda text_body: " ".join([word for word in text_body.split() if word not in (stop_lst)]))
    
    # Removing Unicode Chars (punctuation, URL, @)
    df[new_col_name] = df[new_col_name].apply(lambda rev: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", rev))
    
    # Lemmatization
    word_lemmatizer = WordNetLemmatizer()
    df[new_col_name] = df[new_col_name].apply(lambda txt: " ".join([(word_lemmatizer.lemmatize(word)) for word in txt.split()]))
    
    return df

In [22]:
cleaned2 = df_cleaning(one_file, 'review_body')
cleaned2.get(['review_body', 'new_review_body']).head()

Unnamed: 0,review_body,new_review_body
53780,Love the pen works great use it every day.,love pen work great use every day
497417,So far so good,far good
1366242,I bought two of these ink cartridges and they did not fit the Samsung scx3405fw. My machine kept saying they were incompatible w/ my machine. I finally got pissed off and went to office depot and bought one and it fit perfect and worked perfect. I would not buy this garbage again if they were 25 cents each. It did not work it my laser printer as the box even showed it would.,bought two ink cartridge fit samsung scxfw machine kept saying incompatible w machine finally got pissed went office depot bought one fit perfect worked perfect would buy garbage cent each work laser printer box even showed would
278215,"I used this glue to adhere velcro to a blood pressure cuff that I bought on ebay. It was a brand new cuff, just didn't have the velcro glued on sufficiently. It worked great!",used glue adhere velcro blood pressure cuff bought ebay new cuff velcro glued sufficiently worked great
284378,Love it! Fast shipping!,love it fast shipping


In [23]:
cleaned2 = df_cleaning(one_file, 'review_headline')
cleaned2.get(['review_headline', 'new_review_headline']).head()

Unnamed: 0,review_headline,new_review_headline
53780,Five Stars,five star
497417,As advertised,advertised
1366242,PREMIUM REPLACEMENT TONER CARTRIDGES MARKED A1-mlt-D101 do not work in samsung scx3405FW printers as Adv.,premium replacement toner cartridge marked amltd work samsung scxfw printer adv
278215,It worked great!,worked great
284378,Five Stars,five star


In [24]:
cleaned2 = df_cleaning(one_file, 'product_title')
cleaned2.get(['product_title', 'new_product_title']).head()

Unnamed: 0,product_title,new_product_title
53780,Hexomatic Ballpoint Silver Pen,hexomatic ballpoint silver pen
497417,"Sentry Tilt-Display Desktop Calculator, Black (CA270)",sentry tiltdisplay desktop calculator black ca
1366242,Generic ML-D101S Variation-,generic mlds variation
278215,VELCRO Brand - Adhesive - for Plastics 1 Oz Tube Adhesive,velcro brand adhesive plastic oz tube adhesive
284378,"3dRose Mexicana colorful skulls woman women fine art mexico - Mouse Pad, 8 by 8 inches (mp_21205_1)",drose mexicana colorful skull woman woman fine art mexico mouse pad inch mp


In [25]:
print("Number of verified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 904352
Number of unverified purchases (balanced dataset): 95585


## Vader Sentiment Analysis

In [26]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def get_sentiment_scores(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    return snt

In [27]:
one_file = cleaned2.copy()

In [28]:
# Add 4 new columns to cleaned_elec df: neg_prop, neu_prop, pos_prop, compound_prop
one_file['rev_dict'] = one_file['new_review_body'].apply(get_sentiment_scores)

def get_neg(review_dict):
    return review_dict['neg']

def get_neu(review_dict):
    return review_dict['neu']

def get_pos(review_dict):
    return review_dict['pos']

def get_compound(review_dict):
    return review_dict['compound']

def only_compound(x):
    dct = get_sentiment_scores(x)
    return dct['compound']

In [29]:
#get neg prop
one_file['neg_prop'] = one_file['rev_dict'].apply(get_neg)
#get neu prop
one_file['neu_prop'] = one_file['rev_dict'].apply(get_neu)
#get pos prop
one_file['pos_prop'] = one_file['rev_dict'].apply(get_pos)
#get compound prop
one_file['compound_prop'] = one_file['rev_dict'].apply(get_compound)

In [30]:
one_file.head()
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 904352
Number of unverified purchases (balanced dataset): 95585


## Confusion Matrix

In [31]:
def plot_confusion_matrix(cm, target_names,
                          fname, epoch,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True, target=None):
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools
    plt.style.use('default')

    # # only true if it weren't normalized:
    # accuracy = np.trace(cm) / float(np.sum(cm))
    # misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm[np.isnan(cm)] = 0.0

    fig = plt.figure(figsize=(5, 4))
    ax = plt.axes()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    if target == "rule-based":
        plt.title(title + ' for rule-based PF')
    else:
        plt.title(title + ' for MLPF at epoch ' + str(epoch))

    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlim(-1, len(target_names))
    plt.ylim(-1, len(target_names))
    plt.xlabel('Predicted label')
    # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.tight_layout()
    plt.savefig(fname + '.png')
    plt.savefig(fname + '.pdf')
    #plt.close(fig)

    return fig, ax

## KNN

In [32]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [33]:
one_file.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'new_review_body',
       'new_review_headline', 'new_product_title', 'rev_dict', 'neg_prop',
       'neu_prop', 'pos_prop', 'compound_prop'],
      dtype='object')

In [34]:
one_file['prod_title_comp'] = one_file.get("new_product_title").apply(only_compound)
one_file['rev_title_comp'] = one_file.get("new_review_headline").apply(only_compound)

In [35]:
def helpful_prop(df):  
    vote_series = pd.Series(df['helpful_votes'] / df['total_votes'])
    # !! All nan values in votes_prop should be changed to zero: this means that 0/0 occurred !!
    vote_series = vote_series.fillna(0)
    return vote_series

In [36]:
# Helper function for id'ing the review body sentiments into -1, 0, 1 depending on majority sentiment
def id_for_dictionary(dic):
    if len(dic) == 4:
        ind = list(dic.values()).index(max(list(dic.values())[0:-1])) #remove the compound
    else:
        ind = list(dic.values()).index(max(list(dic.values())))
    
    # If at idx 1, neutral sent
    if ind == 1:
        return 0
    # If at idx 0, negative sent
    elif ind == 0:
        return -1
    else:
        return 1

In [37]:
# Helper function for id'ing the helper vote proportion
def id_for_prop(prop):
    if prop < 0.45:
        return -1
    elif prop > 0.55:
        return 1
    else:
        return 0

In [38]:
one_file['help_prop'] = helpful_prop(one_file)
one_file['rev_bod_id'] = one_file['rev_dict'].apply(id_for_dictionary)
one_file['help_prop_id'] = one_file["help_prop"].apply(id_for_prop)
one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,new_review_body,new_review_headline,new_product_title,rev_dict,neg_prop,neu_prop,pos_prop,compound_prop,prod_title_comp,rev_title_comp,help_prop,rev_bod_id,help_prop_id
53780,US,20658634,R2MXMLRI2E4XD8,B003QIMPV0,638568010,Hexomatic Ballpoint Silver Pen,Office Products,5.0,0.0,0.0,N,Y,Five Stars,Love the pen works great use it every day.,2015-08-17,love pen work great use every day,five star,hexomatic ballpoint silver pen,"{'neg': 0.0, 'neu': 0.376, 'pos': 0.624, 'compound': 0.8519}",0.0,0.376,0.624,0.8519,0.0,0.0,0.0,1,-1
497417,US,30807357,R2IUSSM1XJWKD5,B00112BS34,331943404,"Sentry Tilt-Display Desktop Calculator, Black (CA270)",Office Products,5.0,0.0,0.0,N,Y,As advertised,So far so good,2015-03-29,far good,advertised,sentry tiltdisplay desktop calculator black ca,"{'neg': 0.0, 'neu': 0.256, 'pos': 0.744, 'compound': 0.4404}",0.0,0.256,0.744,0.4404,0.0,0.0,0.0,1,-1
1366242,US,3124060,RDNIZWARDVHHS,B009MSFV0A,393196302,Generic ML-D101S Variation-,Office Products,1.0,0.0,0.0,N,Y,PREMIUM REPLACEMENT TONER CARTRIDGES MARKED A1-mlt-D101 do not work in samsung scx3405FW printers as Adv.,I bought two of these ink cartridges and they did not fit the Samsung scx3405fw. My machine kept saying they were incompatible w/ my machine. I finally got pissed off and went to office depot and bought one and it fit perfect and worked perfect. I would not buy this garbage again if they were 25 cents each. It did not work it my laser printer as the box even showed it would.,2014-05-25,bought two ink cartridge fit samsung scxfw machine kept saying incompatible w machine finally got pissed went office depot bought one fit perfect worked perfect would buy garbage cent each work laser printer box even showed would,premium replacement toner cartridge marked amltd work samsung scxfw printer adv,generic mlds variation,"{'neg': 0.086, 'neu': 0.658, 'pos': 0.255, 'compound': 0.802}",0.086,0.658,0.255,0.802,0.0,0.0,0.0,0,-1
278215,US,21963669,R17BVYKLV04QOT,B001B39PFG,643713377,VELCRO Brand - Adhesive - for Plastics 1 Oz Tube Adhesive,Office Products,5.0,0.0,0.0,N,N,It worked great!,"I used this glue to adhere velcro to a blood pressure cuff that I bought on ebay. It was a brand new cuff, just didn't have the velcro glued on sufficiently. It worked great!",2015-06-08,used glue adhere velcro blood pressure cuff bought ebay new cuff velcro glued sufficiently worked great,worked great,velcro brand adhesive plastic oz tube adhesive,"{'neg': 0.108, 'neu': 0.69, 'pos': 0.202, 'compound': 0.4404}",0.108,0.69,0.202,0.4404,0.0,0.6249,0.0,0,-1
284378,US,4777894,R1Y8TL7KKS51K4,B0055T3HOW,904254287,"3dRose Mexicana colorful skulls woman women fine art mexico - Mouse Pad, 8 by 8 inches (mp_21205_1)",Office Products,5.0,0.0,0.0,N,Y,Five Stars,Love it! Fast shipping!,2015-06-06,love it fast shipping,five star,drose mexicana colorful skull woman woman fine art mexico mouse pad inch mp,"{'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.6369}",0.0,0.417,0.583,0.6369,0.2023,0.0,0.0,1,-1


In [39]:
imp_col = one_file[['verified_purchase', 'prod_title_comp', 'star_rating', 'rev_title_comp', 'rev_bod_id', 'help_prop_id']]
imp_col.dtypes

verified_purchase     object
prod_title_comp      float64
star_rating          float64
rev_title_comp       float64
rev_bod_id             int64
help_prop_id           int64
dtype: object

In [40]:
imp_col = imp_col.dropna()

In [41]:
X = imp_col.iloc[:, [1,2,3,4,5]].values #only taking in the categories that will be used as a dataframe
y = imp_col.iloc[:, 0].values

In [42]:
X

array([[ 0.    ,  5.    ,  0.    ,  1.    , -1.    ],
       [ 0.    ,  5.    ,  0.    ,  1.    , -1.    ],
       [ 0.    ,  1.    ,  0.    ,  0.    , -1.    ],
       ...,
       [ 0.    ,  5.    ,  0.    ,  1.    , -1.    ],
       [ 0.    ,  1.    , -0.7783,  0.    , -1.    ],
       [ 0.    ,  5.    ,  0.    ,  1.    , -1.    ]])

In [43]:
print("Number of verified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 904352
Number of unverified purchases (balanced dataset): 95585


In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [45]:
from sklearn.neighbors import KNeighborsClassifier

#we are using 
#5 neighborhood points are required for classifying a given point -- distance metric is using the minkonowski equation
knn_classifier = KNeighborsClassifier(n_neighbors = 20, metric = 'euclidean', p = 1)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=20, p=1)

In [46]:
y_pred = knn_classifier.predict(X_test)

In [47]:
#We can evaluate our model using the confusion matrix and accuracy score by comparing the predicted and actual test values

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [48]:
print(cm)
print(ac)

[[   186  18753]
 [   250 180799]]
0.9049792987579255


In [49]:
from joblib import dump, load

In [50]:
name = 'knn_office_products_updated_model.joblib'
dump(knn_classifier, name)

['knn_office_products_updated_model.joblib']

In [51]:
knn_classifier = load(name)

## Getting Reviews

In [135]:
# Getting the indices of the reviews that were predicted correctly
ind1 = list((y_test == y_pred) & (y_pred == 'N'))#.index(True)
# & (y_test == 'N')
indexes = [i for i, x in enumerate(ind1) if x]
indexes

[18,
 657,
 976,
 1072,
 2162,
 3252,
 3389,
 6148,
 6224,
 6940,
 8065,
 8555,
 8860,
 9043,
 9364,
 10224,
 11045,
 18546,
 19232,
 20796,
 22749,
 23500,
 23756,
 25355,
 27292,
 28081,
 28748,
 28800,
 30171,
 31487,
 33584,
 34580,
 35438,
 37187,
 38968,
 42304,
 42322,
 42506,
 43377,
 43576,
 44175,
 44236,
 44727,
 45375,
 46154,
 48404,
 48809,
 51108,
 51204,
 53140,
 53143,
 53327,
 57780,
 61182,
 62578,
 62581,
 63248,
 63980,
 64091,
 65000,
 65892,
 66423,
 66723,
 68294,
 69814,
 71468,
 73104,
 73713,
 75705,
 76152,
 76229,
 77748,
 79912,
 80331,
 82385,
 83741,
 84916,
 84968,
 87325,
 87836,
 90024,
 90387,
 90493,
 91753,
 91917,
 92065,
 92186,
 94013,
 94152,
 95038,
 95341,
 95723,
 96241,
 96951,
 99159,
 99519,
 100216,
 100234,
 100344,
 101705,
 102154,
 102190,
 102833,
 103348,
 105337,
 106916,
 107346,
 113145,
 115677,
 116225,
 116555,
 117179,
 118517,
 120240,
 120340,
 120396,
 120572,
 121281,
 122250,
 126091,
 126133,
 128457,
 130142,
 130862,

In [136]:
95596*2

191192

In [137]:
len(indexes)

186

In [148]:
import random
random_num = random.choice(indexes)
print(random_num)
ind1 = random_num
#1280
X_test[ind1]

71468


array([ 0.8885,  5.    ,  0.    ,  0.    , -1.    ])

In [149]:
test = list(X)
test;

In [150]:
test2 = (test == X_test[ind1]) 
test2

array([[False,  True,  True, False,  True],
       [False,  True,  True, False,  True],
       [False, False,  True,  True,  True],
       ...,
       [False,  True,  True, False,  True],
       [False, False, False,  True,  True],
       [False,  True,  True, False,  True]])

def which_switch(ind2):
    x = [ind2.index(i) for i in ind2 if i == True]
    return x
which_switch(ind2)

In [151]:
ind2 = list(np.all(test2, axis = 1))#.index(True)
ind2
n_lst = [i for i, x in enumerate(ind2) if x]
(n_lst)

[31132,
 68624,
 88379,
 104276,
 150664,
 204621,
 254053,
 335580,
 347948,
 353687,
 362349,
 366988,
 367038,
 384304,
 418690,
 438661,
 495244,
 520278,
 522860,
 551677,
 603839,
 615867,
 616006,
 710225,
 782536,
 785162,
 787943,
 819428,
 821672,
 888575,
 904152,
 925071,
 956819,
 964575,
 967223,
 974194,
 979097,
 980242,
 998943]

In [152]:
#23 Y; 88 Y; 
test_1 = one_file.iloc[819428].to_frame().loc[['review_body', 'review_headline', 'product_title', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase']]
test_1

Unnamed: 0,299453
review_body,This products are very bright and the chalk color stands out really well.
review_headline,Five Stars
product_title,"CHALK MARKERS, Best 8 Pack Set of Bold Vibrant Colors by SillySticks, Erasable from Glass, Plastic, Mirrors, Metal, Whiteboards, Chalkboards, Perfect for Teachers, Kids and Moms, Nontoxic and Odorless, Enhance Your Arts and Crafts Projects Now!"
star_rating,5.0
helpful_votes,0.0
total_votes,0.0
verified_purchase,N


In [144]:
test_2 = one_file.iloc[444650].to_frame().loc[['review_body', 'review_headline', 'product_title', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase']]
test_2

Unnamed: 0,133409
review_body,"Scissors are great. Easy to grasp and manipulate and can be used by right or left hand. Also, very small and light weight make convenient for travel."
review_headline,Five Stars
product_title,Scissors: Adaptive Loop Scissors for Easy Cutting from Cues4Clues | For both Righties and Lefties |Great for Children and Adults | Helps Build Confidence in Cutting | Improve Coordination and Hand Strength | Have Fun with Arts and Crafts
star_rating,5.0
helpful_votes,1.0
total_votes,1.0
verified_purchase,N


## Getting Predictions

In [332]:
print('''Disney Animators' Collection Elsa Doll - 16"''')

Disney Animators' Collection Elsa Doll - 16"


In [114]:
review_body = 'Arrived very quickly, well packaged. Followed the provided instructions for installation and the emailed instructions for reset. Ran the internal print test and font list and the image looks good. Happy so far.'
review_title = 'Great Initial Results'
product_title = 'EPS Replacement Brother TN450 Toner Cartridge, High Yield (2,600 Yield) - Black'
star_rating = 5
helpful_votes = 0
total_votes = 0

In [115]:
test = pd.DataFrame()
test['review_body'] = np.array([review_body])
test['review_title'] = np.array([review_title])
test['product_title'] = np.array([product_title])
test

Unnamed: 0,review_body,review_title,product_title
0,"Arrived very quickly, well packaged. Followed the provided instructions for installation and the emailed instructions for reset. Ran the internal print test and font list and the image looks good. Happy so far.",Great Initial Results,"EPS Replacement Brother TN450 Toner Cartridge, High Yield (2,600 Yield) - Black"


In [116]:
out = df_cleaning(test, 'review_body')
out = df_cleaning(out, 'review_title')
out = df_cleaning(out, 'product_title')
out

Unnamed: 0,review_body,review_title,product_title,new_review_body,new_review_title,new_product_title
0,"Arrived very quickly, well packaged. Followed the provided instructions for installation and the emailed instructions for reset. Ran the internal print test and font list and the image looks good. Happy so far.",Great Initial Results,"EPS Replacement Brother TN450 Toner Cartridge, High Yield (2,600 Yield) - Black",arrived quickly well packaged followed provided instruction installation emailed instruction reset ran internal print test font list image look good happy far,great initial result,eps replacement brother tn toner cartridge high yield yield black


In [117]:
rev_bod_id = id_for_dictionary(analyser.polarity_scores(out['new_review_body'][0]))
prop = 1
help_prop_id = id_for_prop(prop)
prod_title_comp = only_compound(out['new_product_title'][0])
rev_title_comp = only_compound(out['new_review_title'][0])

In [118]:
# The one we had originally, and is producing the wrong order of proportions
rev_input_test = np.array([[prod_title_comp, star_rating, rev_title_comp, rev_bod_id, help_prop_id]])
rev_input_test

array([[0.    , 5.    , 0.6249, 0.    , 1.    ]])

In [119]:
# The right one !!
#rev_input_test = np.array([[rev_title_comp, star_rating, prod_title_comp, rev_bod_id, help_prop_id]])
#rev_input_test

In [120]:
prediction, probabilities = knn_classifier.predict(rev_input_test), knn_classifier.predict_proba(rev_input_test)[0]

prediction

prediction[0]
#probabilities

'Y'

In [154]:
test_2.to_dict()

{133409: {'review_body': 'Scissors are great. Easy to grasp and manipulate and can be used by right or left hand. Also, very small and light weight make convenient for travel.',
  'review_headline': 'Five Stars',
  'product_title': 'Scissors: Adaptive Loop Scissors for Easy Cutting from Cues4Clues | For both Righties and Lefties |Great for Children and Adults | Helps Build Confidence in Cutting | Improve Coordination and Hand Strength | Have Fun with Arts and Crafts',
  'star_rating': 5.0,
  'helpful_votes': 1.0,
  'total_votes': 1.0,
  'verified_purchase': 'N'}}