In [2]:
# import libraries

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# These are all the imports needed for the assignment
%matplotlib inline

# Import nltk package (Natural Language Toolkit)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [3]:
# Download the NLTK English tokenizer and the stopwords of all languages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [72]:
# Personal_Care_Appliances

In [73]:
columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', \
           'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
elec_df = pd.read_csv("C:/Users/19495/DS3ProjectFiles/amazon_reviews_us_Electronics_v1_00.tsv.gz", names =  columns, sep = '\t').iloc[1:,:]

In [82]:
elec_df = elec_df.sample(n = 200000)

In [83]:
one_file = elec_df.copy()
print("The number of Personal Care Appliance reviews is much smaller than Electronics:", one_file.shape)

The number of Personal Care Appliance reviews is much smaller than Electronics: (200000, 15)


## Data Sampling

In [75]:
def df_sampling(df):
    # Since we know that there are more unverified than verified --> we sample based on that
    
    # Since there are no data values in 'verified_purchase' columns that deviate from 'Y' or 'N' we proceed
    verified_count_df = df[df['verified_purchase'] == 'Y']
    unverified_count_df = df[df['verified_purchase'] == 'N']
    
    print("Number of verified purchases:", len(verified_count_df))
    print("Number of unverified purchases:", len(unverified_count_df))
    
    sample_len = len(unverified_count_df)
    
    verified_sample_df = verified_count_df.sample(n = sample_len)
    unified_df = pd.concat([unverified_count_df, verified_sample_df])
    
    print("Number of verified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'Y']))
    print("Number of unverified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'N']))
    
    return unified_df

In [84]:
balanced_elec = df_sampling(one_file)
display(balanced_elec.head(2))

Number of verified purchases: 168000
Number of unverified purchases: 32000
Number of verified purchases (balanced dataset): 32000
Number of unverified purchases (balanced dataset): 32000


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
2565477,US,19361542,R210AJI2MGVUA9,B004YI3LGY,55021312,Logitech Ultimate Ears 350 Noise-Isolating Earphones,Electronics,4,0,0,N,N,Mels thoughts,"[[ASIN:B004YI3LGY Logitech Ultimate Ears 350vi Noise-Isolating Headset (Silver)]] Being hard of hearing, I thought the sound was great. Unfortunately I do not have a smart phone and the buds wont fit my Kyocera Vera phone. They do fit my Panasonic portable CD player except the remote does not work but the remote that came with the player works. I wish the new buds did. Also one suggestionn, the remote for the new buds could be a about 4\\"" further away from the earphones so that you coudl see what you are adjusting. I still will use the new buds since I like the sound. Mel G",2011-07-12
2944114,US,51064147,R2JPC7W6AECE7Y,B000TMBOXG,992838678,Sungale/Alpha SW7A-072 7 Inch Digital Picture Frame with Speaker - Brown,Electronics,2,0,0,N,N,Not very good,"Remote control only works if you point it at a small hole on the lower left side of screen. (I also replaced the battery with a fresh one.) Per customer support, the best thing to do to make it work is to REMOVE the frame around the screen first, use the remote, then put the frame back! I don't think so. Bottom line: Would I buy this product again? Nope. I am returning it ASAP.",2007-12-14


In [86]:
print("The Personal Care Appliances sample will have size:", balanced_elec.shape)

The Personal Care Appliances sample will have size: (64000, 15)


## Data Cleaning & Type Conversion

In [87]:
# Convert the type of relevant columns to int
one_file['star_rating'] = one_file['star_rating'].apply(int)
one_file['helpful_votes'] = one_file['helpful_votes'].apply(int)
one_file['total_votes'] = one_file['total_votes'].apply(int)

In [79]:
import re

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [80]:
def df_cleaning(df, col):
    # Drop rows with na values
    df.dropna(inplace = True)
    
    new_col_name = 'new_' + col
    
    df[new_col_name] = df[col].copy() 
    
    # Remove unwanted formatting characters
    format_strs = dict.fromkeys(['<br /><br />', '&#34', 'br', '&quot', '<br />'], ' ')
    
    for key in format_strs:
        df[new_col_name] = df[new_col_name].apply(lambda review: review.replace(key, format_strs[key]))
    # removing quotes produces smthg like this --> 'The product has great ;sound; --> we must remove punctuation

    
    # Case normalization (lower case)
    df[new_col_name] = df[new_col_name].str.lower()
    
    remove_dict = {"0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "",
                   "(": "", ")":""}
    for key, val in remove_dict.items():
        df[new_col_name] = df[new_col_name].apply(
            lambda x: x.replace(key, val))
        
    # Remove stopwords
    stop_lst = stopwords.words('english')
    #stop_lst += (["can't", "i'm" "i'd", "i've", "i'll", "that's", "there's", "they're"])
    # ****Do we not have to take stopwords out BEFORE removing punctuation? Otherwise words with punct like “cant” remains there
    df[new_col_name] = df[new_col_name].apply(lambda text_body: " ".join([word for word in text_body.split() if word not in (stop_lst)]))
    
    # Removing Unicode Chars (punctuation, URL, @)
    df[new_col_name] = df[new_col_name].apply(lambda rev: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", rev))
    
    # Lemmatization
    word_lemmatizer = WordNetLemmatizer()
    df[new_col_name] = df[new_col_name].apply(lambda txt: " ".join([(word_lemmatizer.lemmatize(word)) for word in txt.split()]))
    
    return df

In [88]:
cleaned2 = df_cleaning(one_file, 'review_body')
cleaned2.get(['review_body', 'new_review_body']).head()

Unnamed: 0,review_body,new_review_body
1122710,Works perfectly connecting my Turtle Beach X41 headset to my Xbox One controller adapter.,work perfectly connecting turtle beach x headset xbox one controller adapter
857571,"I got two of these for my mom, who is not technically savvy. One for each of my parents' two homes. I initially showed her how to use it, and off she went! She was having fun with Pandora all day.<br /><br />My mom likes to listen to music when she cleans and tidys the house up. She likes that she can hear it in another room and the sound is still clear.<br /><br />She loves it! What a great value! She's thrilled with the product and I'm happy she's happy. It was money well spent.",got two mom technically savvy one parent two home initially showed use it went fun pandora day mom like listen music clean tidy house up like hear another room sound still clear love it great value thrilled product im happy happy money well spent
2565477,"[[ASIN:B004YI3LGY Logitech Ultimate Ears 350vi Noise-Isolating Headset (Silver)]] Being hard of hearing, I thought the sound was great. Unfortunately I do not have a smart phone and the buds wont fit my Kyocera Vera phone. They do fit my Panasonic portable CD player except the remote does not work but the remote that came with the player works. I wish the new buds did. Also one suggestionn, the remote for the new buds could be a about 4\\"" further away from the earphones so that you coudl see what you are adjusting. I still will use the new buds since I like the sound. Mel G",asinbyilgy logitech ultimate ear vi noiseisolating headset silver hard hearing thought sound great unfortunately smart phone bud wont fit kyocera vera phone fit panasonic portable cd player except remote work remote came player work wish new bud did also one suggestionn remote new bud could away earphone coudl see adjusting still use new bud since like sound mel g
1031804,Got this bright idea of separating CDs that I carry in my car into categories so need several cases.,got ight idea separating cd carry car category need several case
825042,"These are my favorite headphones. I've bought multiple sets for the car, work, exercise, home. Inexpensive, fit well and stay in place comfortably...and good sound!",favorite headphone ive bought multiple set car work exercise home inexpensive fit well stay place comfortablyand good sound


In [89]:
cleaned2 = df_cleaning(one_file, 'review_headline')
cleaned2.get(['review_headline', 'new_review_headline']).head()

Unnamed: 0,review_headline,new_review_headline
1122710,Awesome!,awesome
857571,Great value and perfect for novices!,great value perfect novice
2565477,Mels thoughts,mels thought
1031804,Blue CD case,blue cd case
825042,Low cost and relatively durable,low cost relatively durable


In [90]:
cleaned2 = df_cleaning(one_file, 'product_title')
cleaned2.get(['product_title', 'new_product_title']).head()

Unnamed: 0,product_title,new_product_title
1122710,Steren 3 feet 2.5mm Male To 2.5mm Male - Stereo,steren foot mm male mm male stereo
857571,iHome Bluetooth Rechargeable Mini Speaker Cube - Green (iBT16QC),ihome bluetooth rechargeable mini speaker cube green ibtqc
2565477,Logitech Ultimate Ears 350 Noise-Isolating Earphones,logitech ultimate ear noiseisolating earphone
1031804,Case Logic EVA Molded CD/DVD Case,case logic eva molded cddvd case
825042,Philips SHS3200WT/37 Flexible Earhook Headphones White,philip shswt flexible earhook headphone white


## Vader Sentiment Analysis

In [91]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def get_sentiment_scores(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    return snt

In [92]:
one_file = cleaned2.copy()

In [93]:
# Add 4 new columns to cleaned_elec df: neg_prop, neu_prop, pos_prop, compound_prop
sentiment_dicts = pd.Series(one_file['new_review_body'].apply(get_sentiment_scores))

one_file['neg_prop'] = sentiment_dicts.apply(lambda x: x['neg'])
one_file['neu_prop'] = sentiment_dicts.apply(lambda x: x['neu'])
one_file['pos_prop'] = sentiment_dicts.apply(lambda x: x['pos'])
one_file['compound_prop'] = sentiment_dicts.apply(lambda x: x['compound'])

In [94]:
one_file.to_csv('personal_care_appliances_data_cleaned.csv')

KeyboardInterrupt: 

## Confusion Matrix

In [43]:
def plot_confusion_matrix(cm, target_names,
                          fname, epoch,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True, target=None):
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools
    plt.style.use('default')

    # # only true if it weren't normalized:
    # accuracy = np.trace(cm) / float(np.sum(cm))
    # misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm[np.isnan(cm)] = 0.0

    fig = plt.figure(figsize=(5, 4))
    ax = plt.axes()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    if target == "rule-based":
        plt.title(title + ' for rule-based PF')
    else:
        plt.title(title + ' for MLPF at epoch ' + str(epoch))

    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlim(-1, len(target_names))
    plt.ylim(-1, len(target_names))
    plt.xlabel('Predicted label')
    # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.tight_layout()
    plt.savefig(fname + '.png')
    plt.savefig(fname + '.pdf')
    #plt.close(fig)

    return fig, ax

## KNN

In [44]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [95]:
one_file.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'new_review_body',
       'new_review_headline', 'new_product_title', 'neg_prop', 'neu_prop',
       'pos_prop', 'compound_prop'],
      dtype='object')

In [96]:
one_file['prod_title_comp'] = one_file.get("new_product_title").apply(only_compound)
one_file['rev_title_comp'] = one_file.get("new_review_headline").apply(only_compound)
one_file['help_prop'] = one_file.get("helpful_votes") / one_file.get("total_votes")

In [97]:
# Helper function for id'ing the review body sentiments into -1, 0, 1 depending on majority sentiment
def id_for_dictionary(dic):
    if len(dic) == 4:
        ind = list(dic.values()).index(max(list(dic.values())[0:-1])) #remove the compound
    else:
        ind = list(dic.values()).index(max(list(dic.values())))
    
    # If at idx 1, neutral sent
    if ind == 1:
        return 0
    # If at idx 0, negative sent
    elif ind == 0:
        return -1
    else:
        return 1

In [98]:
# Helper function for id'ing the helper vote proportion
def id_for_prop(prop):
    if prop < 0.45:
        return -1
    elif prop > 0.55:
        return 1
    else:
        return 0

In [99]:
one_file['rev_bod_id'] = sentiment_dicts.apply(id_for_dictionary)
one_file['help_prop_id'] = one_file.get("help_prop").apply(id_for_prop)
one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,new_review_body,new_review_headline,new_product_title,neg_prop,neu_prop,pos_prop,compound_prop,prod_title_comp,rev_title_comp,help_prop,rev_bod_id,help_prop_id
1122710,US,32670331,R2XCHH5BG4HPDV,B00291F4RM,616126032,Steren 3 feet 2.5mm Male To 2.5mm Male - Stereo,Electronics,5,1,1,N,Y,Awesome!,Works perfectly connecting my Turtle Beach X41 headset to my Xbox One controller adapter.,2014-09-11,work perfectly connecting turtle beach x headset xbox one controller adapter,awesome,steren foot mm male mm male stereo,0.0,0.704,0.296,0.6369,0.0,0.6249,1.0,0,1
857571,US,11668644,R1TOP3JR17TU14,B00B9AB26G,959531361,iHome Bluetooth Rechargeable Mini Speaker Cube - Green (iBT16QC),Electronics,5,0,0,N,Y,Great value and perfect for novices!,"I got two of these for my mom, who is not technically savvy. One for each of my parents' two homes. I initially showed her how to use it, and off she went! She was having fun with Pandora all day.<br /><br />My mom likes to listen to music when she cleans and tidys the house up. She likes that she can hear it in another room and the sound is still clear.<br /><br />She loves it! What a great value! She's thrilled with the product and I'm happy she's happy. It was money well spent.",2014-12-18,got two mom technically savvy one parent two home initially showed use it went fun pandora day mom like listen music clean tidy house up like hear another room sound still clear love it great value thrilled product im happy happy money well spent,great value perfect novice,ihome bluetooth rechargeable mini speaker cube green ibtqc,0.0,0.466,0.534,0.9879,0.0,0.8807,,1,0
2565477,US,19361542,R210AJI2MGVUA9,B004YI3LGY,55021312,Logitech Ultimate Ears 350 Noise-Isolating Earphones,Electronics,4,0,0,N,N,Mels thoughts,"[[ASIN:B004YI3LGY Logitech Ultimate Ears 350vi Noise-Isolating Headset (Silver)]] Being hard of hearing, I thought the sound was great. Unfortunately I do not have a smart phone and the buds wont fit my Kyocera Vera phone. They do fit my Panasonic portable CD player except the remote does not work but the remote that came with the player works. I wish the new buds did. Also one suggestionn, the remote for the new buds could be a about 4\\"" further away from the earphones so that you coudl see what you are adjusting. I still will use the new buds since I like the sound. Mel G",2011-07-12,asinbyilgy logitech ultimate ear vi noiseisolating headset silver hard hearing thought sound great unfortunately smart phone bud wont fit kyocera vera phone fit panasonic portable cd player except remote work remote came player work wish new bud did also one suggestionn remote new bud could away earphone coudl see adjusting still use new bud since like sound mel g,mels thought,logitech ultimate ear noiseisolating earphone,0.083,0.714,0.203,0.8621,0.0,0.0,,0,0
1031804,US,18172969,RI7ZVWJEXV38S,B0015RB75Y,591072935,Case Logic EVA Molded CD/DVD Case,Electronics,4,0,0,N,Y,Blue CD case,Got this bright idea of separating CDs that I carry in my car into categories so need several cases.,2014-10-16,got ight idea separating cd carry car category need several case,blue cd case,case logic eva molded cddvd case,0.0,1.0,0.0,0.0,0.0,0.0,,0,0
825042,US,47163841,R1JCGL7B7VDPKQ,B003CJTR82,41165686,Philips SHS3200WT/37 Flexible Earhook Headphones White,Electronics,5,0,0,N,Y,Low cost and relatively durable,"These are my favorite headphones. I've bought multiple sets for the car, work, exercise, home. Inexpensive, fit well and stay in place comfortably...and good sound!",2014-12-28,favorite headphone ive bought multiple set car work exercise home inexpensive fit well stay place comfortablyand good sound,low cost relatively durable,philip shswt flexible earhook headphone white,0.0,0.571,0.429,0.8591,0.2263,-0.2732,,0,0


In [100]:
imp_col = one_file[['verified_purchase', 'prod_title_comp', 'star_rating', 'rev_title_comp', 'rev_bod_id', 'help_prop_id']]
imp_col.dtypes

verified_purchase     object
prod_title_comp      float64
star_rating            int64
rev_title_comp       float64
rev_bod_id             int64
help_prop_id           int64
dtype: object

In [101]:
X = imp_col.iloc[:, [1,2,3,4,5]].values #only taking in the categories that will be used as a dataframe
y = imp_col.iloc[:, 0].values

In [102]:
X

array([[0.    , 5.    , 0.6249, 0.    , 1.    ],
       [0.    , 5.    , 0.8807, 1.    , 0.    ],
       [0.    , 4.    , 0.    , 0.    , 0.    ],
       ...,
       [0.4588, 5.    , 0.    , 1.    , 0.    ],
       [0.4404, 5.    , 0.6249, 1.    , 0.    ],
       [0.4019, 5.    , 0.    , 1.    , 0.    ]])

In [103]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [104]:
# Next, we are doing feature scaling to the training and test set of independent variables for reducing the size to smaller values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [105]:
from sklearn.neighbors import KNeighborsClassifier

#we are using 
#5 neighborhood points are required for classifying a given point -- distance metric is using the minkonowski equation
knn_classifier = KNeighborsClassifier(n_neighbors = 20, metric = 'euclidean', p = 1)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=1,
                     weights='uniform')

In [106]:
y_pred = knn_classifier.predict(X_test)

In [107]:
#We can evaluate our model using the confusion matrix and accuracy score by comparing the predicted and actual test values

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [108]:
print(cm)
print(ac)

[[  245  6219]
 [  523 33010]]
0.8314373578018351


In [None]:

print(cm)

print(ac)

#Can see the model performance and add more features accordingly -- 
#would be good if the performance is greater than 85%

### Test on a product review - Need to write a *function* for this for taking in the user input

The features that we are looking at!
* 'prod_title_comp', 
* 'star_rating', 
* 'rev_title_comp', 
* 'rev_bod_id', 
* 'help_prop_id'

review_body = "this is a good review"
product_title = "Sony Headphones"
review_title = 'Love the product!'

star_rating = 5
helpful_votes = 1
total_votes = 1

test = pd.DataFrame()
test['review_body'] = np.array([review_body])
test['review_title'] = np.array([review_title])
test['product_title'] = np.array([product_title])
test

out = df_cleaning(test, 'review_body')
out = df_cleaning(out, 'review_title')
out = df_cleaning(out, 'product_title')
out

out['review_body'][0]

def get_sentiment_proportions(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    #print(f"{sentence} {str(snt)}")
    neg = snt['neg']
    neu = snt['neu']
    pos = snt['pos']
    #compound = snt['compound']
    return neg, neu, pos

neg, neu, pos = get_sentiment_proportions(out.get("new_review_body").iloc[0])

product_category = convert_to_id(product_category)
product_title = only_compound(product_title)
rev_title = only_compound(review_title)

rev_bod_id = id_for_dictionary(analyser.polarity_scores(out['new_review_body'][0]))
help_prop_id = id_for_prop(helpful_votes / total_votes)
prod_title_comp = only_compound(out['new_review_title'][0])
rev_title_comp = only_compound(out['new_product_title'][0])

Predicted: 'verified_purchase'

User Input: 'prod_title_comp', 'product_category_convert', 'star_rating', 'helpful_votes', 'total_votes', 'neg_prop', 'neu_prop', 'pos_prop'
- 8 fields

rev_input_test = np.array([[prod_title_comp, star_rating, rev_title_comp, rev_bod_id, help_prop_id]])
rev_input_test

prediction, probabilities = knn_classifier.predict(rev_input_test), knn_classifier.predict_proba(rev_input_test)[0]

prediction

probabilities

classifier?

def interpret_prediction(review, pred, proba):
    proba = [round(proba[0], 3), round(proba[1], 3)]
    if prediction[0] == 'Y':
        print(f'"{review}" is predicted to be a VERIFIED review, with {proba[1]*100}% probability of being VERIFIED and {proba[0]*100}% probability of being UNVERIFIED')
    if prediction[0] == 'N':
        print(f'"{review}" is predicted to be an UNVERIFIED review, with {proba[0]*100}% probability of being UNVERIFIED and {proba[1]*100}% probability of being VERIFIED')
        
interpret_prediction(review_test, prediction, probabilities)

from joblib import dump, load

knn_classifier

name = 'knn_working_model.joblib'
path = 'KNNModelFiles/'
dump(knn_classifier, path+name)

knn_classifier = load(path+name)

