In [1]:
# import libraries

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# These are all the imports needed for the assignment
%matplotlib inline

# Import nltk package (Natural Language Toolkit)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [2]:
# Download the NLTK English tokenizer and the stopwords of all languages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
# Personal_Care_Appliances

In [5]:
columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', \
           'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
elec_df = pd.read_csv("C:/Users/19495/DS3ProjectFiles/amazon_reviews_us_Beauty_v1_00.tsv.gz", names =  columns, sep = '\t').iloc[1:,:]

In [6]:
elec_df = elec_df.sample(n = 1_000_000)

In [7]:
one_file = elec_df.copy()

## Data Sampling

In [8]:
def df_sampling(df):
    # Since we know that there are more unverified than verified --> we sample based on that
    
    # Since there are no data values in 'verified_purchase' columns that deviate from 'Y' or 'N' we proceed
    verified_count_df = df[df['verified_purchase'] == 'Y']
    unverified_count_df = df[df['verified_purchase'] == 'N']
    
    print("Number of verified purchases:", len(verified_count_df))
    print("Number of unverified purchases:", len(unverified_count_df))
    
    sample_len = len(unverified_count_df)
    
    verified_sample_df = verified_count_df.sample(n = sample_len)
    unified_df = pd.concat([unverified_count_df, verified_sample_df])
    
    print("Number of verified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'Y']))
    print("Number of unverified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'N']))
    
    return unified_df

In [9]:
balanced_elec = df_sampling(one_file)
display(balanced_elec.head(2))

Number of verified purchases: 826615
Number of unverified purchases: 173377
Number of verified purchases (balanced dataset): 173377
Number of unverified purchases (balanced dataset): 173377


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
2392747,US,11863580,R1JQNN06UHOWMF,B00DVMYZX2,367142670,"White Beeswax Bees Wax Pastilles Beads Premium Prime Grade A 100% Pure 16 oz, 1 LB",Beauty,5,0,0,N,N,Five Stars,thanks,2014-09-17
1892320,US,41247587,R288YNF5VNK6CC,B005X2F7JY,367161615,SHANY Cosmetics Nail Lquer Set,Beauty,5,0,0,N,N,Five Stars,I love the nail polishes they are so pretty and affordable,2014-12-17


In [10]:
print("The Beauty sample will have size:", balanced_elec.shape)

The Beauty sample will have size: (346754, 15)


## Data Cleaning & Type Conversion

In [11]:
#one_file = balanced_elec.copy()

In [12]:
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 826615
Number of unverified purchases (balanced dataset): 173377


In [13]:
# Star rating of the format '2015-04-02' exists! In addition, np.nan exists in the helpful and total votes
one_file['star_rating'].value_counts()

5             509320
5             126601
4             118715
1              73092
3              63766
2              42572
4              26128
1              16320
3              14113
2               9365
2015-06-02         1
2015-03-18         1
2015-08-16         1
2014-10-09         1
2015-03-31         1
2015-04-09         1
2015-04-14         1
Name: star_rating, dtype: int64

In [14]:
# Convert the type of relevant columns to int
one_file['star_rating'] = (one_file['star_rating']).apply(lambda star: np.NaN if (len(str(star)) > 3) else star)
one_file['star_rating'] = one_file['star_rating'].apply(lambda star: star if (pd.isna(star)) else int(star))
one_file['helpful_votes'] = one_file['helpful_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(vote))
one_file['total_votes'] = one_file['total_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(int(vote)))

In [15]:
import re

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def df_cleaning(df, col):
    # Drop rows with na values
    df.dropna(inplace = True)
    
    new_col_name = 'new_' + col
    
    df[new_col_name] = df[col].copy() 
    
    # Remove unwanted formatting characters
    format_strs = dict.fromkeys(['<br /><br />', '&#34', 'br', '&quot', '<br />'], ' ')
    
    for key in format_strs:
        df[new_col_name] = df[new_col_name].apply(lambda review: review.replace(key, format_strs[key]))
    # removing quotes produces smthg like this --> 'The product has great ;sound; --> we must remove punctuation

    
    # Case normalization (lower case)
    df[new_col_name] = df[new_col_name].str.lower()
    
    remove_dict = {"0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "",
                   "(": "", ")":""}
    for key, val in remove_dict.items():
        df[new_col_name] = df[new_col_name].apply(
            lambda x: x.replace(key, val))
        
    # Remove stopwords
    stop_lst = stopwords.words('english')
    #stop_lst += (["can't", "i'm" "i'd", "i've", "i'll", "that's", "there's", "they're"])
    # ****Do we not have to take stopwords out BEFORE removing punctuation? Otherwise words with punct like “cant” remains there
    df[new_col_name] = df[new_col_name].apply(lambda text_body: " ".join([word for word in text_body.split() if word not in (stop_lst)]))
    
    # Removing Unicode Chars (punctuation, URL, @)
    df[new_col_name] = df[new_col_name].apply(lambda rev: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", rev))
    
    # Lemmatization
    word_lemmatizer = WordNetLemmatizer()
    df[new_col_name] = df[new_col_name].apply(lambda txt: " ".join([(word_lemmatizer.lemmatize(word)) for word in txt.split()]))
    
    # Convert the type of relevant columns to int
    df['star_rating'] = (df['star_rating']).apply(lambda star: np.NaN if (len(str(star)) > 3) else star)
    df['star_rating'] = df['star_rating'].apply(lambda star: star if (pd.isna(star)) else int(star))
    df['helpful_votes'] = df['helpful_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(vote))
    df['total_votes'] = df['total_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(int(vote)))
    
    return df

In [17]:
cleaned2 = df_cleaning(one_file, 'review_body')
cleaned2.get(['review_body', 'new_review_body']).head()

Unnamed: 0,review_body,new_review_body
2392747,thanks,thanks
1892320,I love the nail polishes they are so pretty and affordable,love nail polish pretty affordable
1825907,This diffuser is AWESOME! It has worked perfectly. Would definitely recommend.,diffuser awesome worked perfectly would definitely recommend
552940,I was over enthusiastic about using it and rubed too hard on my skin. Ouch. And it took forever to remove my thick dark sexy leg hairs :-),enthusiastic using rubed hard skin ouch took forever remove thick dark sexy leg hair
4543972,"When I placed my order, I was a little anxious bc of all the good reviews and research I've done...i couldn't wait to use it and see the results...when I got the night cream and foam wash within a week I was very impress and gave 5 star rating to my seller...right away that night, I used it and kept my fingers crossed BC im tired of looking for products to help the crazy and over productive worst case of dark spots all over my oily young Asian acne skin..when I got up the next morning, for the first time, my boyfriend said\\"" your skin is so soft\\""... I Ran to the bathroom and no breakouts..but surprisingly the dark spots got one shade lighter..at this point my smile is from ear to ear..I know everybody is different and products work differently for everybody but I wanted to share this review BC It works for me and it may not work for the next person but don't feel discourage cause there is something out there for you..just keep looking and wear a lot of sun screen",placed order little anxious bc good review research ive donei wait use see resultswhen got night cream foam wash within week impress gave star rating sellerright away night used kept finger crossed bc im tired looking product help crazy productive worst case dark spot oily young asian acne skinwhen got next morning first time boyfriend said skin soft ran bathroom eakoutsbut surprisingly dark spot got one shade lighterat point smile ear eari know everybody different product work differently everybody wanted share review bc work may work next person feel discourage cause something youjust keep looking wear lot sun screen


In [18]:
cleaned2 = df_cleaning(one_file, 'review_headline')
cleaned2.get(['review_headline', 'new_review_headline']).head()

Unnamed: 0,review_headline,new_review_headline
2392747,Five Stars,five star
1892320,Five Stars,five star
1825907,Five Stars,five star
552940,I was over enthusiastic about using it and rubed too ...,enthusiastic using rubed
4543972,impress,impress


In [19]:
cleaned2 = df_cleaning(one_file, 'product_title')
cleaned2.get(['product_title', 'new_product_title']).head()

Unnamed: 0,product_title,new_product_title
2392747,"White Beeswax Bees Wax Pastilles Beads Premium Prime Grade A 100% Pure 16 oz, 1 LB",white beeswax bee wax pastille bead premium prime grade pure oz lb
1892320,SHANY Cosmetics Nail Lquer Set,shany cosmetic nail lquer set
1825907,ZAQ Noor Multi Color Litemist Aromatherapy Essential Oil Diffuser,zaq noor multi color litemist aromatherapy essential oil diffuser
552940,"Soften Her - Soften Body Hair, Exfoliate, and Prevent In-grown Hairs",soften soften body hair exfoliate prevent ingrown hair
4543972,Pond's Flawless White Brightening Night Cream 50 grams,pond flawless white brightening night cream gram


In [20]:
print("Number of verified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 826526
Number of unverified purchases (balanced dataset): 173363


## Vader Sentiment Analysis

In [21]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def get_sentiment_scores(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    return snt

In [22]:
one_file = cleaned2.copy()

In [23]:
# Add 4 new columns to cleaned_elec df: neg_prop, neu_prop, pos_prop, compound_prop
one_file['rev_dict'] = one_file['new_review_body'].apply(get_sentiment_scores)

def get_neg(review_dict):
    return review_dict['neg']

def get_neu(review_dict):
    return review_dict['neu']

def get_pos(review_dict):
    return review_dict['pos']

def get_compound(review_dict):
    return review_dict['compound']

def only_compound(x):
    dct = get_sentiment_scores(x)
    return dct['compound']

In [24]:
#get neg prop
one_file['neg_prop'] = one_file['rev_dict'].apply(get_neg)
#get neu prop
one_file['neu_prop'] = one_file['rev_dict'].apply(get_neu)
#get pos prop
one_file['pos_prop'] = one_file['rev_dict'].apply(get_pos)
#get compound prop
one_file['compound_prop'] = one_file['rev_dict'].apply(get_compound)

In [49]:
#one_file.to_csv('beauty_data_cleaned.csv')

In [26]:
one_file.head()
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 826526
Number of unverified purchases (balanced dataset): 173363


## Confusion Matrix

In [27]:
def plot_confusion_matrix(cm, target_names,
                          fname, epoch,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True, target=None):
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools
    plt.style.use('default')

    # # only true if it weren't normalized:
    # accuracy = np.trace(cm) / float(np.sum(cm))
    # misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm[np.isnan(cm)] = 0.0

    fig = plt.figure(figsize=(5, 4))
    ax = plt.axes()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    if target == "rule-based":
        plt.title(title + ' for rule-based PF')
    else:
        plt.title(title + ' for MLPF at epoch ' + str(epoch))

    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlim(-1, len(target_names))
    plt.ylim(-1, len(target_names))
    plt.xlabel('Predicted label')
    # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.tight_layout()
    plt.savefig(fname + '.png')
    plt.savefig(fname + '.pdf')
    #plt.close(fig)

    return fig, ax

## KNN

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [29]:
one_file.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'new_review_body',
       'new_review_headline', 'new_product_title', 'rev_dict', 'neg_prop',
       'neu_prop', 'pos_prop', 'compound_prop'],
      dtype='object')

In [30]:
one_file['prod_title_comp'] = one_file.get("new_product_title").apply(only_compound)
one_file['rev_title_comp'] = one_file.get("new_review_headline").apply(only_compound)

In [31]:
def helpful_prop(df):  
    vote_series = pd.Series(df['helpful_votes'] / df['total_votes'])
    # !! All nan values in votes_prop should be changed to zero: this means that 0/0 occurred !!
    vote_series = vote_series.fillna(0)
    return vote_series

In [32]:
# Helper function for id'ing the review body sentiments into -1, 0, 1 depending on majority sentiment
def id_for_dictionary(dic):
    if len(dic) == 4:
        ind = list(dic.values()).index(max(list(dic.values())[0:-1])) #remove the compound
    else:
        ind = list(dic.values()).index(max(list(dic.values())))
    
    # If at idx 1, neutral sent
    if ind == 1:
        return 0
    # If at idx 0, negative sent
    elif ind == 0:
        return -1
    else:
        return 1

In [33]:
# Helper function for id'ing the helper vote proportion
def id_for_prop(prop):
    if prop < 0.45:
        return -1
    elif prop > 0.55:
        return 1
    else:
        return 0

In [34]:
one_file['help_prop'] = helpful_prop(one_file)
one_file['rev_bod_id'] = one_file['rev_dict'].apply(id_for_dictionary)
one_file['help_prop_id'] = one_file["help_prop"].apply(id_for_prop)
one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,new_review_body,new_review_headline,new_product_title,rev_dict,neg_prop,neu_prop,pos_prop,compound_prop,prod_title_comp,rev_title_comp,help_prop,rev_bod_id,help_prop_id
2392747,US,11863580,R1JQNN06UHOWMF,B00DVMYZX2,367142670,"White Beeswax Bees Wax Pastilles Beads Premium Prime Grade A 100% Pure 16 oz, 1 LB",Beauty,5,0,0,N,N,Five Stars,thanks,2014-09-17,thanks,five star,white beeswax bee wax pastille bead premium prime grade pure oz lb,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4404}",0.0,0.0,1.0,0.4404,0.0,0.0,0.0,1,-1
1892320,US,41247587,R288YNF5VNK6CC,B005X2F7JY,367161615,SHANY Cosmetics Nail Lquer Set,Beauty,5,0,0,N,N,Five Stars,I love the nail polishes they are so pretty and affordable,2014-12-17,love nail polish pretty affordable,five star,shany cosmetic nail lquer set,"{'neg': 0.0, 'neu': 0.288, 'pos': 0.712, 'compound': 0.8126}",0.0,0.288,0.712,0.8126,0.0,0.0,0.0,1,-1
1825907,US,45633542,R33VSPETWJ8G4U,B008RP6GGC,333805416,ZAQ Noor Multi Color Litemist Aromatherapy Essential Oil Diffuser,Beauty,5,0,0,N,Y,Five Stars,This diffuser is AWESOME! It has worked perfectly. Would definitely recommend.,2014-12-28,diffuser awesome worked perfectly would definitely recommend,five star,zaq noor multi color litemist aromatherapy essential oil diffuser,"{'neg': 0.0, 'neu': 0.182, 'pos': 0.818, 'compound': 0.926}",0.0,0.182,0.818,0.926,0.0,0.0,0.0,1,-1
552940,US,13751569,R1RBR7HLQ83WDH,B00KQTDLGK,750897889,"Soften Her - Soften Body Hair, Exfoliate, and Prevent In-grown Hairs",Beauty,3,2,2,N,Y,I was over enthusiastic about using it and rubed too ...,I was over enthusiastic about using it and rubed too hard on my skin. Ouch. And it took forever to remove my thick dark sexy leg hairs :-),2015-06-17,enthusiastic using rubed hard skin ouch took forever remove thick dark sexy leg hair,enthusiastic using rubed,soften soften body hair exfoliate prevent ingrown hair,"{'neg': 0.074, 'neu': 0.579, 'pos': 0.347, 'compound': 0.7351}",0.074,0.579,0.347,0.7351,0.0258,0.4939,1.0,0,1
4543972,US,23873619,RZPMJ3B8G65NZ,B003OBE530,965340858,Pond's Flawless White Brightening Night Cream 50 grams,Beauty,5,7,10,N,Y,impress,"When I placed my order, I was a little anxious bc of all the good reviews and research I've done...i couldn't wait to use it and see the results...when I got the night cream and foam wash within a week I was very impress and gave 5 star rating to my seller...right away that night, I used it and kept my fingers crossed BC im tired of looking for products to help the crazy and over productive worst case of dark spots all over my oily young Asian acne skin..when I got up the next morning, for the first time, my boyfriend said\\"" your skin is so soft\\""... I Ran to the bathroom and no breakouts..but surprisingly the dark spots got one shade lighter..at this point my smile is from ear to ear..I know everybody is different and products work differently for everybody but I wanted to share this review BC It works for me and it may not work for the next person but don't feel discourage cause there is something out there for you..just keep looking and wear a lot of sun screen",2012-06-23,placed order little anxious bc good review research ive donei wait use see resultswhen got night cream foam wash within week impress gave star rating sellerright away night used kept finger crossed bc im tired looking product help crazy productive worst case dark spot oily young asian acne skinwhen got next morning first time boyfriend said skin soft ran bathroom eakoutsbut surprisingly dark spot got one shade lighterat point smile ear eari know everybody different product work differently everybody wanted share review bc work may work next person feel discourage cause something youjust keep looking wear lot sun screen,impress,pond flawless white brightening night cream gram,"{'neg': 0.119, 'neu': 0.752, 'pos': 0.129, 'compound': 0.0591}",0.119,0.752,0.129,0.0591,0.7783,0.4404,0.7,0,1


In [35]:
imp_col = one_file[['verified_purchase', 'prod_title_comp', 'star_rating', 'rev_title_comp', 'rev_bod_id', 'help_prop_id']]
imp_col.dtypes

verified_purchase     object
prod_title_comp      float64
star_rating            int64
rev_title_comp       float64
rev_bod_id             int64
help_prop_id           int64
dtype: object

In [36]:
imp_col = imp_col.dropna()

In [37]:
X = imp_col.iloc[:, [1,2,3,4,5]].values #only taking in the categories that will be used as a dataframe
y = imp_col.iloc[:, 0].values

In [38]:
X

array([[ 0.    ,  5.    ,  0.    ,  1.    , -1.    ],
       [ 0.    ,  5.    ,  0.    ,  1.    , -1.    ],
       [ 0.    ,  5.    ,  0.    ,  1.    , -1.    ],
       ...,
       [ 0.2023,  5.    ,  0.    ,  1.    , -1.    ],
       [ 0.    ,  5.    ,  0.6249,  0.    , -1.    ],
       [ 0.    ,  5.    ,  0.    ,  1.    , -1.    ]])

In [39]:
print("Number of verified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 826526
Number of unverified purchases (balanced dataset): 173363


In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [41]:
# Next, we are doing feature scaling to the training and test set of independent variables for reducing the size to smaller values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [43]:
from sklearn.neighbors import KNeighborsClassifier

#we are using 
#5 neighborhood points are required for classifying a given point -- distance metric is using the minkonowski equation
knn_classifier = KNeighborsClassifier(n_neighbors = 20, metric = 'euclidean', p = 1)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=1,
                     weights='uniform')

In [44]:
y_pred = knn_classifier.predict(X_test)

In [45]:
#We can evaluate our model using the confusion matrix and accuracy score by comparing the predicted and actual test values

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [46]:
print(cm)
print(ac)

[[   775  34079]
 [  1465 163659]]
0.8222604486493514


In [None]:

#Can see the model performance and add more features accordingly -- 
#would be good if the performance is greater than 85%

### Test on a product review - Need to write a *function* for this for taking in the user input

The features that we are looking at!
* 'prod_title_comp', 
* 'star_rating', 
* 'rev_title_comp', 
* 'rev_bod_id', 
* 'help_prop_id'

review_body = "this is a good review"
product_title = "Sony Headphones"
review_title = 'Love the product!'

star_rating = 5
helpful_votes = 1
total_votes = 1

test = pd.DataFrame()
test['review_body'] = np.array([review_body])
test['review_title'] = np.array([review_title])
test['product_title'] = np.array([product_title])
test

out = df_cleaning(test, 'review_body')
out = df_cleaning(out, 'review_title')
out = df_cleaning(out, 'product_title')
out

out['review_body'][0]

def get_sentiment_proportions(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    #print(f"{sentence} {str(snt)}")
    neg = snt['neg']
    neu = snt['neu']
    pos = snt['pos']
    #compound = snt['compound']
    return neg, neu, pos

neg, neu, pos = get_sentiment_proportions(out.get("new_review_body").iloc[0])

product_category = convert_to_id(product_category)
product_title = only_compound(product_title)
rev_title = only_compound(review_title)

rev_bod_id = id_for_dictionary(analyser.polarity_scores(out['new_review_body'][0]))
help_prop_id = id_for_prop(helpful_votes / total_votes)
prod_title_comp = only_compound(out['new_review_title'][0])
rev_title_comp = only_compound(out['new_product_title'][0])

Predicted: 'verified_purchase'

User Input: 'prod_title_comp', 'product_category_convert', 'star_rating', 'helpful_votes', 'total_votes', 'neg_prop', 'neu_prop', 'pos_prop'
- 8 fields

rev_input_test = np.array([[prod_title_comp, star_rating, rev_title_comp, rev_bod_id, help_prop_id]])
rev_input_test

prediction, probabilities = knn_classifier.predict(rev_input_test), knn_classifier.predict_proba(rev_input_test)[0]

prediction

probabilities

classifier?

def interpret_prediction(review, pred, proba):
    proba = [round(proba[0], 3), round(proba[1], 3)]
    if prediction[0] == 'Y':
        print(f'"{review}" is predicted to be a VERIFIED review, with {proba[1]*100}% probability of being VERIFIED and {proba[0]*100}% probability of being UNVERIFIED')
    if prediction[0] == 'N':
        print(f'"{review}" is predicted to be an UNVERIFIED review, with {proba[0]*100}% probability of being UNVERIFIED and {proba[1]*100}% probability of being VERIFIED')
        
interpret_prediction(review_test, prediction, probabilities)

from joblib import dump, load

knn_classifier

name = 'knn_working_model.joblib'
path = 'KNNModelFiles/'
dump(knn_classifier, path+name)

knn_classifier = load(path+name)

