In [None]:
from joblib import dump, load

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re

from datetime import datetime
from time import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, pairwise_distances
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import spacy

In [None]:
# Product category; it is interpolated into the model and bigram file names below.
category = 'Electronics'
# NOTE(review): absolute, machine-specific directory -- the notebook only runs
# where this exact folder exists.
ML_model_file_path = "C:/Users/tsaie/OneDrive/Desktop/000 Resumes & Projects/# Projects/DS3 Fake Amazon Reviews/ML Models/"
ML_model_file_name = f'05_05_2022 rf_model {(category)}.joblib'
# joblib.load unpickles the file, which can execute arbitrary code: only load
# model files that come from a trusted source.
rf_model = load(ML_model_file_path + ML_model_file_name) 


# Directory holding the per-category bigram tables (same portability caveat as above).
bigram_file_path = "C:/Users/tsaie/OneDrive/Desktop/000 Resumes & Projects/# Projects/DS3 Fake Amazon Reviews/Bigrams/"

# "Gold" bigram table; the first CSV column is used as the index. Show 3 rows as a sanity check.
gold_bigram_file_name = f'{(category)} gold_bigrams.csv'
gold_bigram_df = pd.read_csv(bigram_file_path + gold_bigram_file_name, index_col=0)
display(gold_bigram_df.head(3))

# "Fake" bigram table, loaded and previewed the same way.
fake_bigram_file_name = f'{(category)} fake_bigrams.csv'
fake_bigram_df = pd.read_csv(bigram_file_path + fake_bigram_file_name, index_col=0)
display(fake_bigram_df.head(3))

In [None]:
def data_cleaning(df):
    """Build a normalised ``review_body_cleaned`` column from ``review_body``.

    Stages, in order (a progress banner is printed after most of them):

    1. Drop every row that has a missing value in any column.
    2. Delete URL-like text and replace a fixed list of HTML remnants.
    3. Keep only word-character tokens (which drops punctuation), then delete
       ASCII digits and parentheses.
    4. Lowercase the text.
    5. Remove NLTK English stop words.
    6. Replace each token by its spaCy lemma (``en_core_web_sm`` with the
       parser and NER components disabled).

    Args:
        df: DataFrame with a ``review_body`` column of strings.

    Returns:
        The same ``df`` object. It is modified in place: rows with missing
        values are dropped and ``review_body_cleaned`` is added/overwritten.
    """
    column = 'review_body_cleaned'

    # Removing empty cells (in place, so the caller's frame is affected too).
    df.dropna(inplace=True)
    df[column] = df['review_body'].copy()

    # URL-like text: "scheme://..." tokens, a leading "rt", and "http" plus the
    # single character after it (the trailing ".+?" is lazy, so it matches one char).
    url_pattern = re.compile(r"(\w+:\/\/\S+)|^rt|http.+?")

    # HTML remnants, applied in this order: the double "<br />" must be
    # handled before the single one, and both before the bare "br ".
    html_replacements = {"<br /><br />": " ", "<br />": " ", "br ": "", "&quot;": " ", "&#34": " ",
                         "<BR>": " ", "_": ""}

    def _strip_urls_and_html(text):
        text = url_pattern.sub("", text)
        for needle, replacement in html_replacements.items():
            text = text.replace(needle, replacement)
        return text

    # One pass over the column instead of one pass per replacement.
    df[column] = df[column].apply(_strip_urls_and_html)

    print("\n######## Remove URL and HTML Keywords Complete ########")

    # Remove punctuation (tokenise on word characters and re-join with single
    # spaces), then delete digits and parentheses in a single translate() call.
    tokenizer = RegexpTokenizer(r'\w+')
    digits_and_parens = str.maketrans('', '', '0123456789()')
    df[column] = df[column].apply(
        lambda text: ' '.join(tokenizer.tokenize(text)).translate(digits_and_parens))

    print("\n######## Remove Punctuation and Numbers Complete ########")

    # Lowercase words
    df[column] = df[column].str.lower()

    print("\n######## Lowercase Complete ########")

    # Remove stop words. A set gives constant-time membership tests; the
    # original list was scanned once per token.
    stop_words = set(stopwords.words('english'))
    df[column] = df[column].apply(
        lambda text: ' '.join(word for word in text.split() if word.strip() not in stop_words))

    print("\n######## Remove Stop Words Complete ########")

    # Lemmatization using .lemma_
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    df[column] = df[column].apply(
        lambda text: ' '.join(token.lemma_ for token in nlp(text)))

    print("\n######## Data Cleaning Complete ########")

    return df



from collections import Counter
from nltk import ngrams
from itertools import chain

def find_ngrams(input_list, n):
    """Return every run of ``n`` consecutive items of ``input_list`` as a tuple.

    The result is a list of tuples in left-to-right order. It is empty when
    ``n`` is not positive or when the input has fewer than ``n`` items.
    """
    if n <= 0:
        return []
    window_starts = range(len(input_list) - n + 1)
    return [tuple(input_list[start:start + n]) for start in window_starts]

def add_bigram_column(df):
    """Return a copy of ``df`` with a ``bigrams`` column added.

    Each ``review_body_cleaned`` string is split on whitespace and turned into
    its list of 2-grams. The input frame itself is not modified.
    """
    def _to_bigrams(cleaned_text):
        return find_ngrams(cleaned_text.split(), 2)

    with_bigrams = df.copy()
    with_bigrams['bigrams'] = with_bigrams['review_body_cleaned'].map(_to_bigrams)
    return with_bigrams




def get_bigram_count_percent(bigrams, bigram_dict):
    """Fraction of ``bigrams`` that are keys of ``bigram_dict``.

    Repeated bigrams are counted each time they occur. Returns 0 when
    ``bigrams`` is empty.
    """
    total = len(bigrams)
    if not total:
        return 0

    hits = sum(1 for gram in bigrams if gram in bigram_dict)
    return hits / total

def get_bigram_normalized_score(bigrams, bigram_dict):
    """Sum of ``bigram_dict`` values over ``bigrams``, divided by ``len(bigrams)``.

    Bigrams missing from ``bigram_dict`` contribute nothing, and repeated
    bigrams contribute once per occurrence. Returns 0 when ``bigrams`` is empty.
    """
    total = len(bigrams)
    if not total:
        return 0

    weight = sum(bigram_dict[gram] for gram in bigrams if gram in bigram_dict)
    return weight / total

def get_bigram_unique_count_percent(bigrams, bigram_dict, the_other_bigram_dict, unique_threshold=0):
    """Fraction of ``bigrams`` that are not "common" in ``the_other_bigram_dict``.

    A bigram is excluded when it is a key of ``the_other_bigram_dict`` with a
    value greater than ``unique_threshold``. Every other bigram counts towards
    the numerator. Returns 0 when ``bigrams`` is empty.

    NOTE(review): ``bigram_dict`` is accepted but never consulted, so the
    result does not depend on whether a bigram appears in it. The earlier
    comment described "in bigram_dict but not in the other dict"; confirm
    which behaviour is intended before changing it.
    """
    total = len(bigrams)
    if not total:
        return 0

    shared = sum(
        1
        for gram in bigrams
        if gram in the_other_bigram_dict and the_other_bigram_dict[gram] > unique_threshold
    )
    return (total - shared) / total




def _bigram_df_to_lookups(bigram_df, min_count=2):
    """Convert a bigram-count table into (bigram list, count dict, filtered dict).

    ``bigram_df['bigram']`` holds strings such as ``"('good', 'product')"``.
    The two words are recovered by splitting on single quotes, so a word that
    itself contains a single quote would not be parsed correctly.
    ``bigram_df['count']`` supplies the value for each bigram; if a bigram
    occurs more than once, the last row wins. The filtered dict keeps only
    entries whose count is at least ``min_count``.
    """
    def _parse(text):
        pieces = text.split("'")
        return (pieces[1], pieces[3])

    bigrams = [_parse(text) for text in bigram_df['bigram'].to_list()]
    counts_by_bigram = dict(zip(bigrams, bigram_df['count'].to_numpy()))
    frequent = {gram: count for gram, count in counts_by_bigram.items() if count >= min_count}
    return bigrams, counts_by_bigram, frequent


def gold_bigram_df_to_vars(gold_bigram_df):
    """Populate the module-level gold bigram lookups from ``gold_bigram_df``.

    Sets three globals: ``gold_bigrams`` (list of word-pair tuples),
    ``gold_bigram_dict`` (tuple -> count) and ``gold_bigram_dict_filtered``
    (the same mapping restricted to counts >= 2). Returns None.
    """
    global gold_bigrams, gold_bigram_dict, gold_bigram_dict_filtered
    gold_bigrams, gold_bigram_dict, gold_bigram_dict_filtered = _bigram_df_to_lookups(gold_bigram_df)


def fake_bigram_df_to_vars(fake_bigram_df):
    """Populate the module-level fake bigram lookups from ``fake_bigram_df``.

    Sets three globals: ``fake_bigrams`` (list of word-pair tuples),
    ``fake_bigram_dict`` (tuple -> count) and ``fake_bigram_dict_filtered``
    (the same mapping restricted to counts >= 2). Returns None.
    """
    global fake_bigrams, fake_bigram_dict, fake_bigram_dict_filtered
    fake_bigrams, fake_bigram_dict, fake_bigram_dict_filtered = _bigram_df_to_lookups(fake_bigram_df)
    
    
    
    
def clean_review_and_add_features(review, gold_bigram_df, fake_bigram_df):
    """Turn one raw review string into a one-row feature DataFrame.

    Args:
        review: Raw review text.
        gold_bigram_df: Bigram-count table passed to ``gold_bigram_df_to_vars``.
        fake_bigram_df: Bigram-count table passed to ``fake_bigram_df_to_vars``.

    Returns:
        DataFrame with ``review_body``, ``review_body_cleaned``, ``bigrams``,
        ``bigram_count`` and, for each of ``gold`` and ``fake``, the columns
        ``<kind>_bigram_percent``, ``<kind>_bigram_unique_percent`` and
        ``<kind>_bigram_normalized_score``.

    Side effect: the two ``*_bigram_df_to_vars`` calls (re)assign the
    module-level bigram lists and dictionaries.
    """
    # Create dataframe for the singular review
    df = pd.DataFrame(data={'review_body': review}, index=[0])

    # Build the gold and fake bigram lookups used to score the user input.
    # Both helpers publish their results as module-level globals.
    gold_bigram_df_to_vars(gold_bigram_df)
    fake_bigram_df_to_vars(fake_bigram_df)

    # Clean the user input
    df = data_cleaning(df)
    df['bigrams'] = df['review_body_cleaned'].map(lambda text: find_ngrams(text.split(), 2))
    df['bigram_count'] = df['bigrams'].apply(len)

    # Explicit lookup table instead of exec() on generated source: the names
    # each feature depends on are now visible to readers and tooling.
    lookups = {
        'gold': (gold_bigram_dict, gold_bigram_dict_filtered),
        'fake': (fake_bigram_dict, fake_bigram_dict_filtered),
    }

    for kind, (all_counts, frequent_counts) in lookups.items():
        df[f'{kind}_bigram_percent'] = df['bigrams'].apply(
            lambda grams: get_bigram_count_percent(grams, frequent_counts))

        # NOTE(review): the same dictionary is passed as both ``bigram_dict``
        # and ``the_other_bigram_dict``, so this is the share of bigrams NOT
        # found in this kind's own dictionary. That looks unintended (the
        # opposite kind's dictionary would be expected as the "other" one),
        # but it is kept as-is because the loaded model was presumably
        # trained on features computed this way -- confirm against the
        # training pipeline before changing it.
        df[f'{kind}_bigram_unique_percent'] = df['bigrams'].apply(
            lambda grams: get_bigram_unique_count_percent(grams, all_counts, all_counts, 0))

        df[f'{kind}_bigram_normalized_score'] = df['bigrams'].apply(
            lambda grams: get_bigram_normalized_score(grams, all_counts))

    return df

## Try changing the review and see what the RandomForest model predicts!

In [None]:
# Free-text review to classify; edit this string to try other inputs.
review = "I bought this for my son. He loves it however the buttons do not work. He has to use the remote to control it. The music sounds great."

# Clean the user input and build a one-row feature dataframe for it
# (this also refreshes the module-level gold/fake bigram lookups).
user_input_processed_df = clean_review_and_add_features(review, gold_bigram_df, fake_bigram_df)
display(user_input_processed_df)

def prepare_df_for_prediction(processed_df):
    """Return a copy of ``processed_df`` restricted to the model's feature columns.

    The columns are selected in the fixed order listed below; the input frame
    is left untouched.
    """
    model_features = [
        'bigram_count',
        'fake_bigram_percent',
        'fake_bigram_unique_percent',
        'fake_bigram_normalized_score',
        'gold_bigram_percent',
        'gold_bigram_unique_percent',
        'gold_bigram_normalized_score',
    ]
    snapshot = processed_df.copy()
    return snapshot[model_features]

# Keep only the model's feature columns, then ask the loaded model for the
# predicted class array and for the class-probability row of the first
# (and only) review.
df_for_prediction = prepare_df_for_prediction(user_input_processed_df)
prediction, probabilities = rf_model.predict(df_for_prediction), rf_model.predict_proba(df_for_prediction)[0]

def interpret_prediction(review, pred, proba):
    """Print a one-sentence, human-readable verdict for a single review.

    Args:
        review: The review text, echoed in the message.
        pred: Sequence whose first element is the predicted class
            (1 is reported as VERIFIED, 0 as UNVERIFIED).
        proba: Two-element sequence; ``proba[0]`` is reported as the
            UNVERIFIED probability and ``proba[1]`` as the VERIFIED one.

    Nothing is printed when ``pred[0]`` is neither 0 nor 1. Returns None.
    """
    # Scale first and round afterwards so the printed percentages carry no
    # float noise (round(0.57, 3) * 100 would print 56.99999999999999).
    unverified_pct = round(float(proba[0]) * 100, 1)
    verified_pct = round(float(proba[1]) * 100, 1)

    # Use the ``pred`` argument; the earlier body read the notebook-level
    # ``prediction`` variable and ignored what the caller passed in.
    label = pred[0]
    if label == 1:
        print(f'"{review}" is predicted to be a VERIFIED review, with {verified_pct}% probability of being VERIFIED and {unverified_pct}% probability of being UNVERIFIED')
    elif label == 0:
        print(f'"{review}" is predicted to be an UNVERIFIED review, with {unverified_pct}% probability of being UNVERIFIED and {verified_pct}% probability of being VERIFIED')
        
# Print the human-readable verdict for the review entered above.
interpret_prediction(review, prediction, probabilities)

# Gold & Fake Reviews from the Datasets

## Electronics

### Gold:

I bought this for my son. He loves it however the buttons do not work. He has to use the remote to control it. The music sounds great.

The lamp was easy to replace and the colors were vivids. Soooo worth it.!!

Bought it for my cousin. Good price.

Total dud. Dont waste your money. Connected it right out of the package and nothing. DOA on arrival. look else where.

Unit basically fell apart when we opened the package. Would not recommend this product.

It solved a big problem I had with multiple devices being plugged into power strips. I know have one outlet that I can plug in all my power cords into and 4 usb outlets to plug my usb charge cords into. I liked it so much I bought two more.

Worked as advertised. Thanks.

### Fake:

Bought these a year ago and being an active traveler they have held up pretty well. Comfortable for long flights and still have 100% noise cancellation

Enjoy crystal clear clarity sound with this product! Super great quality! Highly recommended! Would make a perfect gift for anyone or for yourself!!

Happy For about 4 weeks using these in the gym. The tips then lost the sponginess and then teared over time. I kept my headphones in a case and yet they wear out when you use them after a few weeks.

I bought this as a gift, and the recipient loves it.

Nearly impossible to set time and alarm. Not intuitive, and the buttons just don't work. Too bright. Basically junk.

Shockingly good speaker. It works very well depending on your device. For loud environments it works very well if it is next to you. I will give it a 8.5 for sound quality out of 10 and a 10 for build quality. 8.5 for sound because bose is the best sorry. If you never heard bose you will think this is the best speaker you ever heard. The bass is really good and detailed but not overly loud witch is really good. Good highs and mid levels the vocals are very detailed and clear. It is definitely worth the investment. You can't go wrong with this speaker