In [None]:
#!pip install spacy

In [None]:
#!python -m spacy download en_core_web_sm

In [None]:
import warnings
warnings.filterwarnings('ignore')

# From Example Code  https://github.com/Swathiu/Detecting-Fake-Reviews/blob/master/Deception_Detection.py
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re

from datetime import datetime
from time import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, pairwise_distances
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import spacy

In [None]:
# Lift pandas' display truncation limits (columns, rows, cell width) so
# frames shown in this notebook are rendered in full.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# Read in Data

In [None]:
# Wall-clock start for the whole notebook run (reported near the end).
overall_start_time = time()

# NOTE(review): absolute, machine-specific path — point this at your local copy
# of the Amazon review dataset before running.
file_path = "C:/Users/tsaie/OneDrive/Desktop/000 Resumes & Projects/# Projects/DS3 Fake Amazon Reviews/Dataset/"

category = 'Electronics' # Wireless, Tools, Software, Personal_Care_Appliances, Major_Appliances, Mobile_Apps, Mobile_Electronics, PC, Electronics, Automotive, Apparel, Beauty, Office_Products, Outdoors
file_name = f'amazon_reviews_us_{category}_v1_00.tsv.gz'

# Read in the specified number of rows.
# `on_bad_lines='skip'` silently drops malformed rows; it replaces the
# `error_bad_lines=False, warn_bad_lines=False` pair, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
n_rows_to_read_in = 500_000
data = pd.read_csv(file_path + file_name, compression='gzip', header=0, sep='\t', quotechar='"', on_bad_lines='skip', nrows=n_rows_to_read_in)

# Read in ALL of the rows
# data = pd.read_csv(file_path + file_name, compression='gzip', header=0, sep='\t', quotechar='"', on_bad_lines='skip')

print(f"The 'data' file has {data.shape[0]} rows and {data.shape[1]} columns")
data.head(3)

In [None]:
# Only the label column and the review text are used by the rest of the notebook.
columns_of_interest = ['verified_purchase', 'review_body']
data_small = data[columns_of_interest]
data_small.head()

## Create Balanced Dataset
- Under-sample the larger class so that verified and unverified reviews have the same number of rows

In [None]:
def under_sampling(df, random_state=42):
    """Return a class-balanced frame of verified ('Y') and unverified ('N') reviews.

    The larger of the two classes is randomly down-sampled to the size of the
    smaller one; the smaller class is kept whole and in its original order.
    Rows whose 'verified_purchase' is neither 'Y' nor 'N' are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'verified_purchase' column.
    random_state : int or None, default 42
        Seed passed to ``DataFrame.sample`` so the balanced subset is
        reproducible across runs. Pass ``None`` for a different sample each run.

    Returns
    -------
    pandas.DataFrame
        Verified rows followed by unverified rows, original index labels kept.

    Side effects: prints class counts and shows a count plot.
    """
    print("Under-Sampling Data")

    is_verified = df['verified_purchase'] == 'Y'
    is_unverified = df['verified_purchase'] == 'N'
    n_verified = int(is_verified.sum())
    n_unverified = int(is_unverified.sum())

    # Count of Reviews
    print("Verified:", n_verified)
    print("Un-Verified:", n_unverified)

    # Balance on the minority class, whichever one it happens to be. Sampling
    # the verified class at the unverified size (as before) raises a ValueError
    # as soon as unverified reviews outnumber verified ones.
    sample_size = min(n_verified, n_unverified)

    authentic_reviews_df = df[is_verified]
    fake_reviews_df = df[is_unverified]

    if n_verified > sample_size:
        authentic_reviews_df = authentic_reviews_df.sample(sample_size, random_state=random_state)
    if n_unverified > sample_size:
        fake_reviews_df = fake_reviews_df.sample(sample_size, random_state=random_state)

    under_sampled_df = pd.concat([authentic_reviews_df, fake_reviews_df], axis=0)

    print("Under-Sampled Verified", int((under_sampled_df['verified_purchase'] == 'Y').sum()))
    print("Under-Sampled Un-Verified", int((under_sampled_df['verified_purchase'] == 'N').sum()))

    # Graph of Data Distribution (drawn on the axes created here rather than
    # on whatever pyplot considers "current").
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x='verified_purchase', data=under_sampled_df, ax=ax)
    ax.set_title("Count of Reviews")
    plt.show()
    print("Under-Sampling Complete")
    return under_sampled_df

In [None]:
# Build the class-balanced frame that the cleaning step below operates on.
data_equal_weight = under_sampling(data_small)
# data_equal_weight

# Data Cleaning

In [None]:
# Pre-processing Text Reviews
def data_cleaning(df):
    """Normalise review text and return a new frame with a 'review_body_cleaned' column.

    Steps, in order: drop rows containing NaN, strip URLs, replace HTML
    remnants, keep only word tokens, remove ASCII digits, lowercase, remove
    NLTK English stop words, and lemmatise with spaCy's 'en_core_web_sm'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'review_body'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without NaN rows, plus 'review_body_cleaned'.
        The input frame is left untouched, so the calling cell can be re-run.
    """
    # Removing empty cells. Work on a copy instead of dropna(inplace=True) so
    # the caller's frame is not mutated as a side effect.
    df = df.dropna().copy()
    df['review_body_cleaned'] = df['review_body']

    # Removing URLs (scheme://...), a leading "rt", and "http" + one character.
    url_pattern = re.compile(r"(\w+:\/\/\S+)|^rt|http.+?")
    df['review_body_cleaned'] = df['review_body_cleaned'].apply(
        lambda rev: url_pattern.sub("", rev))

    # Replace HTML keywords with blank space ("&quot;", "br", "&#34").
    # Order matters: the double <br /> must be handled before the single one.
    remove_dict = {"<br /><br />": " ", "<br />": " ", "br ": "", "&quot;": " ", "&#34": " ",
                   "<BR>": " ", "_": ""}
    for key, val in remove_dict.items():
        df['review_body_cleaned'] = df['review_body_cleaned'].str.replace(key, val, regex=False)

    print("\n######## Remove URL and HTML Keywords Complete ########")

    # Remove Punctuations and numbers
    tokenizer = RegexpTokenizer(r'\w+')
    df['review_body_cleaned'] = df['review_body_cleaned'].apply(
        lambda x: ' '.join(tokenizer.tokenize(x)))

    # One regex pass instead of twelve literal-replace passes. [0-9] (not \d)
    # keeps this limited to ASCII digits, matching the previous behaviour.
    df['review_body_cleaned'] = df['review_body_cleaned'].str.replace(r'[0-9()]', '', regex=True)

    print("\n######## Remove Punctuation and Numbers Complete ########")

    # Lowercase Words
    df['review_body_cleaned'] = df['review_body_cleaned'].str.lower()

    print("\n######## Lowercase Complete ########")

    # Remove Stop Words. A set gives O(1) membership tests; the NLTK list
    # would be scanned linearly for every single word.
    stop = set(stopwords.words('english'))

    df['review_body_cleaned'] = df['review_body_cleaned'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stop]))

    print("\n######## Remove Stop Words Complete ########")

    # Lemmatization using .lemma_. nlp.pipe streams the texts through the
    # pipeline in batches, which is much faster than calling nlp() per row.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    df['review_body_cleaned'] = [
        ' '.join([token.lemma_ for token in doc])
        for doc in nlp.pipe(df['review_body_cleaned'])
    ]

    print("\n######## Data Cleaning Complete ########")

    return df

In [None]:
# Clean the balanced dataset; the returned frame carries the normalised text
# in a 'review_body_cleaned' column next to the original 'review_body'.
data_cleaned = data_cleaning(data_equal_weight)
data_cleaned.head()

# Feature Engineering + Prepare Data for Machine Learning

## Add Bigrams

In [None]:
# https://stackoverflow.com/questions/48331315/how-to-extract-all-the-ngrams-from-a-text-dataframe-column-in-different-order-in

from collections import Counter
from nltk import ngrams
from itertools import chain

def find_ngrams(input_list, n):
    """Return every run of ``n`` consecutive items of ``input_list`` as a list of tuples.

    An input shorter than ``n`` yields an empty list.
    """
    # Offsetting the sequence by 0..n-1 and zipping the offsets lines each item
    # up with its n-1 successors; zip stops at the shortest (last) offset.
    offsets = [input_list[start:] for start in range(n)]
    return [gram for gram in zip(*offsets)]

def add_bigram_column(df):
    """Return a copy of ``df`` with a 'bigrams' column.

    Each entry is the list of consecutive word pairs (tuples) found in the
    whitespace-split 'review_body_cleaned' text of that row. The input frame
    is not modified.
    """
    def to_bigrams(text):
        return find_ngrams(text.split(), 2)

    with_bigrams = df.copy()
    with_bigrams['bigrams'] = with_bigrams['review_body_cleaned'].map(to_bigrams)
    return with_bigrams
    
# Rebind data_cleaned to a copy that carries the new 'bigrams' column.
data_cleaned = add_bigram_column(data_cleaned)
data_cleaned.head()

## Let's call the bigrams in verified reviews "gold_bigrams" and the bigrams in unverified reviews "fake_bigrams"

In [None]:
# Split the cleaned reviews by label: 'N' = unverified ("fake"), 'Y' = verified ("gold").
verified_flag = data_cleaned['verified_purchase']
data_fake = data_cleaned[verified_flag == 'N']
data_gold = data_cleaned[verified_flag == 'Y']

data_fake.tail(1)

In [None]:
def create_gold_bigrams(df_cleaned):
    """Collect the bigrams of all verified ('Y') reviews into module-level globals.

    Sets three globals and returns nothing:
      df_gold            -- rows of ``df_cleaned`` with verified_purchase == 'Y'
      gold_bigrams       -- flat list of every bigram in those rows, in row order
      gold_bigram_counts -- Counter mapping each bigram to its number of occurrences
    """
    global df_gold, gold_bigrams, gold_bigram_counts
    verified_mask = df_cleaned['verified_purchase'] == 'Y'
    df_gold = df_cleaned[verified_mask]
    # Each row holds a list of bigrams; flatten the list of lists into one list.
    per_review_bigrams = df_gold['bigrams'].tolist()
    gold_bigrams = list(chain.from_iterable(per_review_bigrams))
    gold_bigram_counts = Counter(gold_bigrams)

# Populate the gold_* globals, then preview the 20 most frequent verified-review bigrams.
create_gold_bigrams(data_cleaned)
gold_bigram_counts.most_common(20)

In [None]:
def create_fake_bigrams(df_cleaned):
    """Collect the bigrams of all unverified ('N') reviews into module-level globals.

    Sets three globals and returns nothing:
      df_fake            -- rows of ``df_cleaned`` with verified_purchase == 'N'
      fake_bigrams       -- flat list of every bigram in those rows, in row order
      fake_bigram_counts -- Counter mapping each bigram to its number of occurrences
    """
    global df_fake, fake_bigrams, fake_bigram_counts
    unverified_mask = df_cleaned['verified_purchase'] == 'N'
    df_fake = df_cleaned[unverified_mask]
    # Each row holds a list of bigrams; flatten the list of lists into one list.
    per_review_bigrams = df_fake['bigrams'].tolist()
    fake_bigrams = list(chain.from_iterable(per_review_bigrams))
    fake_bigram_counts = Counter(fake_bigrams)
    
# Populate the fake_* globals, then preview the 20 most frequent unverified-review bigrams.
create_fake_bigrams(data_cleaned)
fake_bigram_counts.most_common(20)

In [None]:
bigram_file_path = "C:/Users/tsaie/OneDrive/Desktop/000 Resumes & Projects/# Projects/DS3 Fake Amazon Reviews/Bigrams/"

# One CSV per class, named after the product category.
gold_bigram_file_name = f'{category} gold_bigrams.csv'
fake_bigram_file_name = f'{category} fake_bigrams.csv'

# most_common() with no argument lists every bigram, most frequent first.
bigram_csv_columns = ['bigram','count']
gold_bigram_counts_df = pd.DataFrame(gold_bigram_counts.most_common(), columns=bigram_csv_columns)
fake_bigram_counts_df = pd.DataFrame(fake_bigram_counts.most_common(), columns=bigram_csv_columns)

gold_bigram_counts_df.to_csv(bigram_file_path + gold_bigram_file_name)
fake_bigram_counts_df.to_csv(bigram_file_path + fake_bigram_file_name)

In [None]:
# def plot_counter_N_most_common(gold_or_fake, N):
#     counter = eval(f'{gold_or_fake}_bigram_counts')
#     d = {}
#     for tup in counter.most_common(N):
#         bigram = f"{tup[0][0]} {tup[0][1]}"
#         d[bigram] = tup[1]

#     df = pd.Series(data=d).sort_values(ascending=False).to_frame().reset_index().rename(columns={'index': 'bigram', 0: 'count'})
#     f, ax = plt.subplots(figsize=(8, N/4 + 1))
#     if gold_or_fake == 'gold':
#         palette = sns.color_palette('flare', n_colors=N)
#     else:
#         palette = sns.color_palette('crest', n_colors=N)
#     palette.reverse()
#     title = f'Most Popular Bigrams in {gold_or_fake.capitalize()} Reviews for the {category} Category'
#     sns.barplot(x='count', y='bigram', data=df, orient='h', palette=palette).set_title(title)
#     file_path = 'C:/Users/tsaie/OneDrive/Desktop/000 Resumes & Projects/# Projects/FARS/Ester Tsai (Bigram and ML)/Images/'
#     plt.savefig(file_path + f'{title} (transparent).png', transparent=True)
#     plt.savefig(file_path + f'{title}.png')
    
# plot_counter_N_most_common("fake", 15)

In [None]:
# from wordcloud import WordCloud

# def wordcloud_N_most_common(gold_or_fake, N):
#     counter = eval(f'{gold_or_fake}_bigram_counts')
#     d = {}
#     for tup in counter.most_common(N):
#         bigram = f"{tup[0][0]} {tup[0][1]}"
#         d[bigram] = tup[1]
        
# #     mask = np.array(Image.open("../input/input-img/cloud.png"))

#     word_cloud = WordCloud(width=3000, height=1500, background_color='white') #, mask=mask
#     word_cloud.generate_from_frequencies(frequencies=d)

#     plt.figure(figsize=(14,7))
#     plt.tight_layout(pad=0)
#     plt.imshow(word_cloud, interpolation='bilinear')
#     plt.axis("off")
#     title = f'WORDCLOUD - Most Popular Bigrams in {gold_or_fake.capitalize()} Reviews for the {category} Category'
#     plt.title(title, fontsize=15)

#     file_path = 'C:/Users/tsaie/OneDrive/Desktop/000 Resumes & Projects/# Projects/FARS/Ester Tsai (Bigram and ML)/Images/'
#     plt.savefig(file_path + f'{title} (transparent).png', bbox_inches='tight', transparent=True)
#     plt.savefig(file_path + f'{title}.png', bbox_inches='tight')
    
#     plt.show()
    
# wordcloud_N_most_common('fake', 140)

## Add Features

**count** = number of gold/fake_bigrams in a review

**percent** = number of gold/fake_bigrams as a percentage of total number of bigrams in a review.

**simple score** = sum of the gold/fake_bigrams' popularity scores (calculated using the bigram's count in the Counter)

**normalized score** = simple score / total bigram count

In [None]:
# data_cleaned[[('anywhere', 'find') in bigrams for bigrams in data_cleaned['bigrams']]]

In [None]:
def get_bigram_count_percent(bigrams, bigram_dict):
    """Return the fraction of ``bigrams`` that are keys of ``bigram_dict``.

    Repeated bigrams are counted each time they occur. Returns 0 for an
    empty ``bigrams`` list.
    """
    total = len(bigrams)
    if total == 0:
        return 0

    known = sum(1 for bigram in bigrams if bigram in bigram_dict)
    return known / total

def get_bigram_normalized_score(bigrams, bigram_dict):
    """Return the summed ``bigram_dict`` values of the known bigrams, divided by ``len(bigrams)``.

    Bigrams that are not keys of ``bigram_dict`` contribute nothing to the sum
    but still count in the denominator. Returns 0 for an empty ``bigrams`` list.
    """
    total = len(bigrams)
    if total == 0:
        return 0

    score = sum(bigram_dict[bigram] for bigram in bigrams if bigram in bigram_dict)
    return score / total

def get_bigram_unique_count_percent(bigrams, bigram_dict, the_other_bigram_dict, unique_threshold=0):
    """Return the fraction of ``bigrams`` that are NOT established in ``the_other_bigram_dict``.

    A bigram is treated as established in the other dictionary when it is a key
    there with a value greater than ``unique_threshold``; raising the threshold
    therefore also counts bigrams that the other dictionary has seen only a few
    times. Returns 0 for an empty ``bigrams`` list.

    NOTE(review): ``bigram_dict`` is accepted but never read, so a bigram that
    appears in neither dictionary is also counted as unique.
    """
    total = len(bigrams)
    if total == 0:
        return 0

    shared = sum(
        1
        for bigram in bigrams
        if bigram in the_other_bigram_dict and the_other_bigram_dict[bigram] > unique_threshold
    )
    return (total - shared) / total

# Add bigram_count: number of bigrams in each review.
data_cleaned['bigram_count'] = data_cleaned['bigrams'].apply(len)

# Add features based on gold or fake bigrams.
# *_bigram_dict holds every bigram seen in that class; *_bigram_dict_filtered
# keeps only the bigrams seen at least twice. The filtered dicts were
# previously built from `fake_bigram_dict_all` / `gold_bigram_dict_all`,
# names that are never defined, so this cell raised NameError on a fresh kernel.
fake_bigram_dict = dict(fake_bigram_counts)
fake_bigram_dict_filtered = {k: v for k, v in fake_bigram_dict.items() if v >= 2}

gold_bigram_dict = dict(gold_bigram_counts)
gold_bigram_dict_filtered = {k: v for k, v in gold_bigram_dict.items() if v >= 2}

bigram_dicts = {'gold': gold_bigram_dict, 'fake': fake_bigram_dict}
bigram_dicts_filtered = {'gold': gold_bigram_dict_filtered, 'fake': fake_bigram_dict_filtered}

# Plain dictionary lookups replace the exec()-built statements: same column
# names, but the code is now visible to linters and tracebacks.
for gold_or_fake, the_other in [('gold', 'fake'), ('fake', 'gold')]:
    own_dict = bigram_dicts[gold_or_fake]
    own_dict_filtered = bigram_dicts_filtered[gold_or_fake]
    other_dict = bigram_dicts[the_other]

    data_cleaned[f'{gold_or_fake}_bigram_percent'] = data_cleaned['bigrams'].apply(
        lambda x: get_bigram_count_percent(x, own_dict_filtered))

    # "Unique" is measured against the OTHER class's dictionary. The previous
    # call passed this class's own dictionary in both positions.
    data_cleaned[f'{gold_or_fake}_bigram_unique_percent'] = data_cleaned['bigrams'].apply(
        lambda x: get_bigram_unique_count_percent(x, own_dict, other_dict, 0))

    data_cleaned[f'{gold_or_fake}_bigram_normalized_score'] = data_cleaned['bigrams'].apply(
        lambda x: get_bigram_normalized_score(x, own_dict))

data_cleaned.tail(1)

In [None]:
# Spot-check the engineered columns on the first 15 verified reviews.
data_cleaned[data_cleaned['verified_purchase'] == 'Y'].head(15)


# Use Machine Learning to Make Predictions for Verified VS. Unverified
LABELS: 
- 1 = verified review
- 0 = unverified review

In [None]:
def semi_supervised_learning(df, model, algorithm, threshold=0.8, iterations=40):
    """Self-train ``model`` on the bigram features and report metrics on the held-out split.

    The data is split 75/25. In each round the model is fitted on the training
    part, and every held-out row whose highest class probability exceeds
    ``threshold`` is moved into the training part with its *predicted* label
    (pseudo-label). The loop ends when the held-out pool is empty, when a round
    adds no rows, or after ``iterations`` rounds. Metrics and a confusion matrix
    are then computed on the full original held-out split with its true labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'verified_purchase' ('Y'/'N') and the seven feature
        columns listed in ``feature_columns`` below. Not modified.
    model : estimator with fit / predict / predict_proba
    algorithm : str
        Display name used in the printed output and the plot title.
    threshold : float, default 0.8
        Minimum class probability for a held-out row to be pseudo-labelled.
    iterations : int, default 40
        Maximum number of fit rounds.

    Returns
    -------
    The fitted ``model`` (same object that was passed in).

    NOTE(review): rows of the evaluation split that were pseudo-labelled are
    part of the final training data, so the reported scores are not a clean
    out-of-sample estimate.
    """
    feature_columns = ['bigram_count', 'fake_bigram_percent', 'fake_bigram_normalized_score', 'fake_bigram_unique_percent',
                       'gold_bigram_percent', 'gold_bigram_normalized_score', 'gold_bigram_unique_percent']
    features = df[feature_columns]

    # LABELS: 1 = verified review, 0 = unverified review
    labels = df['verified_purchase'].apply(lambda x: 1 if x == 'Y' else 0)
    print("Training " + algorithm + " Model")

    train_data, test_data, train_label, test_label = train_test_split(features, labels, test_size=0.25, random_state=42)

    # Untouched copies for the final evaluation; test_data/test_label shrink below.
    test_data_copy = test_data.copy()
    test_label_copy = test_label.copy()

    all_labeled = False
    current_iteration = 0
    pbar = tqdm(total=iterations)

    while not all_labeled and (current_iteration < iterations):
        current_iteration += 1
        model.fit(train_data, train_label)

        probabilities = model.predict_proba(test_data)
        pseudo_labels = model.predict(test_data)

        # One boolean per held-out row. Using the row-wise maximum means a row
        # is selected at most once even when threshold <= 0.5.
        confident = probabilities.max(axis=1) > threshold

        if confident.any():
            # Move all confident rows in one concat instead of enlarging the
            # training frame row by row through .loc.
            confident_index = test_data.index[confident]
            train_data = pd.concat([train_data, test_data[confident]])
            train_label = pd.concat([
                train_label,
                pd.Series(pseudo_labels[confident], index=confident_index, name=train_label.name),
            ])
            test_data = test_data[~confident]
            test_label = test_label[~confident]

        print("--" * 20)
        pbar.update(1)

        if len(test_data) == 0:
            print("Exiting loop")
            all_labeled = True
        elif not confident.any():
            # Nothing was added, so another round would refit on exactly the
            # same training data; stop instead of repeating the work.
            break

    pbar.close()
    predicted_labels = model.predict(test_data_copy)

    print(algorithm + ' Model Results')
    print('--' * 20)
    print('Accuracy Score : ' + str(accuracy_score(test_label_copy, predicted_labels)))
    print('Precision Score : ' + str(precision_score(test_label_copy, predicted_labels, pos_label=1)))
    print('Recall Score : ' + str(recall_score(test_label_copy, predicted_labels, pos_label=1)))
    print('F1 Score : ' + str(f1_score(test_label_copy, predicted_labels, pos_label=1)))
#     print('Confusion Matrix : \n' + str(confusion_matrix(test_label_copy, predicted_labels)))
    # confusion_matrix orders rows/columns by sorted label, i.e. [0, 1]; the
    # tick labels must be given in that same order or the axes are mislabelled.
    plot_confusion_matrix(test_label_copy, predicted_labels, classes=[0, 1],
                          title=algorithm + ' Confusion Matrix').show()

    return model


def plot_confusion_matrix(y_true, y_pred, classes, title=None, cmap=plt.cm.Blues):
    """Draw an annotated confusion-matrix heat map and return the pyplot module.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.
    classes : list
        Label values, in the order they should appear on both axes. The matrix
        is computed in this same order, so tick labels and cells always agree.
    title : str, optional
        Plot title.
    cmap : matplotlib colormap, default ``plt.cm.Blues``

    Returns
    -------
    module
        ``matplotlib.pyplot``, so callers can chain ``.show()``.
    """
    # Compute confusion matrix with rows/columns ordered as `classes`. Without
    # `labels=`, sklearn sorts the labels itself and a caller passing e.g.
    # classes=[1, 0] would get swapped axis annotations.
    cm = confusion_matrix(y_true, y_pred, labels=classes)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes,
           yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations; use white text on
    # the darker (above half-maximum) cells so the counts stay readable.
    fmt = 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()

    return plt

## RandomForestClassifier

In [None]:
start_time = time()
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
rf = RandomForestClassifier(random_state=42, criterion='entropy', max_depth=14, max_features='sqrt', n_estimators=500)
rf_model = semi_supervised_learning(data_cleaned, model=rf, threshold=0.7, iterations=3, algorithm='Random Forest')
end_time = time()

print("Time taken : ", end_time - start_time)

In [None]:
# Unverified reviews whose gold score is higher than their fake score, i.e. the
# ones whose bigrams look more like verified reviews.
# NOTE: 'display.max_rows' is set to None earlier in this notebook, so every
# matching row is rendered.
data_cleaned[(data_cleaned['verified_purchase'] == 'N') & (data_cleaned['fake_bigram_normalized_score'] < data_cleaned['gold_bigram_normalized_score'])]

In [None]:
# Persist the trained model with joblib; `load` is imported here as well and is
# used by the prediction cell further down.
from joblib import dump, load
# NOTE(review): absolute, machine-specific path, and the date in the file name
# is a fixed string — it does not reflect when the model was actually trained.
ML_model_file_path = "C:/Users/tsaie/OneDrive/Desktop/000 Resumes & Projects/# Projects/DS3 Fake Amazon Reviews/ML Models/"
ML_model_file_name = f'05_05_2022 rf_model {(category)}.joblib'
dump(rf_model, ML_model_file_path + ML_model_file_name) 

In [None]:
# Total wall-clock time since overall_start_time was taken in the data-loading cell.
overall_end_time = time()
print(round(overall_end_time - overall_start_time), "seconds")

In [None]:
# rf_model = load(ML_model_file_path + ML_model_file_name) 

# NEXT STEP: Making Predictions for a User Input

In [None]:
# Reload the saved model and bigram tables for the chosen category.
# Requires `load` (from joblib) and `pd` to be imported by earlier cells.
category = 'Electronics'
# NOTE(review): this directory ("FARS/Ester Tsai (Bigram and ML)/ML Models/") is not
# the one the model was dumped to above ("DS3 Fake Amazon Reviews/ML Models/"), so a
# fresh run may load a different or missing file — confirm which location is intended.
ML_model_file_path = "C:/Users/tsaie/OneDrive/Desktop/000 Resumes & Projects/# Projects/FARS/Ester Tsai (Bigram and ML)/ML Models/"
ML_model_file_name = f'05_05_2022 rf_model {(category)}.joblib'
rf_model = load(ML_model_file_path + ML_model_file_name) 


# NOTE(review): likewise differs from the directory the bigram CSVs were written to above.
bigram_file_path = "C:/Users/tsaie/OneDrive/Desktop/000 Resumes & Projects/# Projects/FARS/Ester Tsai (Bigram and ML)/Bigrams/"

# index_col=0 restores the unnamed index column that to_csv wrote.
# presumably the 'bigram' column comes back as text such as "('a', 'b')" rather
# than tuples — verify before using it as dictionary keys.
gold_bigram_file_name = f'{(category)} gold_bigrams.csv'
gold_bigram_df = pd.read_csv(bigram_file_path + gold_bigram_file_name, index_col=0)
display(gold_bigram_df.head(3))

fake_bigram_file_name = f'{(category)} fake_bigrams.csv'
fake_bigram_df = pd.read_csv(bigram_file_path + fake_bigram_file_name, index_col=0)
display(fake_bigram_df.head(3))

## Try changing the review and see what the RandomForest model predicts!

In [None]:
# Free-text review to classify; edit this string to try other inputs.
review = "These are the best headphones I've bought in the past few years. Totally recommend!"

# NOTE(review): clean_review_and_add_features is not defined anywhere in the visible
# notebook; this line raises NameError unless it is defined or imported elsewhere.
user_input_processed_df = clean_review_and_add_features(review, data_cleaned)
display(user_input_processed_df)

def prepare_df_for_prediction(processed_df):
    """Return only the model's feature columns from ``processed_df``, in training order.

    The column list mirrors the one used to fit the model in
    ``semi_supervised_learning``; a fitted scikit-learn estimator expects the
    same features it was trained on. The previous list named eight columns
    (``*_bigram_count``, ``*_bigram_simple_score``, ...) that the training
    function does not use and the feature-engineering cell does not create.

    Parameters
    ----------
    processed_df : pandas.DataFrame
        Must contain the seven feature columns listed below; extra columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the seven feature columns. The input is not modified.

    Raises
    ------
    KeyError
        If any feature column is missing from ``processed_df``.
    """
    feature_columns = ['bigram_count', 'fake_bigram_percent', 'fake_bigram_normalized_score', 'fake_bigram_unique_percent',
                       'gold_bigram_percent', 'gold_bigram_normalized_score', 'gold_bigram_unique_percent']
    return processed_df[feature_columns].copy()

# Class prediction for the single review, plus that row's class probabilities.
df_for_prediction = prepare_df_for_prediction(user_input_processed_df)
prediction = rf_model.predict(df_for_prediction)
probabilities = rf_model.predict_proba(df_for_prediction)[0]

def interpret_prediction(review, pred, proba):
    """Print a one-sentence, human-readable verdict for a classified review.

    Parameters
    ----------
    review : str
        The review text, echoed in the message.
    pred : sequence
        Model prediction; ``pred[0]`` is 1 for verified, 0 for unverified.
    proba : sequence of float
        ``[P(unverified), P(verified)]`` for the review, each in 0..1.

    Prints nothing if ``pred[0]`` is neither 0 nor 1. Returns None.
    """
    # Scale to percent first and round afterwards; rounding before multiplying
    # by 100 leaves float artifacts such as 56.699999999999996.
    proba = [round(proba[0] * 100, 1), round(proba[1] * 100, 1)]
    # Use the `pred` argument — the previous body read the global `prediction`,
    # so the function ignored what the caller passed in.
    if pred[0] == 1:
        print(f'"{review}" is predicted to be a VERIFIED review, with {proba[1]}% probability of being VERIFIED and {proba[0]}% probability of being UNVERIFIED')
    elif pred[0] == 0:
        print(f'"{review}" is predicted to be an UNVERIFIED review, with {proba[0]}% probability of being UNVERIFIED and {proba[1]}% probability of being VERIFIED')
        
# Print the verdict for the review entered above.
interpret_prediction(review, prediction, probabilities)

# END