# Fine-tuning T5 with Simple Transformers (built on the Hugging Face Transformers library)

To do:
1. Update examples of interrogatory text

2. It seems like fake reviews like to "paint a picture" of the surroundings, as opposed to concentrating on the actual thing being reviewed. The types of details they publish are too specific, and are narrative as opposed to analytical.


The primary internet resources for this notebook:


*   Fine Tuning: https://simpletransformers.ai/docs/usage/
*   Batch Size: https://machinelearningmastery.com/how-to-control-the-speed-and-stability-of-training-neural-networks-with-gradient-descent-batch-size/


*   General: https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html
*   Use: https://paperswithcode.com/method/t5


*   PyPi Example: https://pypi.org/project/simpletransformers/0.51.0/







In [None]:
!pip install simpletransformers
import pandas as pd
from simpletransformers.t5 import T5Model
from pprint import pprint
import logging
# Making sure the environment is set up correctly for anyone running this notebook
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf
import datetime as datetime
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import sklearn
from google.colab import drive
from tensorflow import keras
import re

In [None]:
# Settings: raise pandas' display limits (row count, column count, total character width).
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

**Getting the Data**

In [None]:
# Mount Google Drive at /content/drive; every CSV and checkpoint path used below lives
# under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount("/content/drive")

In [None]:
def download_and_load_dataset(
    force_download=True,
    path="/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_processedUTF8.csv",
):
  """Read a review CSV into a DataFrame.

  Args:
      force_download: Unused; kept so existing calls keep working.
      path: CSV file to read. Defaults to the processed Yelp reviews file on
          the mounted Drive, so calling with no arguments behaves as before.

  Returns:
      pandas.DataFrame holding the file's contents, decoded as UTF-8.
  """
  return pd.read_csv(path, encoding='UTF-8')


**Processing the data**

Purpose:
1. The T5 trainer and evaluator take in pandas DataFrames with three columns: 

*   prefix: A string indicating the task to perform,
*   input_text: The input text sequence,
*   target_text: The target sequence.

We process our data to be in this form. The prefix value specifies the task we want the T5 model to perform. In our case, we use the prefix `binary classification`, since our objective is to classify a review as either real (0) or fake (1).

Output:
1.   Yelp reviews dataset for training, and generating metrics for the trained model
2.    Hotels OPSPAM reviews dataset for evaluating the generalizability of the trained model



In [None]:
#############################################################################
################################ Yelp dataset################################
#############################################################################
# Load the Yelp CSV and keep only the review text and its label column.
reviews = download_and_load_dataset()[['reviewText', 'fakeLabel']]
def refinereviewText(row):
    """Return the row's 'reviewText' value converted to lower case."""
    review_text = row['reviewText']
    return review_text.lower()

def refinefakeLabel(row):
    """Map the row's raw 'fakeLabel' to a binary target: -1 becomes 0, any other value becomes 1."""
    return 0 if row['fakeLabel'] == -1 else 1

# Drop rows with a missing review or label, then normalise both columns.
# .copy() makes the column assignments below operate on an independent frame.
reviews = reviews.dropna().copy()
reviews['reviewText'] = reviews.apply(refinereviewText, axis=1)
reviews['fakeLabel'] = reviews.apply(refinefakeLabel, axis=1)

# Full per-class subsets (label 0 vs. label 1 after the mapping above).
reals = reviews[reviews['fakeLabel'] == 0]
fakes = reviews[reviews['fakeLabel'] == 1]

# Balance the classes: draw the same number of rows from each, capped at 80466 per class.
# The cap is clamped to the smaller class so .sample() cannot raise when fewer rows exist.
# The count does not need to be divisible by the batch size (original author's note).
# Sampling is seeded so that Restart & Run All reproduces the same subset.
n_per_class = min(80466, len(reals), len(fakes))
df_zeros = reals.sample(n_per_class, random_state=910)
df_ones = fakes.sample(n_per_class, random_state=910)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_combined = pd.concat([df_zeros, df_ones])
df_combined = df_combined.sample(frac=1, random_state=910).reset_index(drop=True)

# Work on a 10% subset of the balanced data.
df_smaller = df_combined.sample(frac=0.1, random_state=910).reset_index(drop=True)

# Rename to the column names used by the rest of the notebook; labels become strings.
reviews = df_smaller.copy()
reviews = reviews.rename(columns={"reviewText": "review", "fakeLabel": "deceptive"})
reviews['deceptive'] = reviews['deceptive'].astype(str)

In [None]:
# Describe reviews. Printed explicitly: a bare expression that is not the last
# statement of a cell produces no output, so the summary was previously discarded.
print(reviews.describe())

# Setting for pd
#pd.set_option('display.max_colwidth', 10)

# Shape as (rows, columns)
print(reviews.shape)

In [None]:
# Shuffle and split the data.
# Five stratified 80/20 shuffles are generated with a fixed seed; the one at index
# `cross_num` is used.
cross_num = 4
splitter = StratifiedShuffleSplit(n_splits=5, random_state=910, test_size=0.2)
labels = [str(x) for x in reviews['deceptive']] #must change to string for T5, since it is a text-to-text model
train_indices, test_indices = list(splitter.split(reviews['review'], labels))[cross_num]

# The splitter yields positional indices, so select rows with .iloc. Plain
# reviews['review'][x] is a label lookup and is only correct while the index is 0..n-1.
training_X = np.array(reviews['review'].iloc[train_indices].tolist())
training_y = np.array([labels[x] for x in train_indices])
test_X = np.array(reviews['review'].iloc[test_indices].tolist())
test_y = np.array([labels[x] for x in test_indices])

# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = ['0','1']

In [None]:
# Build the train/test frames in the layout the T5 model expects:
# input_text (review), target_text (label string) and a constant task prefix.
yelp_train = pd.DataFrame(
    zip(training_X, training_y), columns=["input_text", "target_text"]
).assign(prefix="binary classification")
yelp_test = pd.DataFrame(
    zip(test_X, test_y), columns=["input_text", "target_text"]
).assign(prefix="binary classification")

yelp_test.describe()
yelp_train.describe()


In [None]:
# Save the training frame to Drive (index column included), then preview the first rows.
yelp_train.to_csv(
    path_or_buf="/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_processedUTF8_train.csv",
)
yelp_train.head(n=10)

In [None]:
#############################################################################
############################### Hotels dataset###############################
#############################################################################
import pandas as pd
# NOTE: this redefinition replaces the earlier function of the same name for the rest
# of the session; after this cell runs, the name loads the hotel corpus.
def download_and_load_dataset(force_download=True):
  """Read the hotel opinion-spam corpus CSV from the mounted Drive (force_download is unused)."""
  return pd.read_csv("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/opinion_spam_corpusUTF8.csv", encoding = 'UTF-8')
hotel_test = download_and_load_dataset()
# Lower-case the review text in place, as is done for the Yelp reviews. This was previously
# written to a misspelled 'reviewTest' column that the next line dropped, so the text that
# reached the model was never lower-cased.
hotel_test['reviewText'] = hotel_test.apply(refinereviewText, axis=1)
# NOTE(review): labels are passed through unchanged (no refinefakeLabel mapping) —
# confirm this corpus already encodes real/fake as 0/1.
hotel_test = hotel_test[['reviewText', 'fakeLabel']].astype(str)
# Rename to the column layout the T5 model expects and add the task prefix.
hotel_test = hotel_test.rename(columns={"reviewText": "input_text", "fakeLabel": "target_text"})
hotel_test["prefix"] = "binary classification"

# Shuffle the rows.
hotel_test = hotel_test.sample(frac=1)
print(hotel_test.target_text.head())

print(hotel_test.describe())

In [None]:
# Independent copy of the hotel evaluation frame.
hotel_prime = hotel_test.copy()
# NOTE(review): this self-assignment changes nothing — presumably a placeholder for a
# text transformation that was never filled in; confirm or remove.
hotel_prime['input_text'] = hotel_prime['input_text'] 

**Train the model**

In [None]:
# Model arguments (reference: https://simpletransformers.ai/docs/t5-model/).
# Take care to set the arguments that allow custom metrics to be used.
# Training batch size and epoch count follow common defaults. A small number of epochs
# acts as a regulariser: it trades off fitting the training data against finding
# parameter estimates that generalise well.
model_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    # NOTE(review): 10 looks very short for review text plus the task prefix —
    # confirm this is the intended input length limit.
    max_seq_length=10,
    # 32 is the conventional choice; small batches are noisy and have a regularising effect.
    # Author's note: the number of training observations must be divisible by this value
    # (NOTE(review): confirm whether that is actually required).
    train_batch_size=32,
    num_train_epochs=3,
    save_eval_checkpoints=True,
    save_model_every_epoch=True,
    # silent=True,
    evaluate_generated_text=True,
    # [open] Try False for more epochs? Author's note: at epoch 4 an error is raised —
    # "dictionary given where numpy or tensor object expected".
    evaluate_during_training=True,
    # Try False for more epochs?
    evaluate_during_training_verbose=True,
    # num_workers=1 was tried and does not fix the memory problem.
)


In [None]:
# Custom metrics function

def metrics_fn(l, p):
    """Compute binary-classification metrics with '1' as the positive class.

    Args:
        l: Sequence of labels — presumably the true '0'/'1' strings; passed to
            scikit-learn as y_true.
        p: Sequence of predictions in the same encoding; passed as y_pred.

    Returns:
        dict with keys 'accuracy', 'f1_score', 'precision', 'recall', 'auc' and
        'confusion matrix' (the last is the array returned by confusion_matrix).

    Raises:
        ValueError: if any label or prediction cannot be converted to int.
    """
    skm = sklearn.metrics
    classes = ['0', '1']

    # Integer copies of both sequences, used only for the AUC computation.
    labels_as_int = np.array(l).astype(int)
    preds_as_int = np.array(p).astype(int)

    accuracy_value = skm.accuracy_score(l, p)
    f1_value = skm.f1_score(l, p, labels=classes, pos_label='1')
    auc_value = skm.roc_auc_score(labels_as_int, preds_as_int)
    precision_value = skm.precision_score(l, p, pos_label='1')
    recall_value = skm.recall_score(l, p, labels=classes, pos_label='1')
    confusion = skm.confusion_matrix(l, p, labels=classes)

    return {
        'accuracy': accuracy_value,
        'f1_score': f1_value,
        'precision': precision_value,
        'recall': recall_value,
        'auc': auc_value,
        'confusion matrix': confusion,
    }

In [None]:
# Training is disabled: the code below is wrapped in a string literal, so running this cell
# trains nothing. Remove the triple quotes to instantiate t5-small and fine-tune it.
'''
# Instantiate the model #Hard to get GPU on colab
model = T5Model("t5","t5-small", args=model_args, use_cuda = True)
# Train the model
# Common errors returned:
##                         mmap: cannnot allocate memory. In this case, "Restart runtime", and "Run all".
model.train_model(yelp_train, output_dir = "/content/drive/My Drive/6862_FakeReviewDetection/bestmodel", eval_data=yelp_test, metrics = metrics_fn)
'''

**Evaluate on Yelp testing dataset**

In [None]:
# Load the fine-tuned T5 model from the epoch-3 checkpoint stored in the Drive "bestmodel" directory.
# NOTE(review): model_args is not passed here — presumably the arguments saved with the
# checkpoint are used; verify.
model = T5Model("t5", "/content/drive/My Drive/6862_FakeReviewDetection/bestmodel/checkpoint-1209-epoch-3")

In [None]:
# Evaluate the loaded model on the Yelp training frame; metrics_fn is passed under the
# keyword name "metrics". Nothing is printed in this cell.
result_yelp_train = model.eval_model(yelp_train, metrics = metrics_fn)

In [None]:
# Show the training-set metrics as a one-row table (the entry stored under the 'metrics' key).
print(pd.DataFrame([result_yelp_train['metrics']]))

In [None]:
# Evaluate the loaded model on the held-out Yelp frame and print the metrics as a one-row table.
# NOTE(review): the checkpoint was trained in an earlier session; unless this run reproduces
# exactly the same sample and split, yelp_test may contain rows the model saw in training.
result_yelp_test = model.eval_model(yelp_test, metrics = metrics_fn)
print(pd.DataFrame([result_yelp_test['metrics']]))

**Evaluate generalizability to the hotels dataset**


In [None]:
# Evaluate the already-loaded model on the hotels frame with the same custom metrics function.
# (No model is loaded in this cell; `model` comes from the load cell above.)
result_hotels= model.eval_model(hotel_test, metrics = metrics_fn)
# Display the metrics as a one-row table (cell output only; nothing is written to disk).
pd.DataFrame([result_hotels['metrics']])

In [None]:
# Numbers about exaggerations. 
# Change existing reviews to change the probability of making it fake. 

**Understanding the model**

In [None]:
# Load the evaluation examples; every column is cast to str so the values can be
# concatenated and written directly.
df = pd.read_csv("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/eval_dfUTF8.csv", encoding = 'UTF-8').astype(str)

# Prepend the task prefix to each text and run the model.
preds = model.predict(["binary classification: " + description for description in df["input_text"]])

# Write one tab-separated line per example: text, type, version, verdict.
# The encoding is set explicitly so the output matches the UTF-8 input rather than
# depending on the platform default.
with open("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/generated_classification.csv", "w", encoding="utf-8") as f:
    # Iterate the columns positionally instead of label-indexing df['type'][i] per row.
    for i, (desc, review_type, version) in enumerate(zip(df["input_text"], df["type"], df["version"])):
        # Any prediction other than the exact string "1" is reported as real.
        verdict = "Fake \n" if preds[i] == "1" else "Real \n"
        f.write("\t".join([str(desc), review_type, version, verdict]))
