# Fine-tuning T5 with Simple Transformers (built on the Hugging Face Transformers library)

The primary internet resources for this notebook:


*   Fine Tuning: https://simpletransformers.ai/docs/usage/
*   Batch Size: https://machinelearningmastery.com/how-to-control-the-speed-and-stability-of-training-neural-networks-with-gradient-descent-batch-size/


*   General: https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html
*   Use: https://paperswithcode.com/method/t5


*   PyPi Example: https://pypi.org/project/simpletransformers/0.51.0/







In [None]:
%%capture
import pandas as pd
from pprint import pprint
import logging
# Making sure the environment is set up correctly for anyone running this notebook
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf
import datetime as datetime
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import sklearn
from google.colab import drive
from tensorflow import keras
import re

In [None]:
# Display settings: raise the pandas limits for rows, columns and line width.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

**Getting the Data**

In [None]:
# Mount Google Drive in the Colab runtime; every dataset path used in this
# notebook lives under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
def download_and_load_dataset(force_download=True):
  """Read the processed Yelp review CSV from the mounted Drive.

  `force_download` is accepted but never used: the file is always read from
  the fixed Drive path below.
  """
  yelp_csv = "/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_processedUTF8.csv"
  return pd.read_csv(yelp_csv, encoding='UTF-8')


**Processing the data**

Purpose:
1. The T5 trainer and evaluator take pandas DataFrames with three columns:

*   prefix: A string indicating the task to perform,
*   input_text: The input text sequence,
*   target_text: The target sequence.

We process our data into this form. The prefix value specifies the task we want the T5 model to perform. In our case, we use the prefix "binary classification", since our objective is to classify a review as either real (0) or fake (1).

Output:
1.   Yelp reviews dataset for training, and generating metrics for the trained model
2.    Hotels OPSPAM reviews dataset for evaluating the generalizability of the trained model



In [None]:
# ---------------------------------------------------------------------------
# Yelp dataset
# ---------------------------------------------------------------------------
# Load the raw CSV and keep only the text, label and date columns.
reviews = download_and_load_dataset()[['reviewText', 'fakeLabel', 'date']]

In [None]:

def refinereviewText(row):
    """Return the row's 'reviewText' value converted to lower case."""
    text = row['reviewText']
    return text.lower()

def refinefakeLabel(row):
    """Map the row's 'fakeLabel' to a binary label: -1 becomes 0, anything else 1."""
    return 0 if row['fakeLabel'] == -1 else 1

def getyear(row):
    """Return the last two characters of the row's 'date' (as text) as an int."""
    two_digit_suffix = str(row['date'])[-2:]
    return int(two_digit_suffix)

# Drop rows with missing values, then normalise the text, the label and the
# two-digit year column.
reviews = reviews.dropna()
reviews['reviewText'] = reviews.apply(refinereviewText, axis=1)
reviews['fakeLabel'] = reviews.apply(refinefakeLabel, axis=1)
reviews['year'] = reviews.apply(getyear, axis=1)


# Balance the classes by drawing 80466 rows from each label.
# NOTE: sample(n) raises ValueError when a class has fewer than n rows, and the
# draws are unseeded, so the selected subset differs between runs.
df_zeros = reviews[reviews['fakeLabel'] == 0].sample(80466) #make divisible by 32? not necessary. model takes into account imperfect divisibility
df_ones = reviews[reviews['fakeLabel'] == 1].sample(80466)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack the two frames.
df_combined = pd.concat([df_zeros, df_ones])
# Shuffle the balanced set, then keep a 10% subsample with a fresh 0..n-1 index
# (later cells index these rows by position).
df_combined = df_combined.sample(frac=1).reset_index(drop=True)
df_smaller = df_combined.sample(frac=0.1).reset_index(drop=True)

reviews = df_smaller.copy()
reviews = reviews.rename(columns={"reviewText": "review", "fakeLabel": "deceptive"})
# T5 is text-to-text, so the label is stored as a string.
reviews.deceptive = reviews.deceptive.astype(str)

reviews.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/reviews_pickle.pkl')

**Base case for comparing with hotels**

In [None]:
# Full Yelp sample in T5 format (input_text, target_text, prefix); used as the
# base case when comparing against the hotel data.
yelp = reviews[["review", "deceptive"]].rename(
    columns={"review": "input_text", "deceptive": "target_text"}
)
yelp["prefix"] = "binary classification"
yelp["target_text"] = yelp["target_text"].astype(str)
yelp.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp.pkl')

In [None]:
# Stratified 80/20 shuffle split of the Yelp reviews; of the five seeded
# splits, the one at index `cross_num` is used.
cross_num = 4
splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=910)

# T5 is a text-to-text model, so the labels must be strings.
labels = list(map(str, reviews['deceptive']))
train_indices, test_indices = list(splitter.split(reviews['review'], labels))[cross_num]

training_X = np.array([reviews['review'][i] for i in train_indices])
training_y = np.array([labels[i] for i in train_indices])
test_X = np.array([reviews['review'][i] for i in test_indices])
test_y = np.array([labels[i] for i in test_indices])

# The set of possible labels (as strings).
label_list = ['0','1']

In [None]:
# Build the T5 input frames (input_text, target_text, prefix) for the Yelp
# train and test splits, then pickle both.
yelp_train = pd.DataFrame(
    zip(training_X, training_y), columns=["input_text", "target_text"]
).assign(prefix="binary classification")
yelp_test = pd.DataFrame(
    zip(test_X, test_y), columns=["input_text", "target_text"]
).assign(prefix="binary classification")

yelp_train.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_train.pkl')
yelp_test.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_test.pkl')


**Hotels dataset**

In [None]:
#############################################################################
############################### Hotels dataset###############################
#############################################################################
import pandas as pd
# NOTE: this redefinition shadows the Yelp loader of the same name; re-running
# the Yelp cell after this one will load the hotel CSV instead.
def download_and_load_dataset(force_download=True):
  """Read the hotel opinion-spam CSV from Drive (`force_download` is unused)."""
  return pd.read_csv("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/opinion_spam_corpusUTF8.csv", encoding = 'UTF-8')
hotel_test = download_and_load_dataset()
# Lower-case the review text in place. Writing to a differently named column
# would be discarded by the column selection below, leaving the text unchanged.
hotel_test['reviewText'] = hotel_test.apply(refinereviewText, axis=1)
# NOTE(review): labels are cast to str as-is; unlike the Yelp cell, no -1 -> 0
# remapping is applied here - confirm the CSV already uses 0/1.
hotel_test = hotel_test[['reviewText', 'fakeLabel']].astype(str)
hotel_test = hotel_test.rename(columns={"reviewText": "input_text", "fakeLabel": "target_text"})
hotel_test["prefix"] = "binary classification"

hotel_test.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/hotel_test.pkl')


In [None]:
# Stratified 80/20 split of the hotel reviews. The resulting frames are named
# hotel_trainii / hotel_testii to distinguish them from the full hotel_test.
cross_num = 4

splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=910)
# T5 is a text-to-text model, so the labels must be strings.
labels = list(map(str, hotel_test['target_text']))
train_indices, test_indices = list(splitter.split(hotel_test['input_text'], labels))[cross_num]

training_X = np.array([hotel_test['input_text'][i] for i in train_indices])
training_y = np.array([labels[i] for i in train_indices])
test_X = np.array([hotel_test['input_text'][i] for i in test_indices])
test_y = np.array([labels[i] for i in test_indices])

# The set of possible labels (as strings).
label_list = ['0','1']

In [None]:
# Build the T5 input frames (input_text, target_text, prefix) for the hotel
# train and test splits.
hotel_trainii = pd.DataFrame(
    zip(training_X, training_y), columns=["input_text", "target_text"]
).assign(prefix="binary classification")
hotel_testii = pd.DataFrame(
    zip(test_X, test_y), columns=["input_text", "target_text"]
).assign(prefix="binary classification")


In [None]:
import pandas as pd
# Make sure the label column of BOTH hotel splits is a plain string before
# pickling. One cast per frame is enough; repeating it has no further effect.
hotel_testii.target_text = hotel_testii.target_text.astype(str)
hotel_trainii.target_text = hotel_trainii.target_text.astype(str)

hotel_testii.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/hotel_testii.pkl')
hotel_trainii.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/hotel_trainii.pkl')

# Preview the training split.
hotel_trainii.head()

**Time shift**

In [None]:
# Time shift: reviews whose two-digit year is below 14 are "old", the rest "new".
reviews_old = reviews.loc[reviews['year'] < 14]
reviews_new = reviews.loc[reviews['year'] >= 14]


In [None]:
# Keep only text and label, shuffle the rows, and renumber the index from 0.
reviews_old = reviews_old[['review', 'deceptive']].sample(frac=1).reset_index(drop=True)
reviews_new = reviews_new[['review', 'deceptive']].sample(frac=1).reset_index(drop=True)

In [None]:

# Share of genuine reviews (label '0') in the old and new samples.
# The label is compared explicitly: value_counts() orders by frequency, so
# taking its first entry would report the majority class, not the genuine one.
print("% genuine in old and new sample, respectively:" , (reviews_old['deceptive'].astype(str) == '0').mean(), (reviews_new['deceptive'].astype(str) == '0').mean())

In [None]:
# Row counts of the old and new review sets.
print("Number of old reviews:", reviews_old.shape[0])
print("Number of new reviews:", reviews_new.shape[0])

In [None]:
# Stratified 80/20 split of the OLD reviews; the NEW reviews are kept whole as
# a separate evaluation set.
cross_num = 4
splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=910)
# T5 is a text-to-text model, so the labels must be strings.
labels = list(map(str, reviews_old['deceptive']))
train_indices, test_indices = list(splitter.split(reviews_old['review'], labels))[cross_num]

training_X = np.array([reviews_old['review'][i] for i in train_indices])
training_y = np.array([labels[i] for i in train_indices])
test_X = np.array([reviews_old['review'][i] for i in test_indices])
test_y = np.array([labels[i] for i in test_indices])

new_X = reviews_new['review']
new_y = reviews_new['deceptive']

# The set of possible labels (as strings).
label_list = ['0','1']

In [None]:
# Build the T5 input frames (input_text, target_text, prefix) for the old-review
# train/test splits and for the complete set of new reviews.
yelp_old_train = pd.DataFrame(
    zip(training_X, training_y), columns=["input_text", "target_text"]
).assign(prefix="binary classification")
yelp_old_test = pd.DataFrame(
    zip(test_X, test_y), columns=["input_text", "target_text"]
).assign(prefix="binary classification")
yelp_new = pd.DataFrame(
    zip(new_X, new_y), columns=["input_text", "target_text"]
).assign(prefix="binary classification")


In [None]:
# Persist the three time-shift frames to Drive.
yelp_new.to_pickle("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_new.pkl")
yelp_old_test.to_pickle("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_old_test.pkl")
yelp_old_train.to_pickle("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_old_train.pkl")

**Top three restaurants**

In [None]:
# Load the top-three-restaurants review CSV from Drive (the file name suggests
# it is already class-balanced - not verified here).
reviews_top3 = pd.read_csv("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelptop3_balancedUTF8.csv", encoding = 'UTF-8')

In [None]:
# Preview the first rows of the raw frame.
reviews_top3.head()

In [None]:
# Add the T5 task prefix (spelled exactly as in every other dataset in this
# notebook) and rename the columns to the T5 input schema.
reviews_top3['prefix'] = "binary classification"
reviews_top3 = reviews_top3.rename(columns = {"reviewText":"input_text", "fakeLabel_str": "target_text", "restaurantID": "rid"})

In [None]:
# Preview the first rows after adding the prefix and renaming the columns.
reviews_top3.head()

In [None]:
# Restaurant with rid == 1 versus all other restaurants, keeping only the T5 columns.
yelp_top = reviews_top3.loc[reviews_top3['rid'] == 1, ['input_text', "target_text", "prefix"]]
yelp_23 = reviews_top3.loc[reviews_top3['rid'] != 1, ['input_text', "target_text", "prefix"]]


In [None]:
# Stratified 80/20 train/test split of the reviews for the restaurant with
# rid == 1 (yelp_top); the remaining restaurants (yelp_23) are kept whole.
cross_num = 4

splitter = StratifiedShuffleSplit(n_splits=5, random_state=910, test_size=0.2)
labels = [str(x) for x in yelp_top['target_text']] #must change to string for T5, since it is a text-to-text model
train_indices, test_indices = [x for x in splitter.split(yelp_top['input_text'], labels)][cross_num]

# split() yields POSITIONAL indices, while yelp_top still carries the row labels
# of reviews_top3 (it was filtered without resetting the index). Use .iloc so
# the positions are not misread as labels.
training_X = np.array([yelp_top['input_text'].iloc[x] for x in train_indices])
training_y = np.array([labels[x] for x in train_indices])
test_X = np.array([yelp_top['input_text'].iloc[x] for x in test_indices])
test_y = np.array([labels[x] for x in test_indices])

# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = ['0','1']

#Creating training and testing dataset. Format to input into the T5 model which requires a dataframe with three columns: input_text, target_text, and prefix.
yelp_top_train = pd.DataFrame(zip(training_X, training_y), columns=["input_text", "target_text"])
yelp_top_test = pd.DataFrame(zip(test_X, test_y), columns=["input_text", "target_text"])

yelp_top_train["prefix"] = "binary classification"
yelp_top_test["prefix"] = "binary classification"


yelp_top_train.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_top_train.pkl')
yelp_top_test.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_top_test.pkl')
yelp_23.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_23.pkl')


In [None]:
# Persist the top-restaurant splits and the remaining-restaurants frame to Drive.
yelp_23.to_pickle("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_23.pkl")
yelp_top_test.to_pickle("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_top_test.pkl")
yelp_top_train.to_pickle("/content/drive/My Drive/6862_FakeReviewDetection/data/raw/yelp_top_train.pkl")



**Opspam**

In [None]:
# Shuffle and split the data
# shuffle_split() below reads these two module-level settings.
cross_num = 4
splitter = StratifiedShuffleSplit(n_splits=5, random_state=910, test_size=0.2)

def shuffle_split(df):
  """Split `df` into stratified train and test frames in T5 input format.

  Uses the module-level `splitter` and takes the split at index `cross_num`,
  stratifying on `df['target_text']`.

  Args:
    df: DataFrame with at least 'input_text' and 'target_text' columns.

  Returns:
    (train, test): two DataFrames with columns input_text, target_text and
    prefix (always "binary classification"), each indexed from 0.
  """
  train_indices, test_indices = list(splitter.split(df['input_text'], df['target_text']))[cross_num]

  # split() yields positional indices, so select rows with .iloc; label-based
  # lookup would only be correct for a frame with a fresh 0..n-1 index.
  columns = ["input_text", "target_text"]
  train = df.iloc[train_indices][columns].reset_index(drop=True)
  test = df.iloc[test_indices][columns].reset_index(drop=True)

  train = train.assign(prefix="binary classification")
  test = test.assign(prefix="binary classification")
  return (train, test)

# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = ['0','1']

In [None]:
# Load the OpSpam reviews (with sentiment polarity) and convert them to the T5
# input schema.
opspam = pd.read_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/OpSpamSentimentReviews.pkl')
opspam.head()

opspam = opspam.rename(columns={"fakeLabel": "target_text", "reviewText": "input_text"})
opspam["target_text"] = opspam["target_text"].astype(str)
opspam["prefix"] = "binary classification"

# Split by polarity: "positive" versus everything else; each part is re-indexed from 0.
opspam_pos = opspam.loc[opspam['polarity'] == "positive", ["target_text", "input_text", "prefix"]].reset_index(drop=True)
opspam_neg = opspam.loc[opspam['polarity'] != "positive", ["target_text", "input_text", "prefix"]].reset_index(drop=True)

print(opspam_neg)

In [None]:
# Create train and test frames for each polarity with the user-defined
# shuffle_split(), which also sets the "prefix" column on both outputs.
opspam_pos_train, opspam_pos_test = shuffle_split(opspam_pos)
opspam_neg_train, opspam_neg_test = shuffle_split(opspam_neg)
print(opspam_pos_train, opspam_neg_train)

In [None]:

# Persist the full per-polarity frames ...
opspam_pos.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/opspam_pos.pkl')
opspam_neg.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/opspam_neg.pkl')

# ... and their train/test splits.
opspam_pos_train.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/opspam_pos_train.pkl')
opspam_neg_train.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/opspam_neg_train.pkl')

opspam_pos_test.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/opspam_pos_test.pkl')
opspam_neg_test.to_pickle('/content/drive/My Drive/6862_FakeReviewDetection/data/raw/opspam_neg_test.pkl')