# Fine-tuning T5 with Simple Transformers (built on the Hugging Face Transformers library)

This notebook fine-tunes T5 on the OpSpam negative reviews, evaluates it on a held-out split of the negative reviews, and then tests it on the OpSpam positive reviews.

In [None]:
%%capture
!pip install simpletransformers
import pandas as pd
from simpletransformers.t5 import T5Model
from pprint import pprint
import logging
# Making sure the environment is set up correctly for anyone running this notebook
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf
import datetime as datetime
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import sklearn
from google.colab import drive
from tensorflow import keras
import re

In [None]:
# Mount Google Drive at /content/drive; the data and output paths used in this
# notebook all live under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount("/content/drive")

**Load Data**

In [None]:
# All raw OpSpam pickles live in a single Drive folder; keep that path in one place
# so it only has to be changed once if the project moves.
RAW_DATA_DIR = '/content/drive/My Drive/6862_FakeReviewDetection/data/raw'

# NOTE(security): pd.read_pickle unpickles arbitrary Python objects. Only load
# files from this folder if you trust whoever produced them.

# Full negative / positive review sets.
opspam_neg = pd.read_pickle(os.path.join(RAW_DATA_DIR, 'opspam_neg.pkl'))
opspam_pos = pd.read_pickle(os.path.join(RAW_DATA_DIR, 'opspam_pos.pkl'))

# Train / test splits of the positive reviews.
opspam_pos_train = pd.read_pickle(os.path.join(RAW_DATA_DIR, 'opspam_pos_train.pkl'))
opspam_pos_test = pd.read_pickle(os.path.join(RAW_DATA_DIR, 'opspam_pos_test.pkl'))

# Train / held-out splits of the negative reviews.
opspam_neg_train = pd.read_pickle(os.path.join(RAW_DATA_DIR, 'opspam_neg_train.pkl'))
opspam_neg_finetune = pd.read_pickle(os.path.join(RAW_DATA_DIR, 'opspam_neg_test.pkl'))
# Same DataFrame under the name that matches its file name (opspam_neg_test.pkl),
# so both spellings of the held-out negative split resolve.
opspam_neg_test = opspam_neg_finetune

**Train the model**

In [None]:
# Settings passed to the Simple Transformers T5Model.
# Reference: https://simpletransformers.ai/docs/t5-model/
# Take care to keep the options that make custom metrics usable.
# Training batch size and number of epochs follow conventional defaults.
# A small number of epochs acts as a regularizer: it balances fitting the training
# data well against finding parameter estimates that generalize well.

model_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    max_seq_length=320,
    # 32 is the conventional choice; small batches are noisy and have a regularizing effect.
    # Original author's note (unverified): the number of training observations must be
    # divisible by train_batch_size.
    train_batch_size=32,
    num_train_epochs=3,
    save_eval_checkpoints=True,
    save_model_every_epoch=True,
    # silent=True,
    evaluate_generated_text=True,
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
)



In [None]:
# Custom metrics function

def metrics_fn(l, p):
    """Compute binary-classification metrics for text labels '0' / '1'.

    The model generates free text, so a prediction is not guaranteed to be
    exactly '0' or '1'. Any prediction outside those two strings is scored as
    the class opposite to its gold label, i.e. it always counts as an error.
    (Without this, the integer conversion or scikit-learn's binary metrics
    raise and abort the evaluation.)

    Parameters
    ----------
    l : sequence
        Gold labels. Each must be '0' or '1' once converted to ``str`` and
        stripped of surrounding whitespace.
    p : sequence
        Predicted labels, same length as ``l``.

    Returns
    -------
    dict
        Keys 'accuracy', 'f1_score', 'precision', 'recall', 'auc' and
        'confusion matrix'. '1' is the positive class. 'auc' is NaN when the
        gold labels contain fewer than two classes, because ROC AUC is
        undefined in that case. The confusion matrix uses label order
        ['0', '1'].

    Raises
    ------
    ValueError
        If ``l`` and ``p`` differ in length, or a gold label is not '0'/'1'.
    """
    classes = ['0', '1']

    if len(l) != len(p):
        raise ValueError(
            "labels and predictions differ in length: %d vs %d" % (len(l), len(p))
        )

    gold = [str(label).strip() for label in l]
    bad_labels = sorted(set(gold) - set(classes))
    if bad_labels:
        raise ValueError("labels must be '0' or '1'; got %r" % (bad_labels,))

    # Normalize predictions; anything that is not a valid class is turned into
    # the wrong answer for that example so it is counted as a misclassification.
    pred = []
    for truth, guess in zip(gold, p):
        guess = str(guess).strip()
        if guess not in classes:
            guess = '0' if truth == '1' else '1'
        pred.append(guess)

    # roc_auc_score is fed integer arrays, as in the original implementation.
    gold_int = np.array(gold).astype(int)
    pred_int = np.array(pred).astype(int)

    eval_accuracy = sklearn.metrics.accuracy_score(gold, pred)
    f1_score = sklearn.metrics.f1_score(gold, pred, labels=classes, pos_label='1')
    precision = sklearn.metrics.precision_score(gold, pred, labels=classes, pos_label='1')
    recall = sklearn.metrics.recall_score(gold, pred, labels=classes, pos_label='1')
    cm = sklearn.metrics.confusion_matrix(gold, pred, labels=classes)

    # ROC AUC needs both classes among the gold labels; report NaN otherwise
    # instead of letting the whole evaluation fail.
    if len(set(gold)) == 2:
        auc = sklearn.metrics.roc_auc_score(gold_int, pred_int)
    else:
        auc = float('nan')

    return {'accuracy': eval_accuracy,
            'f1_score': f1_score,
            'precision': precision,
            'recall': recall,
            'auc': auc,
            'confusion matrix': cm}

In [None]:

# Instantiate the model on the GPU (use_cuda=True).
# Note: GPU runtimes can be hard to get on Colab.
model = T5Model("t5","t5-small", args=model_args, use_cuda = True)

# train_data: negative reviews used for fine-tuning.
# test_data:  held-out negative reviews (loaded from opspam_neg_test.pkl into
#             `opspam_neg_finetune`); used as eval_data during training.
# new_data:   the positive reviews, evaluated after training.
train_data = opspam_neg_train
test_data = opspam_neg_finetune
new_data = opspam_pos

# The custom metrics compare labels as strings ('0' / '1'), so make sure the
# target column holds strings in every frame.
train_data.target_text = train_data.target_text.astype(str)
test_data.target_text = test_data.target_text.astype(str)
new_data.target_text = new_data.target_text.astype(str)

# Train the model
# Common errors returned:
##                         mmap: cannot allocate memory. In this case, "Restart runtime", and "Run all".
model.train_model(train_data, output_dir = "/content/drive/My Drive/6862_FakeReviewDetection/bestmodel_neg", eval_data=test_data, metrics = metrics_fn)


**Evaluate**

In [None]:
# Load model
#model = T5Model("t5", "/content/drive/My Drive/6862_FakeReviewDetection/bestmodel/checkpoint-1209-epoch-3")

In [None]:
# Print out model metrics on training data
# NOTE: this cell is currently disabled. The code below is wrapped in a
# triple-quoted string, so it is not executed; the cell only evaluates to that
# string. Remove the surrounding quotes to evaluate on train_data and write
# m_training.csv.
'''
result_train = model.eval_model(train_data, metrics = metrics_fn)
out = pd.DataFrame([result_train['metrics']])
out.to_csv("/content/drive/My Drive/6862_FakeReviewDetection/bestmodel_neg/m_training.csv")
print(out)
'''

In [None]:
# Print out model metrics on testing data (test_data).
# The entry stored under 'metrics' is presumably the dict returned by metrics_fn
# (keyed by the keyword name passed to eval_model) — confirm against the
# Simple Transformers docs. It is wrapped in a list to build a one-row
# DataFrame, which is saved to Drive as m_testing.csv and printed.
result_test = model.eval_model(test_data, metrics = metrics_fn)
out=pd.DataFrame([result_test['metrics']])
out.to_csv("/content/drive/My Drive/6862_FakeReviewDetection/bestmodel_neg/m_testing.csv")
print(out)

In [None]:
# Print out model metrics on the new data (new_data), which is not passed to
# train_model. The one-row metrics table is saved to Drive as m_new.csv and
# printed. Note that `out` is reused here, so it now holds the new-data metrics.
result_new = model.eval_model(new_data, metrics = metrics_fn)
out=pd.DataFrame([result_new['metrics']])
out.to_csv("/content/drive/My Drive/6862_FakeReviewDetection/bestmodel_neg/m_new.csv")
print(out)