In [1]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.11-py3-none-any.whl (250 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.6.0 (from simpletransformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from simpletransformers)
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:0

In [2]:
import os

import pandas as pd
from simpletransformers.t5 import T5Model

In [3]:
## Input data paths
# Each group below has one root folder plus one CSV per split (train / val / test).

## base data
base_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/"
train_base_path, val_base_path, test_base_path = [
    base_path + file_name
    for file_name in ("df_train.csv", "df_val.csv", "df_test.csv")
]


## explanations
exp_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5-for-explanation/"
train_explanations_path, val_explanations_path, test_explanations_path = [
    exp_path + file_name
    for file_name in ("df_train_pred_exp.csv", "df_val_pred_exp.csv", "df_test_pred_exp.csv")
]

## keywords
kws_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5-for-keywords/"
train_kw_path, val_kw_path, test_kw_path = [
    kws_path + file_name
    for file_name in ("df_train_pred_kw.csv", "df_val_pred_kw.csv", "df_test_pred_kw.csv")
]


## Read base data
def _read_label_rows(csv_path):
    """Load one split CSV and return only its 'label' rows.

    All values are cast to str, the frame is reduced to the columns
    prefix / input_text / target_text, rows whose prefix is not 'label' are
    dropped, and 'input_text' is renamed to 'sentence'.
    """
    # NOTE(review): astype(str) turns missing values into the literal string
    # 'nan' before the later merge on 'sentence' -- confirm that is intended.
    as_text = pd.read_csv(csv_path).astype(str)[['prefix','input_text','target_text']]
    label_rows = as_text[as_text['prefix']=='label'].copy()
    return label_rows.rename(columns={'input_text':'sentence'})


df_train_base = _read_label_rows(train_base_path)
df_val_base = _read_label_rows(val_base_path)
df_test_base = _read_label_rows(test_base_path)


## Read explanations data
# Each split is loaded as-is, then 'input_text' -> 'sentence' and
# 'predicted' -> 'predicted_exp' so the frames can be merged on 'sentence'.
exp_column_names = {'input_text':'sentence', 'predicted':'predicted_exp'}
df_train_exp, df_val_exp, df_test_exp = [
    pd.read_csv(split_path).rename(columns=exp_column_names)
    for split_path in (train_explanations_path, val_explanations_path, test_explanations_path)
]


## Read keywords data
# Each split is loaded as-is, then 'input_text' -> 'sentence' and
# 'predicted' -> 'predicted_kw' so the frames can be merged on 'sentence'.
kw_column_names = {'input_text':'sentence', 'predicted':'predicted_kw'}
df_train_kw, df_val_kw, df_test_kw = [
    pd.read_csv(split_path).rename(columns=kw_column_names)
    for split_path in (train_kw_path, val_kw_path, test_kw_path)
]


## Check shapes
# One heading per data source, followed by the (rows, columns) of each split.
shape_report = (
    ("Base data", (df_train_base, df_val_base, df_test_base)),
    ("Explanations: ", (df_train_exp, df_val_exp, df_test_exp)),
    ("Keywords: ", (df_train_kw, df_val_kw, df_test_kw)),
)

for heading, split_frames in shape_report:
    print(heading)
    for split_label, split_frame in zip(("Train: ", "Val: ", "Test: "), split_frames):
        print(split_label, split_frame.shape)


Base data
Train:  (14072, 3)
Val:  (1787, 3)
Test:  (1761, 3)
Explanations: 
Train:  (14057, 5)
Val:  (1786, 5)
Test:  (1759, 5)
Keywords: 
Train:  (14072, 5)
Val:  (1787, 5)
Test:  (1761, 5)


In [4]:
## Output Paths
# Directory under /content/drive/MyDrive used for this experiment's outputs:
# the train/eval/test CSVs and the prediction CSV are written here, and it is
# also passed to train_model as output_dir.
model_op_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp_kw/"


In [5]:
## Merge data function to get modeling data
def get_model_data(df_label, df_exp, df_kw):
  """Build the T5 modeling frame for one split.

  The label frame is inner-joined on 'sentence' with the predicted
  explanations and then with the predicted keywords. Every value is cast to
  str, and the model input is assembled as

      "<sentence> <predicted_exp> The keywords in the sentence are: <predicted_kw>"

  Args:
    df_label: frame with columns 'prefix', 'sentence', 'target_text'.
    df_exp: frame with columns 'sentence', 'predicted_exp'.
    df_kw: frame with columns 'sentence', 'predicted_kw'.

  Returns:
    A new DataFrame with columns 'prefix', 'input_text', 'target_text',
    de-duplicated on 'input_text' (first occurrence kept). If the joins match
    no rows, an empty frame with those three columns is returned.
  """
  kw_prefix = " The keywords in the sentence are: "

  # Inner joins: a sentence missing from either prediction frame is dropped,
  # and a sentence that occurs several times in an input multiplies its rows
  # (exact repeats are collapsed by the de-duplication below).
  merged = (
      df_label
      .merge(df_exp[['sentence','predicted_exp']], on = ['sentence'], how = 'inner')
      .merge(df_kw[['sentence','predicted_kw']], on = ['sentence'], how = 'inner')
      .astype(str)
  )

  # Column-wise concatenation instead of a row-wise apply: it is faster and,
  # unlike apply(axis=1), still yields a single column when `merged` is empty.
  merged['input_text'] = (
      merged['sentence'] + ' ' + merged['predicted_exp'] + kw_prefix + merged['predicted_kw']
  )

  # .copy() hands callers an independent frame they can add columns to.
  return (
      merged[['prefix','input_text','target_text']]
      .drop_duplicates(subset=['input_text'], keep='first')
      .copy()
  )

# Build the train / validation / test modeling frames from the label,
# explanation and keyword frames of each split.
train_df, eval_df, test_df = [
    get_model_data(*split_sources)
    for split_sources in (
        (df_train_base, df_train_exp, df_train_kw),
        (df_val_base, df_val_exp, df_val_kw),
        (df_test_base, df_test_exp, df_test_kw),
    )
]

# Report the resulting (rows, columns) per split.
for split_label, split_frame in (("Train", train_df), ("Val", eval_df), ("Test", test_df)):
  print(split_label + " shape: ", split_frame.shape)

Train shape:  (14060, 3)
Val shape:  (1786, 3)
Test shape:  (1759, 3)


In [6]:
## Store data
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing the three modeling splits into it.
os.makedirs(model_op_path, exist_ok=True)
train_df.to_csv(model_op_path + "train_df.csv", index=False)
eval_df.to_csv(model_op_path + "eval_df.csv", index=False)
test_df.to_csv(model_op_path + "test_df.csv", index=False)

In [8]:
# Fine-tuning configuration for the label-prediction T5 model.
model_args_label = {
    "max_seq_length": 250,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "num_train_epochs": 5,
    "evaluate_during_training": True,
    # NOTE(review): 15000 exceeds the 4395 total steps of the recorded run, so a
    # step-triggered evaluation presumably never fires there -- confirm intended.
    "evaluate_during_training_steps": 15000,
    "evaluate_during_training_verbose": True,
    "use_multiprocessing": False,
    "fp16": False,
    "save_steps": -1,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "learning_rate":1e-4,
    "weight_decay":0.01,
    # Fix the random seed so repeated runs of this training cell are comparable.
    "manual_seed": 42,
}

# Start from the pretrained t5-base checkpoint and fine-tune on train_df,
# evaluating against eval_df; training outputs are written to model_op_path.
model = T5Model("t5", "t5-base", args=model_args_label)
model.train_model(train_df, eval_data=eval_df, output_dir=model_op_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/14060 [00:00<?, ?it/s]



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/1786 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/1786 [00:00<?, ?it/s]



Running Epoch 2 of 5:   0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/1786 [00:00<?, ?it/s]



Running Epoch 3 of 5:   0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/1786 [00:00<?, ?it/s]



Running Epoch 4 of 5:   0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/1786 [00:00<?, ?it/s]



(4395,
 {'global_step': [879, 1758, 2637, 3516, 4395],
  'eval_loss': [0.4968668632209301,
   0.3702148709978376,
   0.3913873807926263,
   0.4211116999254695,
   0.45436920165749534],
  'train_loss': [0.758188784122467,
   0.4011596441268921,
   0.448270320892334,
   0.22849391400814056,
   0.2877044975757599]})

In [9]:
# Generation settings used when predicting on the test split.
model_args_test = {
    "overwrite_output_dir": True,
    "max_seq_length": 250,
    "eval_batch_size": 32,
    "use_multiprocessing": False,
    "num_beams": 3,
    # Sampling is enabled, so generation draws random numbers; the seed below
    # keeps the predictions (and the accuracy computed from them) repeatable.
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
    "manual_seed": 42,
}

# Load the trained model
# NOTE(review): this is a local path, not under model_op_path -- presumably
# where training wrote its best checkpoint; verify before a fresh run.
model = T5Model("t5", "/content/outputs/best_model", args=model_args_test)

In [10]:
# Prepare the data for testing
tasks = test_df["prefix"].tolist()
truth = test_df["target_text"].tolist()

# Every model input has the form "<prefix>: <input_text>".
to_predict = [
    task + ": " + str(text)
    for task, text in zip(tasks, test_df["input_text"].tolist())
]

In [11]:
# Get the model predictions
preds = model.predict(to_predict)

## Taking only the top (first) predictions
# The first element of each entry in preds is kept as the top prediction;
# the full entry is stored alongside it.
top_preds = [candidates[0] for candidates in preds]
test_df = test_df.assign(predicted=top_preds, all_preds=preds)

## Saving predictions
predictions_csv = model_op_path + 'df_test_predictions.csv'
test_df.to_csv(predictions_csv, index=False)

Generating outputs:   0%|          | 0/55 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/5277 [00:00<?, ?it/s]

In [12]:
# Display the test frame, now including the 'predicted' and 'all_preds' columns.
test_df

Unnamed: 0,prefix,input_text,target_text,predicted,all_preds
0,label,Mary Mary Mary i know it god world trump is fa...,normal,normal,"[normal, normal, normal]"
1,label,in 2 there was 2 million white christian south...,hate,hate,"[hate, hate, hate]"
2,label,Albert there was no moslem ban this sentence c...,normal,normal,"[normal, normal, normal]"
3,label,spread you bitches like a butter knife this se...,offensive,offensive,"[offensive, offensive, offensive]"
4,label,i feel that i am a muslim and you whores need ...,offensive,hate,"[hate, hate, hate]"
...,...,...,...,...,...
1754,label,o brien said that the vast majority of the 9 i...,normal,offensive,"[offensive, offensive, offensive]"
1755,label,me quietly sitting in class with my friend and...,offensive,normal,"[normal, normal, normal]"
1756,label,Christopher Christopher Christopher no one wan...,normal,normal,"[normal, normal, normal]"
1757,label,teacher asks class where is pakistan little jo...,normal,normal,"[normal, normal, normal]"


In [13]:
# Accuracy: number of rows whose top prediction equals the target label,
# divided by the number of test rows.
is_correct = test_df['predicted'] == test_df['target_text']
acc = is_correct.sum() / test_df.shape[0]
acc

0.6787947697555429

In [14]:
# Move the local /content/outputs directory into this experiment's folder on Drive.
# NOTE(review): once moved, /content/outputs no longer exists, so cells that
# read from it would presumably need re-training first -- confirm when re-running.
!mv /content/outputs /content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp_kw/