In [9]:
import torch
import pickle
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [3]:
# Confirm that the GPU is detected
torch.cuda.is_available()

True

In [4]:
# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 2


In [10]:
# Data used from the following source: https://aclanthology.org/2020.emnlp-main.473/ 

aave_csv = pd.read_csv("/kaggle/input/dialect-samples/aave_samples.csv",header=None)
sae_csv = pd.read_csv("/kaggle/input/dialect-samples/sae_samples.csv",header=None)

> The AAE to SAE translation samples dataset is obtained from the [Groenwold et al., EMNLP 2020](http://https://aclanthology.org/2020.emnlp-main.473/) paper.

In [11]:
aave_csv.head()

Unnamed: 0,0
0,Sooo Manti Te'o was having a online/phone rela...
1,this lil girl aint going to win im the king of...
2,He up stairs rights now and I'm down here gett...
3,Shit I Am Who Am..Fresh up out of Apologize..I...
4,It's very rare that I get what I want. Now tha...


In [12]:
sae_csv.head()

Unnamed: 0,0
0,Manti Te'o was having a relationship via telep...
1,The little girl is not going to win because i ...
2,He is upstairs rights now and I'm down here ge...
3,"Shit, I am who I am. I'm done apologizing. I'm..."
4,"It is very rare that I get what I want, but no..."


In [13]:
aave_csv.rename(columns = {0:'AAVE'},inplace = True)
sae_csv.rename(columns = {0:'SAE'},inplace = True)

In [14]:
merged_df = pd.concat([aave_csv, sae_csv], axis=1)
merged_df.head()

Unnamed: 0,AAVE,SAE
0,Sooo Manti Te'o was having a online/phone rela...,Manti Te'o was having a relationship via telep...
1,this lil girl aint going to win im the king of...,The little girl is not going to win because i ...
2,He up stairs rights now and I'm down here gett...,He is upstairs rights now and I'm down here ge...
3,Shit I Am Who Am..Fresh up out of Apologize..I...,"Shit, I am who I am. I'm done apologizing. I'm..."
4,It's very rare that I get what I want. Now tha...,"It is very rare that I get what I want, but no..."


In [15]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing
train_df, test_df = train_test_split(merged_df, test_size=0.1)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [16]:
len(train_dataset), len(test_dataset)

(1817, 202)

## Training MBart to perform AAE-to-SAE translation

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('facebook/mbart-large-en-ro')

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [18]:
def tokenize_function(examples):
    # Tokenize both the AAE (input) and SAE (target) sentences.
    model_inputs = tokenizer(examples["AAVE"], max_length=128, padding="max_length", truncation=True)

    # Tokenize the labels without using return_tensors="pt" to keep them as lists
    labels = tokenizer(examples["SAE"], max_length=128, padding="max_length", truncation=True)
    
    # Update model_inputs to include labels; ensure labels are lists
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [19]:
# Mapping the tokenization function over the datasets.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
# Inspect the first few examples from the tokenized training dataset
for i in range(5):
    print("AAVE:", tokenizer.decode(tokenized_train_dataset[i]['input_ids'], skip_special_tokens=True))
    print("SAE:", tokenizer.decode(tokenized_train_dataset[i]['labels'], skip_special_tokens=True), '\n')

AAVE: 's message "I will give you rest" must of worked last night - found stress ball under a chair inthe worship center this morning.
SAE: the message \"I will give you rest\" must have worked last night. I found a stress ball under a chair in the worship center this morning 

AAVE: Whatup, I'm cool. Just been staying out the way. How youand that lil lady doing?
SAE: Hello, I'm doing well. I've just been behaving myself and staying out of trouble. How are youand your wife doing 

AAVE: Never SHOW A Man That Your A Good Woman Cus OBVIOUSLY He's Gonna Take Advantage Of It, LET HIMLEARN && EARN POINTS!
SAE: Don't ever show a man that you're a good woman because they obviously will take advantage of it, let himfind out on his own and respect you for what you are 

AAVE: I never liked Gucci. now everybody sees the royal flop ness monster! He been weak. It shouldn't have taken his breakdownfor it 2 b known
SAE: I never liked Gucci. Now everybody can sees the \"Royal Flop Ness Monster\"! He 

In [21]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

model = AutoModelForSeq2SeqLM.from_pretrained('facebook/mbart-large-en-ro', device_map='auto')

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

In [22]:
training_args = TrainingArguments(
    output_dir='./results',          # directory for saving models and checkpoints
    num_train_epochs=5,              # number of training epochs, adjust as needed
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)

> Takes ~20 minutes to train for 5 epochs over ~1.8k samples

In [24]:
trainer.train()  

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112792277776862, max=1.0…



Step,Training Loss
10,10.7234
20,10.0563
30,9.5612
40,9.3297
50,9.0001
60,8.6507
70,8.2569
80,7.7737
90,7.3633
100,6.7062


Non-default generation parameters: {'max_length': 1024, 'num_beams': 5, 'forced_eos_token_id': 2}


TrainOutput(global_step=570, training_loss=2.1884235172940976, metrics={'train_runtime': 1166.455, 'train_samples_per_second': 7.789, 'train_steps_per_second': 0.489, 'total_flos': 2461046742712320.0, 'train_loss': 2.1884235172940976, 'epoch': 5.0})

In [25]:
trainer.evaluate(eval_dataset=tokenized_test_dataset)

{'eval_loss': 0.36918875575065613,
 'eval_runtime': 9.1737,
 'eval_samples_per_second': 22.019,
 'eval_steps_per_second': 1.417,
 'epoch': 5.0}

In [26]:
model_fname = 'mbart_translate_5epochs.sav'

# Save the model to disk
pickle.dump(model, open(model_fname, 'wb'))

In [27]:
# Get the file download link manually 

%cd /kaggle/working

from IPython.display import FileLink 
FileLink(model_fname)

/kaggle/working


## Demo: AAE to SAE translation example using MBart

In [31]:
# Load the model from disk

filename = '/kaggle/working/' + model_fname
translation_model = pickle.load(open(filename, 'rb'))

In [32]:
def translate_aae_to_sae(sentence, model, tokenizer, device='cuda'):
    # Move the model to the specified device
    model.to(device)

    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move the input tensors to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate translation output
    output_sequences = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=256,
    )

    # Decode the output
    translated_sentence = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    return translated_sentence

In [37]:
aae_sentence = "Where you been"
sae_translation = translate_aae_to_sae(aae_sentence, translation_model, tokenizer)
print(f"AAE: {aae_sentence}\nSAE: {sae_translation}")

AAE: Where you been
SAE: Where are you been?


In [38]:
aae_sentence = "Boy you can say anything you wanna I don't give a shit, noone else can have ya"
sae_translation = translate_aae_to_sae(aae_sentence, translation_model, tokenizer)
print(f"AAE: {aae_sentence}\nSAE: {sae_translation}")

AAE: Boy you can say anything you wanna I don't give a shit, noone else can have ya
SAE: You can say anything you want to, I don't give a shit. Noone else can have you
