In [2]:
import torch
import pickle
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

2024-03-17 00:43:33.122469: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-17 00:43:33.122600: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-17 00:43:33.274441: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Confirm that the GPU is detected
torch.cuda.is_available()

True

In [4]:
# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 2


In [5]:
# Data used from the following source: https://aclanthology.org/2020.emnlp-main.473/ 

aave_csv = pd.read_csv("/kaggle/input/dialect-samples/aave_samples.csv",header=None)
sae_csv = pd.read_csv("/kaggle/input/dialect-samples/sae_samples.csv",header=None)

> The AAE to SAE translation samples dataset is obtained from the [Groenwold et al., EMNLP 2020](http://https://aclanthology.org/2020.emnlp-main.473/) paper.

In [6]:
aave_csv.head()

Unnamed: 0,0
0,Sooo Manti Te'o was having a online/phone rela...
1,this lil girl aint going to win im the king of...
2,He up stairs rights now and I'm down here gett...
3,Shit I Am Who Am..Fresh up out of Apologize..I...
4,It's very rare that I get what I want. Now tha...


In [7]:
sae_csv.head()

Unnamed: 0,0
0,Manti Te'o was having a relationship via telep...
1,The little girl is not going to win because i ...
2,He is upstairs rights now and I'm down here ge...
3,"Shit, I am who I am. I'm done apologizing. I'm..."
4,"It is very rare that I get what I want, but no..."


In [8]:
aave_csv.rename(columns = {0:'AAVE'},inplace = True)
sae_csv.rename(columns = {0:'SAE'},inplace = True)

In [9]:
merged_df = pd.concat([aave_csv, sae_csv], axis=1)
merged_df.head()

Unnamed: 0,AAVE,SAE
0,Sooo Manti Te'o was having a online/phone rela...,Manti Te'o was having a relationship via telep...
1,this lil girl aint going to win im the king of...,The little girl is not going to win because i ...
2,He up stairs rights now and I'm down here gett...,He is upstairs rights now and I'm down here ge...
3,Shit I Am Who Am..Fresh up out of Apologize..I...,"Shit, I am who I am. I'm done apologizing. I'm..."
4,It's very rare that I get what I want. Now tha...,"It is very rare that I get what I want, but no..."


In [10]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing
train_df, test_df = train_test_split(merged_df, test_size=0.1)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [11]:
len(train_dataset), len(test_dataset)

(1817, 202)

## Fine-tuning MBart to perform AAE-to-SAE translation

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('facebook/mbart-large-en-ro')

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [13]:
def tokenize_function(examples):
    # Tokenize both the AAE (input) and SAE (target) sentences.
    model_inputs = tokenizer(examples["AAVE"], max_length=128, padding="max_length", truncation=True)

    # Tokenize the labels without using return_tensors="pt" to keep them as lists
    labels = tokenizer(examples["SAE"], max_length=128, padding="max_length", truncation=True)
    
    # Update model_inputs to include labels; ensure labels are lists
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [14]:
# Mapping the tokenization function over the datasets.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
# Inspect the first few examples from the tokenized training dataset
for i in range(5):
    print("AAVE:", tokenizer.decode(tokenized_train_dataset[i]['input_ids'], skip_special_tokens=True))
    print("SAE:", tokenizer.decode(tokenized_train_dataset[i]['labels'], skip_special_tokens=True), '\n')

AAVE: I'm doing 2 much this winter so I can stunt forthe summer...I'm such a nigger
SAE: I am doing too much this winter so that I can stunt forthe summer. I am such a black man. 

AAVE: I wish I knew how 2 erase personal messegages I sent in regrettion. Yea I regret amessage or 2 I sent.
SAE: I want a take back on my messages that i sent. The few messages. 

AAVE: I have a lot on my mind about wat is in store for me this year n in the future. Neverbeen hyped up like this.
SAE: I have a lot on my mind about what is in store for me this year and in the future. I've neverbeen excited like this 

AAVE: I wonder how many dudes getting called up to "Boo" status not knowing they beingused to Valentine Day Dates...ONLY!
SAE: I wonder how many guys who are now being called boyfriends don't realize that they used to be Valentine's Day Dates, only 

AAVE: I always expect the worst and hope for the best.. The easiestway to NOT be disappointed
SAE: I always expect the worst, and hope for the best.

In [16]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

model = AutoModelForSeq2SeqLM.from_pretrained('facebook/mbart-large-en-ro', device_map='auto')

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

In [17]:
training_args = TrainingArguments(
    output_dir='./results',          # directory for saving models and checkpoints
    num_train_epochs=5,              # number of training epochs, adjust as needed
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)

Training takes ~20 minutes to train for 5 epochs over ~1.8k samples

In [19]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
10,10.7761
20,9.9545
30,9.6621
40,9.3059
50,9.0435
60,8.7044
70,8.2695
80,7.8112
90,7.2597
100,6.6617


Non-default generation parameters: {'max_length': 1024, 'num_beams': 5, 'forced_eos_token_id': 2}


TrainOutput(global_step=570, training_loss=2.1803237739362213, metrics={'train_runtime': 1199.74, 'train_samples_per_second': 7.572, 'train_steps_per_second': 0.475, 'total_flos': 2461046742712320.0, 'train_loss': 2.1803237739362213, 'epoch': 5.0})

In [20]:
trainer.evaluate(eval_dataset=tokenized_test_dataset)

{'eval_loss': 0.42615562677383423,
 'eval_runtime': 9.5442,
 'eval_samples_per_second': 21.165,
 'eval_steps_per_second': 1.362,
 'epoch': 5.0}

In [21]:
model_fname = 'mbart_translate_5epochs.sav'

# Save the model to disk
pickle.dump(model, open(model_fname, 'wb'))

In [22]:
# Get the file download link manually 

%cd /kaggle/working

from IPython.display import FileLink 
FileLink(model_fname)

/kaggle/working


## Demo: AAE to SAE translation example using fine-tuned mBART

In [31]:
# Load the model from disk

filename = '/kaggle/working/' + model_fname
translation_model = pickle.load(open(filename, 'rb'))

In [24]:
def translate_aae_to_sae(sentence, model, tokenizer, device='cuda'):
    # Move the model to the specified device
    model.to(device)

    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move the input tensors to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate translation output
    output_sequences = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=256,
    )

    # Decode the output
    translated_sentence = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    return translated_sentence

In [25]:
aae_sentence = "Where you been"
sae_translation = translate_aae_to_sae(aae_sentence, translation_model, tokenizer)
print(f"AAE: {aae_sentence}\nSAE: {sae_translation}")

AAE: Where you been
SAE: Where have you been


In [26]:
aae_sentence = "Boy you can say anything you wanna I don't give a shit, noone else can have ya"
sae_translation = translate_aae_to_sae(aae_sentence, translation_model, tokenizer)
print(f"AAE: {aae_sentence}\nSAE: {sae_translation}")

AAE: Boy you can say anything you wanna I don't give a shit, noone else can have ya
SAE: You can say anything you want to, I don't give a shit. Noone else can have you


In [27]:
aae_sentence = "Wassup, nigga?"
sae_translation = translate_aae_to_sae(aae_sentence, translation_model, tokenizer)
print(f"AAE: {aae_sentence}\nSAE: {sae_translation}")

AAE: Wassup, nigga?
SAE: What's up, man?


### Translate AAE samples to SAE in the working dataset

In [30]:
# Load the cleaned, preprocessed dataset

tweet_df = pd.read_csv("/kaggle/input/processed-data/Preprocessed_Data_Final.csv")
tweet_df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)
tweet_df.rename(columns={'new_class': 'label', 'cleaned_tweet': 'clean_tweet'}, inplace=True)

In [36]:
def apply_translation(row):
    if row['AAVE'] == 1:
        return translate_aae_to_sae(row['clean_tweet'], translation_model, tokenizer)
    else:
        return row['clean_tweet']

In [34]:
tweet_df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,AAVE_hate_speech,AAVE,label,clean_tweet,updated_label
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,0,0,!!! RT : As a woman you shouldn't complain abo...,0
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0,0,0,!!!!! RT : boy dats cold...tyga dwn bad for cu...,0
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,0,0,0,!!!!!!! RT Dawg!!!! RT : You ever fuck a bitch...,0
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0,0,0,!!!!!!!!! RT : she look like a tranny,0
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,0,0,0,!!!!!!!!!!!!! RT : The shit you hear about me ...,0


In [37]:
tweet_df['SAE_tweet'] = tweet_df.apply(apply_translation, axis=1)

In [38]:
tweet_df.isnull().sum()

count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
AAVE_hate_speech      0
AAVE                  0
label                 0
clean_tweet           1
updated_label         0
SAE_tweet             1
dtype: int64

In [39]:
tweet_df.dropna(inplace = True)

In [40]:
tweet_df.isnull().sum()

count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
AAVE_hate_speech      0
AAVE                  0
label                 0
clean_tweet           0
updated_label         0
SAE_tweet             0
dtype: int64

In [41]:
# Save the processed file with translations to disk 

tweet_df.to_csv('processed_tweets_with_sae.csv')

In [42]:
tweet_df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,AAVE_hate_speech,AAVE,label,clean_tweet,updated_label,SAE_tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,0,0,!!! RT : As a woman you shouldn't complain abo...,0,!!! RT : As a woman you shouldn't complain abo...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0,0,0,!!!!! RT : boy dats cold...tyga dwn bad for cu...,0,!!!!! RT : boy dats cold...tyga dwn bad for cu...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,0,0,0,!!!!!!! RT Dawg!!!! RT : You ever fuck a bitch...,0,!!!!!!! RT Dawg!!!! RT : You ever fuck a bitch...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0,0,0,!!!!!!!!! RT : she look like a tranny,0,!!!!!!!!! RT : she look like a tranny
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,0,0,0,!!!!!!!!!!!!! RT : The shit you hear about me ...,0,!!!!!!!!!!!!! RT : The shit you hear about me ...


In [43]:
aae_df = tweet_df[tweet_df["AAVE"]==1]
len(aae_df)

1201

In [48]:
aae_hate_sample = aae_df[aae_df['label']==1].iloc[1]
aae_not_hate_sample = aae_df[aae_df['label']==0].iloc[1]

print("AAVE not hate speech example: \n")
print("AAE text: ", aae_not_hate_sample['clean_tweet'])
print("SAE text: ", aae_not_hate_sample['SAE_tweet'])
print("\n-----------------------------------\n")

print("AAVE hate speech example: \n")
print("AAE text: ", aae_hate_sample['clean_tweet'])
print("SAE text: ", aae_hate_sample['SAE_tweet'])

AAVE not hate speech example: 

AAE text:  " if you aint bout that Murder Game pussy nigga shut up "
SAE text:  \"If you are not interested in that murder game, shut the fuck up.\

-----------------------------------

AAVE hate speech example: 

AAE text:  You aint special niglet RT : When I'm chilling wit Greg ppl walk up to me to ask me to take pics
SAE text:  You're not special black man. When I'm chilling with Greg people walk up to me to ask me to take pictures.
