# Sentiment Analysis in Spanglish

The dataset that I used here is https://ritual-uh.github.io/sentimix2020/res.

For data processing, I convert the dataset from the CoNLL format (one token per line) into CSV files that can be loaded as a Hugging Face `datasets` dataset.

# Data Processing

In [None]:
#Step 1: Load the text file into a list of batches, one batch per sentence.
#Each sentence starts with a header line of the form 'meta\t1\tpositive\n';
#the lines that follow belong to that sentence until the next header.
#NOTE(review): assumes the file is UTF-8 encoded -- confirm against the dataset.
with open('spanglish_trial.txt', encoding='utf-8') as f:
    #lines is a list of lists: every line split by '\t'
    lines = []
    for raw_line in f:
        #strip only the newline itself: slicing off the last character would
        #eat real data when the final line has no trailing newline
        fields = raw_line.rstrip('\n').split('\t')
        #blank separator lines carry no token, so they are skipped
        if fields == ['']:
            continue
        lines.append(fields)

#divide everything in batches started by meta
#each batch is a list of lists
batches = []
batch = []
for line in lines:
    if line[0] == 'meta' and batch:
        #a new header closes the batch collected so far
        batches.append(batch)
        batch = []
    batch.append(line)
#the last sentence has no following header to close it, so flush it here
if batch:
    batches.append(batch)

In [None]:
#Output a csv file
#general_dic maps every sentence (tokens joined by single spaces) to its mood.
#A sentence that occurs more than once keeps the mood of its last occurrence.
import csv
general_dic = dict()
for batch in batches:
    if not batch:
        continue
    mood = ""
    tokens = []
    for piece in batch:
        if piece[0] == 'meta':
            #header line: the third field holds the sentiment label
            mood = piece[2]
        elif piece[0]:
            #empty tokens are skipped so they do not leave stray spaces
            tokens.append(piece[0])
    general_dic[' '.join(tokens)] = mood

#newline='' lets the csv module control line endings itself
with open('spanglish_trial.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    #one row per unique sentence: sentence, mood
    writer.writerows(general_dic.items())

In [None]:
#Randomly split general_dic into train / test / validation csv files
#(roughly 80% / 10% / 10%). The split is not seeded, so it differs per run.
import csv
import random

#The context manager closes all three files even if a write fails.
#utf-8 matches the encoding used for spanglish_trial.csv; newline='' lets the
#csv module control line endings itself.
with open('spanglish_train.csv', 'w', encoding='utf-8', newline='') as trainfile, \
        open('spanglish_test.csv', 'w', encoding='utf-8', newline='') as testfile, \
        open('spanglish_valid.csv', 'w', encoding='utf-8', newline='') as validfile:
    trainfilecsv = csv.writer(trainfile)
    testfilecsv = csv.writer(testfile)
    validfilecsv = csv.writer(validfile)

    for sentence, mood in general_dic.items():
        #one draw in [0, 1) decides which file receives the row
        rand = random.random()
        if rand < 0.8:
            trainfilecsv.writerow([sentence, mood])
        elif rand < 0.9:
            testfilecsv.writerow([sentence, mood])
        else:
            validfilecsv.writerow([sentence, mood])

Load the CSV files into a Hugging Face dataset and map the string labels to integer ids.

In [None]:
#Read the three csv splits into one dataset object and turn the string
#labels into integer class ids
import torch
from datasets import load_dataset

data_files = {
    'train': 'spanglish_train.csv',
    'validation': 'spanglish_valid.csv',
    'test': 'spanglish_test.csv',
}
#the csv files have no header row, so the column names are supplied here
dataset = load_dataset(
    'csv',
    data_files=data_files,
    delimiter=',',
    column_names=['sentence', 'label'],
)

label_to_id = {'positive': 1, 'negative': 0, 'neutral': 2}


def _encode_label(example):
    #replace the string label of one example with its integer id
    return {'label': label_to_id[example['label']]}


dataset = dataset.map(_encode_label)

In [None]:
#Stuff relevant to model: tokenizer, tokenized dataset, padding collator and
#the 3-class classification model, all based on the same checkpoint
#NOTE(review): AdamW is not used in this cell; the transformers implementation
#is deprecated in favour of torch.optim.AdamW -- confirm before upgrading.
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

modelname = 'Twitter/twhin-bert-base'
tokenizer = AutoTokenizer.from_pretrained(modelname)

def tokenize_function(example):
    #tokenize the "sentence" column; truncation=True is passed to the tokenizer
    return tokenizer(example["sentence"], truncation=True)

#batched=True hands tokenize_function a batch of examples per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)
#the collator is built once (the cell used to build it twice)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
#num_labels=3 matches the three sentiment classes
model = AutoModelForSequenceClassification.from_pretrained(modelname, num_labels=3)

In [None]:
#Metric callback handed to the trainer: it reports the accuracy only.
#eval_preds unpacks into two items, the logits and the reference labels.
#The logits are reduced to a class id per example with argmax over the last
#axis, and those ids are compared against the labels.
#Manual alternative for the validation set:
#predictions = trainer.predict(tokenized_datasets["validation"])
import numpy as np

import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(eval_preds):
    #split the pair into raw scores and reference labels
    scores, references = eval_preds
    #the highest-scoring class along the last axis is the prediction
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# Training

In [None]:
#training
from transformers import Trainer, TrainingArguments

#evaluate and log once per epoch for 15 epochs, writing to "test_trainer";
#save_total_limit=1 caps the number of checkpoints kept on disk
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=15.0,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
)

#train on the "train" split and evaluate on the "validation" split
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()