In [1]:
import pandas as pd
import datasets
import numpy as np
import regex as re
import torch
from nltk.stem import PorterStemmer
from transformers import AutoTokenizer ,AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_ckpt:str='distilbert-base-uncased'
tokenizer=AutoTokenizer.from_pretrained(model_ckpt)
num_labels:int=2
data_labels=['positive', 'negative']
batch_size:int=64

In [16]:
def load_data(path):
    data=(pd.read_csv(path, index_col=0, header=[0])).reset_index(drop=True)
    data.columns=['label','text']
    data=data[['text','label']]
    return data

def preprocess_text(text):
    stemmer = PorterStemmer()
    entity_prefixes = ['@']
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                word= stemmer.stem(word)
                words.append(word)
    sentence=' '.join(words)

    # remove stock market tickers
    tweet = re.sub(r'\$\w*', '', sentence)
    # remove twitter abbreviations
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    return tweet


def split_data(data):
    train, validate, test = np.split(data.sample(frac=1), [int(.6*len(data)), int(.8*len(data))])
    return train, validate, test

def create_dateset(train,validate,test):
    train_dataset = datasets.Dataset.from_dict(train)
    test_dataset = datasets.Dataset.from_dict(test)
    validation_dataset=datasets.Dataset.from_dict(validate)
    my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"validation":validation_dataset,"test":test_dataset})
    return my_dataset_dict

def batch_tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}


def train_model(data):

    sentiment_encoded = data.map(batch_tokenize, batched=True, batch_size=None)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = (AutoModelForSequenceClassification
                .from_pretrained(model_ckpt, num_labels=num_labels).to(device))
    logging_steps = len(data["train"]) // batch_size
    model_name = f"{model_ckpt}-finetuned-emotion"
    training_args = TrainingArguments(output_dir=model_name,
                                        num_train_epochs=2,
                                        learning_rate=2e-5,
                                        per_device_train_batch_size=batch_size,
                                        per_device_eval_batch_size=batch_size,
                                        weight_decay=0.01,
                                        evaluation_strategy="epoch",
                                        disable_tqdm=False,
                                        logging_steps=logging_steps,
                                        push_to_hub=False, 
                                        log_level="error")
    trainer = Trainer(model=model, args=training_args, 
                        # compute_metrics=compute_metrics,
                        train_dataset=data["train"],
                        eval_dataset=data["validation"],
                        tokenizer=tokenizer)

    trainer.train()
    return model

In [17]:
#load and transform data
data=load_data('D:\Codes\sentiment-fastapi/airline_sentiment_analysis.csv')
# data.text = [preprocess_text(data.text[i]) for i in range(len(data))]
train, validate, test = split_data(data=data)
sentiment=create_dateset(train,validate,test)

#tokenize and encode
sentiment_encoded = sentiment.map(batch_tokenize, batched=True, batch_size=None)
model=train_model(data=sentiment)


[A
100%|██████████| 1/1 [00:00<00:00,  3.77ba/s]

100%|██████████| 1/1 [00:00<00:00, 11.07ba/s]

100%|██████████| 1/1 [00:00<00:00, 10.18ba/s]

  0%|          | 0/218 [00:35<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  2.29ba/s]
100%|██████████| 1/1 [00:00<00:00, 11.41ba/s]
100%|██████████| 1/1 [00:00<00:00, 11.47ba/s]
  0%|          | 0/218 [00:00<?, ?it/s]

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']

In [18]:
sentiment_encoded["train"].column_names

['text', 'label', 'input_ids', 'attention_mask']

In [171]:
# def strip_all_tags(text):
#     entity_prefixes = ['@']
#     words = []
#     for word in text.split():
#         word = word.strip()
#         if word:
#             if word[0] not in entity_prefixes:
#                 words.append(word)
#     return ' '.join(words)

In [172]:
# data.text = [strip_all_tags(data.text[i]) for i in range(len(data))]

In [202]:
data.label.unique()

array(['positive', 'negative'], dtype=object)