In [1]:
import pandas as pd
import datasets
import numpy as np
import regex as re
import torch
from nltk.stem import PorterStemmer
from transformers import AutoTokenizer ,AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Hugging Face checkpoint name used to load the module-level tokenizer below.
model_ckpt:str='distilbert-base-uncased'
# Module-level tokenizer for ad-hoc use in the notebook; the `trainer` class
# defined further down loads its own tokenizer from its `model_ckpt` argument.
tokenizer=AutoTokenizer.from_pretrained(model_ckpt)
# num_labels:int=2
# data_labels=['positive', 'negative']
# batch_size:int=64

In [42]:
class trainer():
    """Fine-tune a sequence-classification checkpoint on a two-column sentiment CSV.

    The pipeline is: load CSV -> label-encode -> clean/stem text -> shuffle and
    split 60/20/20 -> tokenize -> fine-tune with the Hugging Face ``Trainer``
    -> save the model to ``self.save_path``.

    Attributes:
        model_ckpt: Checkpoint name passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.
        batch_size: Per-device batch size for both training and evaluation.
        seed: Default random state used when shuffling in ``split_data``.
        tokenizer: Tokenizer loaded from ``model_ckpt``.
        stemmer: Shared Porter stemmer used by ``preprocess_text``.
        data_labels: Human-readable label names (kept as originally declared).
        label_classes_: Set by ``training``; label names in encoded-id order.
        save_path: Directory used for checkpoints and the final saved model.
    """

    # Tokens starting with one of these characters (e.g. @mentions) are dropped.
    ENTITY_PREFIXES = ('@',)

    def __init__(self,
                 model_ckpt = "distilbert-base-uncased",
                 num_labels=2,
                 batch_size = 64,
                 seed=42
                 # data_path='D:\Codes\sentiment-fastapi/airline_sentiment_analysis.csv',
                 ):
        """Store the configuration and load the tokenizer for ``model_ckpt``.

        Args:
            model_ckpt: Checkpoint name or path for tokenizer and model.
            num_labels: Number of target classes.
            batch_size: Per-device train/eval batch size.
            seed: Random state for the train/validation/test shuffle, so that
                repeated runs produce the same split.
        """
        self.model_ckpt=model_ckpt
        self.num_labels=num_labels
        self.batch_size=batch_size
        self.seed=seed
        self.tokenizer=AutoTokenizer.from_pretrained(model_ckpt)
        # Built once here instead of once per tweet inside preprocess_text.
        self.stemmer=PorterStemmer()
        # self.data_path=data_path
        # NOTE(review): LabelEncoder assigns ids in sorted class order, so for
        # 'negative'/'positive' labels id 0 would be 'negative' -- the reverse
        # of this list. Confirm before using data_labels as an id -> name
        # lookup; `label_classes_` (set by `training`) holds the fitted order.
        self.data_labels=['positive', 'negative']
        self.save_path=f"{model_ckpt}-finetuned-emotion"

    def load_data(self, path):
        """Read the CSV at ``path`` and return a frame with columns ``text``, ``label``.

        The first CSV column is consumed as the index and discarded; the two
        remaining columns are taken to be (label, text) in that order.
        """
        data=(pd.read_csv(path, index_col=0, header=[0])).reset_index(drop=True)
        data.columns=['label','text']
        data=data[['text','label']]
        return data

    def preprocess_text(self,text):
        """Clean one tweet and return it as a space-joined string of stemmed tokens.

        Cleaning runs on the raw text *before* stemming: the stemmer lowercases
        its input, so a leading ``RT`` could never be matched afterwards.

        Steps: strip a leading ``RT``, remove hyperlinks (only the link token,
        not the rest of the tweet), remove ``$TICKER`` tokens, remove ``#``
        signs (keeping the word), drop @mentions, stem each remaining token.

        Args:
            text: Raw tweet. Non-string values (e.g. NaN/None) yield ``""``.

        Returns:
            The cleaned, stemmed tweet.
        """
        if not isinstance(text, str):
            return ''

        # remove twitter abbreviations
        tweet = re.sub(r'^RT\s+', '', text.strip())
        # remove hyperlinks (up to the next whitespace only)
        tweet = re.sub(r'https?://\S+', '', tweet)
        # remove stock market tickers
        tweet = re.sub(r'\$\w*', '', tweet)
        # only removing the hash # sign from the word
        tweet = re.sub(r'#', '', tweet)

        words = [
            self.stemmer.stem(word)
            for word in tweet.split()
            if word[0] not in self.ENTITY_PREFIXES
        ]
        return ' '.join(words)

    def split_data(self, data, random_state=None):
        """Shuffle ``data`` and split it 60% / 20% / 20%.

        Args:
            data: DataFrame to split.
            random_state: Seed for the shuffle; defaults to ``self.seed``.

        Returns:
            ``(train, validate, test)`` DataFrames.
        """
        if random_state is None:
            random_state = self.seed
        shuffled = data.sample(frac=1, random_state=random_state)
        train_end = int(.6*len(shuffled))
        validate_end = int(.8*len(shuffled))
        # Positional slicing replaces np.split on a DataFrame; same boundaries.
        train = shuffled.iloc[:train_end]
        validate = shuffled.iloc[train_end:validate_end]
        test = shuffled.iloc[validate_end:]
        return train, validate, test

    def create_dataset(self,train,validate,test):
        """Wrap the three DataFrames in a ``DatasetDict``.

        Returns:
            A ``datasets.DatasetDict`` with keys ``train``, ``validation``, ``test``.
        """
        # preserve_index=False keeps the shuffled pandas index out of the
        # dataset columns.
        train_dataset = datasets.Dataset.from_pandas(train, preserve_index=False)
        test_dataset = datasets.Dataset.from_pandas(test, preserve_index=False)
        validation_dataset = datasets.Dataset.from_pandas(validate, preserve_index=False)
        my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"validation":validation_dataset,"test":test_dataset})
        return my_dataset_dict

    # Original (misspelled) name kept so existing callers continue to work.
    create_dateset = create_dataset

    def tokenize(self,batch):
        """Tokenize ``batch["text"]`` with padding and truncation enabled."""
        return self.tokenizer(batch["text"], padding=True, truncation=True)

    def compute_metrics(self,pred):
        """Return accuracy and weighted F1 for a ``Trainer`` evaluation result.

        Args:
            pred: Object exposing ``label_ids`` and ``predictions`` (logits);
                the predicted class is the argmax over the last axis.

        Returns:
            ``{"accuracy": float, "f1": float}``.
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1}

    def training(self,
                 # Raw string: same value as before, without the invalid
                 # "\C" / "\s" escape-sequence warnings.
                 load_path=r'D:\Codes\sentiment-fastapi/airline_sentiment_analysis.csv'
                 ):
        """Run the full pipeline and return the fine-tuned model.

        Args:
            load_path: CSV file to train on (see ``load_data`` for the layout).

        Returns:
            The fine-tuned model; it is also saved to ``self.save_path``.
        """
        data = self.load_data(path=load_path)

        label_encoder = LabelEncoder()
        data["label"] = label_encoder.fit_transform(data["label"])
        # Keep the fitted id -> name order so predictions can be decoded later.
        self.label_classes_ = list(label_encoder.classes_)

        data["text"] = data["text"].map(self.preprocess_text)
        train, validate, test = self.split_data(data=data)
        sentiment = self.create_dataset(train,validate,test)

        #tokenize and encode
        sentiment_encoded = sentiment.map(self.tokenize, batched=True, batch_size=None)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = (AutoModelForSequenceClassification
                 .from_pretrained(self.model_ckpt, num_labels=self.num_labels)
                 .to(device))

        # Log roughly once per epoch; never 0, which happens when the train
        # split is smaller than one batch.
        logging_steps = max(1, len(sentiment_encoded["train"]) // self.batch_size)
        training_args = TrainingArguments(output_dir=self.save_path,
                                          num_train_epochs=2,
                                          learning_rate=2e-5,
                                          per_device_train_batch_size=self.batch_size,
                                          per_device_eval_batch_size=self.batch_size,
                                          weight_decay=0.01,
                                          evaluation_strategy="epoch",
                                          disable_tqdm=False,
                                          logging_steps=logging_steps,
                                          push_to_hub=False,
                                          log_level="error")

        # Named hf_trainer so the local does not shadow this class's own name.
        hf_trainer = Trainer(model=model,
                             args=training_args,
                             compute_metrics=self.compute_metrics,
                             train_dataset=sentiment_encoded["train"],
                             eval_dataset=sentiment_encoded["validation"],
                             tokenizer=self.tokenizer)

        hf_trainer.train()
        hf_trainer.save_model(self.save_path)
        return model
    # def train_model(data):

    #     sentiment_encoded = data.map(tokenize, batched=True, batch_size=None)
        
    #     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #     model = (AutoModelForSequenceClassification
    #                 .from_pretrained(model_ckpt, num_labels=2).to(device))
    #     logging_steps = len(data["train"]) // batch_size
    #     model_name = f"{model_ckpt}-finetuned-emotion"
    #     training_args = TrainingArguments(output_dir=model_name,
    #                                         num_train_epochs=2,
    #                                         learning_rate=2e-5,
    #                                         per_device_train_batch_size=batch_size,
    #                                         per_device_eval_batch_size=batch_size,
    #                                         weight_decay=0.01,
    #                                         evaluation_strategy="epoch",
    #                                         disable_tqdm=False,
    #                                         logging_steps=logging_steps,
    #                                         push_to_hub=False, 
    #                                         log_level="error")
    #     trainer = Trainer(model=model, args=training_args, 
    #                         # compute_metrics=compute_metrics,
    #                         train_dataset=data["train"],
    #                         eval_dataset=data["validation"],
    #                         tokenizer=tokenizer)

    #     trainer.train()
    #     return model

In [43]:
# Build the fine-tuning helper with its default checkpoint, label count and batch size.
obj= trainer()
# Run the whole pipeline on the default CSV path; `training` returns the fine-tuned model.
model=obj.training()

100%|██████████| 1/1 [00:00<00:00,  3.59ba/s]
100%|██████████| 1/1 [00:00<00:00, 10.83ba/s]
100%|██████████| 1/1 [00:00<00:00,  8.83ba/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialize

{'loss': 0.3219, 'learning_rate': 1.0091743119266055e-05, 'epoch': 0.99}




TypeError: trainer.compute_metrics() takes 1 positional argument but 2 were given

In [6]:
# #load and transform data
# obj = trainer()

# data=obj.load_data('D:\Codes\sentiment-fastapi/airline_sentiment_analysis.csv')
# le = LabelEncoder()
# data.label=le.fit(data.label).transform(data.label)
# data.text = [obj.preprocess_text(data.text[i]) for i in range(len(data))]
# train, validate, test = obj.split_data(data=data)
# sentiment=obj.create_dateset(train,validate,test)

# #tokenize and encode
# sentiment_encoded = sentiment.map(obj.tokenize, batched=True, batch_size=None)

# model=obj.training()

100%|██████████| 1/1 [00:00<00:00,  2.73ba/s]
100%|██████████| 1/1 [00:00<00:00, 12.07ba/s]
100%|██████████| 1/1 [00:00<00:00, 11.81ba/s]


In [13]:
# hide_output
# sentiment_encoded = sentiment.map(tokenize, batched=True, batch_size=None)


  0%|          | 0/218 [03:09<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  2.40ba/s]
100%|██████████| 1/1 [00:00<00:00, 11.63ba/s]
100%|██████████| 1/1 [00:00<00:00, 12.65ba/s]


In [7]:
# from transformers import Trainer, TrainingArguments

# model_ckpt = "distilbert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# num_labels=2
# num_labels=len(np.unique(sentiment_encoded["train"]['label']))
# batch_size = 64




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

In [8]:
# def training():
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   model = (AutoModelForSequenceClassification
#           .from_pretrained(model_ckpt, num_labels=2)
#           .to(device))

#   logging_steps = len(sentiment_encoded["train"]) // batch_size
#   model_name = f"{model_ckpt}-finetuned-emotion"
#   training_args = TrainingArguments(output_dir=model_name,
#                                     num_train_epochs=2,
#                                     learning_rate=2e-5,
#                                     per_device_train_batch_size=batch_size,
#                                     per_device_eval_batch_size=batch_size,
#                                     weight_decay=0.01,
#                                     evaluation_strategy="epoch",
#                                     disable_tqdm=False,
#                                     logging_steps=logging_steps,
#                                     push_to_hub=False, 
#                                     log_level="error")

#   trainer = Trainer(model=model, args=training_args, 
#                   #   compute_metrics=compute_metrics,
#                     train_dataset=sentiment_encoded["train"],
#                     eval_dataset=sentiment_encoded["validation"],
#                     tokenizer=tokenizer)
#   trainer
#   trainer.train();
#   trainer.save_model(model_name)
#   return model

 50%|█████     | 109/218 [00:58<00:46,  2.33it/s]

{'loss': 0.2747, 'learning_rate': 1.0091743119266055e-05, 'epoch': 0.99}


                                                 
 50%|█████     | 109/218 [01:04<00:46,  2.33it/s]

{'eval_loss': 0.17304207384586334, 'eval_runtime': 5.5121, 'eval_samples_per_second': 418.717, 'eval_steps_per_second': 6.713, 'epoch': 1.0}


 99%|█████████▉| 216/218 [02:03<00:01,  1.81it/s]

{'loss': 0.1365, 'learning_rate': 1.8348623853211012e-07, 'epoch': 1.98}


                                                 
100%|██████████| 218/218 [02:09<00:00,  1.69it/s]


{'eval_loss': 0.16361425817012787, 'eval_runtime': 5.529, 'eval_samples_per_second': 417.433, 'eval_steps_per_second': 6.692, 'epoch': 2.0}
{'train_runtime': 129.236, 'train_samples_per_second': 107.153, 'train_steps_per_second': 1.687, 'train_loss': 0.20459835053583897, 'epoch': 2.0}
