In [1]:
import pandas as pd
import datasets
import numpy as np
import regex as re
import torch
from nltk.stem import PorterStemmer
from transformers import AutoTokenizer ,AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# model_ckpt:str='distilbert-base-uncased'
# tokenizer=AutoTokenizer.from_pretrained(model_ckpt)
# num_labels:int=2
# data_labels=['positive', 'negative']
# batch_size:int=64

In [49]:
class trainer():
    def __init__(self,
                model_ckpt = "distilbert-base-uncased",
                num_labels=2,
                batch_size = 64,
                num_epochs=2,
                # data_path='D:\Codes\sentiment-fastapi/airline_sentiment_analysis.csv',
                save_path="distilbert-base-uncased-finetuned-emotion"
                ):
        
        self.model_ckpt=model_ckpt
        self.num_labels=num_labels
        self.batch_size=batch_size
        self.tokenizer=AutoTokenizer.from_pretrained(model_ckpt)
        # self.data_path=data_path
        self.data_labels=['positive', 'negative']
        self.save_path=save_path
        self.num_epochs= num_epochs



    def load_data(self, path):
        data=(pd.read_csv(path, index_col=0, header=[0])).reset_index(drop=True)
        data.columns=['label','text']
        data=data[['text','label']]
        return data

    def preprocess_text(self,text):
        stemmer = PorterStemmer()
        entity_prefixes = ['@']
        words = []
        for word in text.split():
            word = word.strip()
            if word:
                if word[0] not in entity_prefixes:
                    word= stemmer.stem(word)
                    words.append(word)
        sentence=' '.join(words)

        # remove stock market tickers
        tweet = re.sub(r'\$\w*', '', sentence)
        # remove twitter abbreviations
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
        # only removing the hash # sign from the word
        tweet = re.sub(r'#', '', tweet)
        return tweet


    def split_data(self,data):
        train, validate, test = np.split(data.sample(frac=1), [int(.6*len(data)), int(.8*len(data))])
        return train, validate, test

    def create_dateset(self,train,validate,test):
        train_dataset = datasets.Dataset.from_dict(train)
        test_dataset = datasets.Dataset.from_dict(test)
        validation_dataset=datasets.Dataset.from_dict(validate)
        my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"validation":validation_dataset,"test":test_dataset})
        return my_dataset_dict

    def tokenize(self,batch):
        return self.tokenizer(batch["text"], padding=True, truncation=True)

    def compute_metrics(self,pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1}
    

    def training(self, 
                load_path='D:\Codes\sentiment-fastapi/airline_sentiment_analysis.csv'
                
                
                ):


        data= self.load_data(path=load_path)
        le = LabelEncoder()
        data.label=le.fit(data.label).transform(data.label)
        data.text = [self.preprocess_text(data.text[i]) for i in range(len(data))]
        train, validate, test = self.split_data(data=data)
        sentiment=self.create_dateset(train,validate,test)

        #tokenize and encode
        sentiment_encoded = sentiment.map(self.tokenize, batched=True, batch_size=None)




        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = (AutoModelForSequenceClassification
                .from_pretrained(self.model_ckpt, num_labels=self.num_labels)
                .to(device))







        logging_steps = len(sentiment_encoded["train"]) // self.batch_size
        model_name = f"{self.model_ckpt}-finetuned-emotion"
        training_args = TrainingArguments(output_dir=model_name,
                                            num_train_epochs=self.num_epochs,
                                            learning_rate=2e-5,
                                            per_device_train_batch_size=self.batch_size,
                                            per_device_eval_batch_size=self.batch_size,
                                            weight_decay=0.01,
                                            evaluation_strategy="epoch",
                                            disable_tqdm=False,
                                            logging_steps=logging_steps,
                                            push_to_hub=False, 
                                            log_level="error")

        trainer = Trainer(model=model,
        args=training_args,
        compute_metrics=self.compute_metrics,
        train_dataset=sentiment_encoded["train"],
        eval_dataset=sentiment_encoded["validation"],
        tokenizer=self.tokenizer)

        # trainer
        trainer.train();
        trainer.save_model(self.save_path)
        return model
    # def train_model(data):

    #     sentiment_encoded = data.map(tokenize, batched=True, batch_size=None)
        
    #     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #     model = (AutoModelForSequenceClassification
    #                 .from_pretrained(model_ckpt, num_labels=2).to(device))
    #     logging_steps = len(data["train"]) // batch_size
    #     model_name = f"{model_ckpt}-finetuned-emotion"
    #     training_args = TrainingArguments(output_dir=model_name,
    #                                         num_train_epochs=2,
    #                                         learning_rate=2e-5,
    #                                         per_device_train_batch_size=batch_size,
    #                                         per_device_eval_batch_size=batch_size,
    #                                         weight_decay=0.01,
    #                                         evaluation_strategy="epoch",
    #                                         disable_tqdm=False,
    #                                         logging_steps=logging_steps,
    #                                         push_to_hub=False, 
    #                                         log_level="error")
    #     trainer = Trainer(model=model, args=training_args, 
    #                         # compute_metrics=compute_metrics,
    #                         train_dataset=data["train"],
    #                         eval_dataset=data["validation"],
    #                         tokenizer=tokenizer)

    #     trainer.train()
    #     return model

In [50]:
obj= trainer(num_epochs=10)
model=obj.training()


 50%|█████     | 109/218 [12:30<12:30,  6.88s/it]

100%|██████████| 1/1 [00:00<00:00,  2.35ba/s]
100%|██████████| 1/1 [00:00<00:00, 12.10ba/s]
100%|██████████| 1/1 [00:00<00:00, 10.95ba/s]
 10%|█         | 109/1090 [00:48<05:59,  2.73it/s]

{'loss': 0.31, 'learning_rate': 1.801834862385321e-05, 'epoch': 0.99}



 10%|█         | 109/1090 [00:54<05:59,  2.73it/s]

{'eval_loss': 0.1895940899848938, 'eval_accuracy': 0.9298093587521664, 'eval_f1': 0.9292327898119033, 'eval_runtime': 6.0575, 'eval_samples_per_second': 381.018, 'eval_steps_per_second': 6.108, 'epoch': 1.0}


 20%|█▉        | 216/1090 [01:44<06:45,  2.16it/s]

{'loss': 0.1575, 'learning_rate': 1.6036697247706424e-05, 'epoch': 1.98}


 20%|██        | 218/1090 [01:44<05:23,  2.70it/s]
 20%|██        | 218/1090 [01:51<05:23,  2.70it/s]

{'eval_loss': 0.17893214523792267, 'eval_accuracy': 0.9341421143847487, 'eval_f1': 0.932725228641297, 'eval_runtime': 6.7008, 'eval_samples_per_second': 344.437, 'eval_steps_per_second': 5.522, 'epoch': 2.0}


 30%|██▉       | 324/1090 [02:40<06:06,  2.09it/s]

{'loss': 0.1055, 'learning_rate': 1.4055045871559633e-05, 'epoch': 2.97}


 30%|███       | 327/1090 [02:41<04:44,  2.68it/s]
 30%|███       | 327/1090 [02:48<04:44,  2.68it/s]

{'eval_loss': 0.17911511659622192, 'eval_accuracy': 0.938474870017331, 'eval_f1': 0.9386685488818863, 'eval_runtime': 6.2029, 'eval_samples_per_second': 372.086, 'eval_steps_per_second': 5.965, 'epoch': 3.0}


 40%|███▉      | 432/1090 [03:35<04:50,  2.26it/s]

{'loss': 0.0698, 'learning_rate': 1.2073394495412844e-05, 'epoch': 3.96}


 40%|████      | 436/1090 [03:37<03:46,  2.88it/s]
 40%|████      | 436/1090 [03:43<03:46,  2.88it/s]

{'eval_loss': 0.19558656215667725, 'eval_accuracy': 0.9371750433275563, 'eval_f1': 0.937684060570436, 'eval_runtime': 6.1521, 'eval_samples_per_second': 375.156, 'eval_steps_per_second': 6.014, 'epoch': 4.0}


 50%|████▉     | 540/1090 [04:33<03:59,  2.29it/s]

{'loss': 0.0433, 'learning_rate': 1.0091743119266055e-05, 'epoch': 4.95}


 50%|█████     | 545/1090 [04:35<03:08,  2.90it/s]
 50%|█████     | 545/1090 [04:41<03:08,  2.90it/s]

{'eval_loss': 0.2245846688747406, 'eval_accuracy': 0.9367417677642981, 'eval_f1': 0.9363813713137306, 'eval_runtime': 5.9145, 'eval_samples_per_second': 390.23, 'eval_steps_per_second': 6.256, 'epoch': 5.0}


 59%|█████▉    | 648/1090 [05:27<03:14,  2.27it/s]

{'loss': 0.0281, 'learning_rate': 8.110091743119266e-06, 'epoch': 5.94}


 60%|██████    | 654/1090 [05:29<02:33,  2.84it/s]
 60%|██████    | 654/1090 [05:35<02:33,  2.84it/s]

{'eval_loss': 0.2509334683418274, 'eval_accuracy': 0.9380415944540728, 'eval_f1': 0.9380168775915559, 'eval_runtime': 6.0223, 'eval_samples_per_second': 383.245, 'eval_steps_per_second': 6.144, 'epoch': 6.0}


 69%|██████▉   | 756/1090 [06:21<02:27,  2.27it/s]

{'loss': 0.0171, 'learning_rate': 6.128440366972478e-06, 'epoch': 6.94}


 70%|███████   | 763/1090 [06:24<01:56,  2.81it/s]
 70%|███████   | 763/1090 [06:30<01:56,  2.81it/s]

{'eval_loss': 0.2650224566459656, 'eval_accuracy': 0.9380415944540728, 'eval_f1': 0.9382125165868329, 'eval_runtime': 6.1942, 'eval_samples_per_second': 372.606, 'eval_steps_per_second': 5.973, 'epoch': 7.0}


 79%|███████▉  | 864/1090 [07:15<01:40,  2.25it/s]

{'loss': 0.0151, 'learning_rate': 4.1467889908256885e-06, 'epoch': 7.93}


 80%|████████  | 872/1090 [07:18<01:16,  2.85it/s]
 80%|████████  | 872/1090 [07:24<01:16,  2.85it/s]

{'eval_loss': 0.27845171093940735, 'eval_accuracy': 0.938474870017331, 'eval_f1': 0.9385237339637817, 'eval_runtime': 6.1092, 'eval_samples_per_second': 377.794, 'eval_steps_per_second': 6.056, 'epoch': 8.0}


 89%|████████▉ | 972/1090 [08:11<00:53,  2.22it/s]

{'loss': 0.0102, 'learning_rate': 2.1651376146788996e-06, 'epoch': 8.92}


 90%|█████████ | 981/1090 [08:15<00:41,  2.65it/s]
 90%|█████████ | 981/1090 [08:21<00:41,  2.65it/s]

{'eval_loss': 0.2808539569377899, 'eval_accuracy': 0.9393414211438474, 'eval_f1': 0.9388943568875455, 'eval_runtime': 6.2388, 'eval_samples_per_second': 369.944, 'eval_steps_per_second': 5.931, 'epoch': 9.0}


 99%|█████████▉| 1080/1090 [09:09<00:04,  2.22it/s]

{'loss': 0.0081, 'learning_rate': 1.8348623853211012e-07, 'epoch': 9.91}


100%|██████████| 1090/1090 [09:13<00:00,  2.87it/s]
100%|██████████| 1090/1090 [09:20<00:00,  1.95it/s]


{'eval_loss': 0.2881094515323639, 'eval_accuracy': 0.9389081455805892, 'eval_f1': 0.9388348084525606, 'eval_runtime': 6.486, 'eval_samples_per_second': 355.844, 'eval_steps_per_second': 5.705, 'epoch': 10.0}
{'train_runtime': 560.2245, 'train_samples_per_second': 123.593, 'train_steps_per_second': 1.946, 'train_loss': 0.07579997226151577, 'epoch': 10.0}


In [2]:
model_ckpt = "distilbert-base-uncased",
num_labels=2,
batch_size = 64,
num_epochs=2,
# data_path='D:\Codes\sentiment-fastapi/airline_sentiment_analysis.csv',
save_path="distilbert-base-uncased-finetuned-emotion"
load_path='D:\Codes\sentiment-fastapi\distilbert-base-uncased-finetuned-emotion'

In [58]:
def load_model():
    model = (AutoModelForSequenceClassification
            .from_pretrained(model_ckpt="distilbert-base-uncased", num_labels=num_labels))
    checkpoint=torch.load(load_path,
                        map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint)   
    return model

In [3]:
# new_model=load_model()

model = (AutoModelForSequenceClassification
        .from_pretrained(model_ckpt, num_labels=num_labels)
        .to('cpu'))

OSError: ('distilbert-base-uncased',) is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.