In [None]:
# !pip --q install transformers
# !pip --q install datasets
# !pip --q install accelerate evaluate

## Configuration Setup

In [None]:
import torch
class Config:
    """Single source of truth for every tunable setting used in this notebook."""

    # --- Data -------------------------------------------------------------
    DATASET_ID = "emad12/stock_tweets_sentiment"
    SOURCE_COLUMN = "tweet"          # column holding the input text
    TARGET_COLUMN = "sentiment"      # column holding the class label
    TEST_SIZE = 0.2                  # passed to train_test_split as test_size
    SEED = 0                         # random_state for the train/test split
    MAX_LENGTH = 32                  # tokenizer max_length (truncation limit)

    # --- Model ------------------------------------------------------------
    MODEL_CHECKPOINT = "distilbert-base-uncased"
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    ID2LABEL = dict(enumerate(("NEUTRAL", "POSITIVE", "NEGATIVE")))
    # Inverse mapping, built from ID2LABEL so the two can never disagree.
    LABEL2ID = {label: index for index, label in ID2LABEL.items()}

    # --- Training ---------------------------------------------------------
    EVALUATION_METRIC = "accuracy"
    MODEL_OUTPUT_DIR = "distilbert-stock-tweet-sentiment-analysis"
    NUMBER_EPOCHS = 3
    LR = 2e-5
    BATCH_SIZE = 16
    WEIGHT_DECAY = 0.01
    EVALUATION_STRATEGY = "epoch"
    SAVE_STRATEGY = "epoch"
    LOGGING_STRATEGY = "epoch"
    PUSH_TO_HUB = True


# Shared settings instance read by the dataset and trainer classes below.
config = Config()

## Dataset Preparation

In [None]:
#@ IMPORTING THE REQUIRED LIBRARIES AND DEPENDENCIES
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch
import evaluate

In [None]:
class TextClassificationDataset:
    """Loads the tweet dataset and produces tokenized train/test splits.

    Every setting except ``sample_size`` is read from the module-level
    ``config`` object.
    """

    def __init__(self, sample_size=20000):
        """Copy settings from ``config`` and load the tokenizer.

        Args:
            sample_size: Upper bound on the number of rows kept before the
                train/test split. ``None`` keeps every row. The value is
                capped at the number of available rows.
        """
        self.dataset_id = config.DATASET_ID
        self.model_checkpoint = config.MODEL_CHECKPOINT
        self.source_column = config.SOURCE_COLUMN
        self.target_column = config.TARGET_COLUMN
        self.test_size = config.TEST_SIZE
        self.seed = config.SEED
        self.max_len = config.MAX_LENGTH
        self.sample_size = sample_size
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint)

    def create_data(self):
        """Download, clean, sample and split the dataset.

        Returns:
            A ``(train, test)`` tuple of ``datasets.Dataset`` objects built
            from the stratified pandas split.
        """
        self.dataset = load_dataset(self.dataset_id, split="train")
        frame = self.dataset.to_pandas()

        # .copy() gives an independent frame, so the column assignments below
        # never write into a view of the full table.
        frame = frame[[self.source_column, self.target_column]].copy()

        # Rows without text or label cannot be tokenized or stratified.
        frame = frame.dropna(subset=[self.source_column, self.target_column])

        # Label ids must be non-negative class indices; -1 is remapped to 2,
        # the id config.ID2LABEL assigns to "NEGATIVE" (presumably the
        # dataset's negative class -- confirm against the dataset card).
        frame[self.target_column] = frame[self.target_column].replace(-1, 2)
        frame[self.source_column] = frame[self.source_column].str.lower()

        if self.sample_size is not None:
            # Seeded so that re-running the notebook draws the same subset,
            # and capped so that small datasets do not raise.
            frame = frame.sample(
                n=min(self.sample_size, len(frame)),
                random_state=self.seed,
            )
        self.data = frame

        self.train_data, self.test_data = train_test_split(
            self.data,
            test_size=self.test_size,
            shuffle=True,
            random_state=self.seed,
            stratify=self.data[self.target_column],
        )
        self.train_df = Dataset.from_pandas(self.train_data)
        self.test_df = Dataset.from_pandas(self.test_data)
        return self.train_df, self.test_df

    def tokenize_function(self, example):
        """Tokenize one batch of rows and attach integer labels.

        Args:
            example: A batch mapping column names to lists of values, as
                supplied by ``Dataset.map(..., batched=True)``.

        Returns:
            The tokenizer output with an added ``"labels"`` entry.
        """
        # No padding here: TextClassificationModelTrainer pads each training
        # batch with DataCollatorWithPadding, so padding at map time would
        # only store extra pad tokens.
        model_input = self.tokenizer(
            example[self.source_column],
            truncation=True,
            max_length=self.max_len,
        )
        # Plain Python ints are enough; the collator builds the tensors.
        model_input["labels"] = [int(label) for label in example[self.target_column]]
        return model_input

    def preprocess_function(self, data):
        """Tokenize a whole dataset, dropping all of its original columns."""
        return data.map(
            self.tokenize_function,
            batched=True,
            remove_columns=data.column_names,
        )

    def gen_classification_dataset(self):
        """Return the tokenized ``(train, test)`` datasets ready for training."""
        train_df, test_df = self.create_data()
        train_tokenized_data = self.preprocess_function(train_df)
        test_tokenized_data = self.preprocess_function(test_df)
        return train_tokenized_data, test_tokenized_data


## Model Preparation for Training

In [None]:
class TextClassificationModelTrainer:
    """Fine-tunes a sequence-classification model on tokenized datasets.

    All hyperparameters are read from the module-level ``config`` object.
    """

    def __init__(self, train_df, test_df):
        """Store the datasets and build the model, tokenizer, metric and collator.

        Args:
            train_df: Tokenized training dataset handed to the ``Trainer``.
            test_df: Tokenized evaluation dataset handed to the ``Trainer``.
        """
        self.train_df = train_df
        self.test_df = test_df
        self.model_checkpoint = config.MODEL_CHECKPOINT
        self.id2label = config.ID2LABEL
        self.label2id = config.LABEL2ID
        self.number_labels = len(self.id2label)
        self.device = config.DEVICE
        self.evaluation_metric = config.EVALUATION_METRIC
        self.model_output_dir = config.MODEL_OUTPUT_DIR
        self.number_epochs = config.NUMBER_EPOCHS
        self.lr = config.LR
        self.batch_size = config.BATCH_SIZE
        self.weight_decay = config.WEIGHT_DECAY
        self.evaluation_strategy = config.EVALUATION_STRATEGY
        self.save_strategy = config.SAVE_STRATEGY
        self.logging_strategy = config.LOGGING_STRATEGY
        self.push_to_hub = config.PUSH_TO_HUB
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_checkpoint,
            id2label=self.id2label,
            label2id=self.label2id,
            num_labels=self.number_labels,
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint)
        self.evaluation_metric_computer = evaluate.load(self.evaluation_metric)
        # Pads every batch to the length of its longest sequence.
        self.data_collator = DataCollatorWithPadding(self.tokenizer)

    def compute_metrics(self, evaluation_prediction):
        """Turn raw model outputs into the configured evaluation metric.

        Args:
            evaluation_prediction: A ``(predictions, labels)`` pair, where
                ``predictions`` holds one score per class for every example.

        Returns:
            Whatever the loaded ``evaluate`` metric's ``compute`` returns.
        """
        logits, labels = evaluation_prediction
        predicted_ids = np.argmax(logits, axis=-1)
        return self.evaluation_metric_computer.compute(
            predictions=predicted_ids, references=labels
        )

    def set_training_arguments(self):
        """Build the ``TrainingArguments`` from the stored hyperparameters."""
        # NOTE(review): newer transformers releases appear to rename
        # `evaluation_strategy` to `eval_strategy` -- confirm against the
        # installed version before upgrading.
        return TrainingArguments(
            output_dir=self.model_output_dir,
            num_train_epochs=self.number_epochs,
            learning_rate=self.lr,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            weight_decay=self.weight_decay,
            evaluation_strategy=self.evaluation_strategy,
            save_strategy=self.save_strategy,
            logging_strategy=self.logging_strategy,
            push_to_hub=self.push_to_hub,
        )

    def model_trainer(self):
        """Assemble the ``Trainer`` from model, arguments, data and metric."""
        return Trainer(
            model=self.model,
            args=self.set_training_arguments(),
            data_collator=self.data_collator,
            train_dataset=self.train_df,
            eval_dataset=self.test_df,
            compute_metrics=self.compute_metrics,
        )

    def train_and_save_and_push_to_hub(self):
        """Train the model, save it with its tokenizer, and optionally push it.

        The final model and the tokenizer are written to
        ``self.model_output_dir``. The upload to the Hub only happens when
        ``self.push_to_hub`` is true.

        Returns:
            The ``Trainer`` that ran the training, so callers can evaluate
            or inspect it afterwards.
        """
        trainer = self.model_trainer()
        trainer.train()

        # Persist the final weights explicitly instead of relying on the
        # per-epoch checkpoints.
        trainer.save_model()
        # The tokenizer is not passed to the Trainer, so save it here to make
        # the output directory self-contained for inference.
        self.tokenizer.save_pretrained(self.model_output_dir)

        if self.push_to_hub:
            # The original referenced the method without calling it, so
            # nothing was ever uploaded by this line.
            trainer.push_to_hub()
        return trainer

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

## Main

In [None]:
if __name__ == "__main__":
    # Build the tokenized train/test splits. Despite the `_df` suffix these
    # are the mapped `datasets` objects returned by
    # gen_classification_dataset(), not pandas DataFrames.
    textclassificationdataset = TextClassificationDataset()
    train_df, test_df = textclassificationdataset.gen_classification_dataset()
    # Construct the trainer (this loads the model, tokenizer and metric) and
    # run the fine-tuning entry point.
    textclassificationtrainer = TextClassificationModelTrainer(train_df, test_df)
    textclassificationtrainer.train_and_save_and_push_to_hub()

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/content/distilbert-stock-tweet-sentiment-analysis is already a clone of https://huggingface.co/regmisaugat59/distilbert-stock-tweet-sentiment-analysis. Make sure you pull the latest changes with `repo.git_pull()`.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6807,0.607106,0.759
2,0.4832,0.587172,0.76225
3,0.3714,0.633653,0.766


## Inference

In [None]:
from transformers import pipeline

# Load the fine-tuned weights from the training output directory. The
# tokenizer is loaded from the base checkpoint named in the config rather
# than from a second hard-coded copy of that name, so the two cannot drift.
classifier = pipeline(
    "sentiment-analysis",
    model=config.MODEL_OUTPUT_DIR,
    tokenizer=config.MODEL_CHECKPOINT,
)

# Last expression of the cell, so the prediction is displayed as its output.
classifier("I have enjoying my vacation and will be back in few days")

[{'label': 'POSITIVE', 'score': 0.9810577630996704}]

**The End**