In [None]:
# !pip install transformers

import pandas as pd
from sklearn.model_selection import train_test_split


import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

## Data

In [None]:
!wget https://raw.githubusercontent.com/YJiangcm/SST-2-sentiment-analysis/master/data/train.tsv

In [None]:
# The file is parsed without a header row (header=None), so the columns are
# labelled explicitly: first the label, then the sentence text.
data = (
    pd.read_csv("train.tsv", sep="\t", header=None)
    .set_axis(["label", "text"], axis=1)
)

In [None]:
# Inputs are the raw sentences, targets are their labels (both as NumPy arrays).
x = data["text"].values
y = data["label"].values

# Split into training and validation sets.
# random_state pins the shuffle so that a kernel restart reproduces the same
# split, which keeps the validation metrics comparable between runs. The
# split proportions are left at scikit-learn's default (no test_size given).
train_data, val_data, train_labels, val_labels = train_test_split(x, y, random_state=42)

In [None]:
# Tokenizer for the same pre-trained checkpoint name that the model is loaded from

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Tokenize both splits with identical settings: PyTorch tensors, padding
# enabled, and truncation to at most 64 tokens.

def _encode(texts):
    """Run the shared tokenizer over an iterable of sentences."""
    return tokenizer(list(texts), return_tensors="pt", padding=True, truncation=True, max_length=64)


train_tokens = _encode(train_data)
val_tokens = _encode(val_data)

In [None]:
class ClassificationDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from field name to an
            indexable container holding one entry per example.
        labels: Indexable, sized container with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_item_tensor(value):
        """Return ``value`` as a new tensor that shares no storage with it.

        ``torch.tensor()`` on something that is already a tensor emits a
        copy-construct UserWarning, so tensors are copied with
        ``clone().detach()`` and everything else goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``'labels'`` entry."""
        item = {key: self._as_item_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_item_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.labels)

# Wrap each split's encodings together with its labels.
train_dataset = ClassificationDataset(encodings=train_tokens, labels=train_labels)
val_dataset = ClassificationDataset(encodings=val_tokens, labels=val_labels)

## Train

In [None]:
# DistilBERT sequence-classification model, loaded from the same checkpoint
# name as the tokenizer.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased"
)

In [None]:
# Configure the run and fine-tune the model.
idx = 0  # run index, interpolated into the output path
model_path = f'gdrive/MyDrive/model_{idx}'

training_args = TrainingArguments(
    output_dir=model_path,        # output directory
    num_train_epochs=1,           # total number of training epochs
    evaluation_strategy="epoch",  # evaluate once per epoch
)

# The Trainer brings together the model, its arguments and both datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

## Evaluate

In [None]:
# Evaluate on the eval_dataset handed to the Trainer (val_dataset); as the
# last expression of the cell, the returned value is shown as its output.
trainer.evaluate()

## Save

In [None]:
# Save the model

!mkdir sentiment_model
trainer.save_model("sentiment_model")

## Predict

In [None]:
# Reload the model from the "sentiment_model" directory that
# trainer.save_model wrote in the Save section.
test_model = DistilBertForSequenceClassification.from_pretrained("sentiment_model")

In [None]:
# Encode a single example sentence with the same tokenizer settings that
# were used for the training and validation data.
test_sentence = "I am happy!"
test_encodings = tokenizer(
    test_sentence,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64,
)

In [None]:
# Inference only: no_grad() stops autograd from recording the forward pass,
# so the result carries no grad_fn and no computation graph is kept alive.
with torch.no_grad():
    pred = test_model(**test_encodings)

# Softmax along dim=1 normalises the logits so each row sums to 1.
probabilities = torch.softmax(pred["logits"], dim=1)
probabilities