In [None]:
!pip install datasets
!pip install tokenizers
!pip install transformers

In [None]:
from datasets import load_dataset
dataset_train = load_dataset("mteb/tweet_sentiment_extraction",split='train')
dataset_test = load_dataset("mteb/tweet_sentiment_extraction",split='test')

In [3]:
import pandas as pd
dataset_train1=pd.DataFrame(dataset_train)
dataset_test1=pd.DataFrame(dataset_test)

In [None]:
dataset_train1.head()

In [7]:
def give_me_text_and_labels(input_csv=dataset_train1):
    df = input_csv
    
    df['label'] = [0 if x=="negative" else 1 for x in df['label_text'] ]
    return df['text'].values, df['label'].values

def give_me_text_and_labels1(input_csv=dataset_test1):
    df = input_csv
    
    df['label'] = [0 if x=="negative" else 1 for x in df['label_text'] ]
    return df['text'].values, df['label'].values

In [8]:
train_texts, train_labels = give_me_text_and_labels()
test_texts, test_labels = give_me_text_and_labels1()

In [9]:
print(train_labels,test_labels)

[1 0 0 ... 1 1 1] [1 1 0 ... 0 1 1]


In [10]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [11]:
type(train_texts), type(list(train_texts))

(numpy.ndarray, list)

In [16]:
from sklearn.model_selection import train_test_split

test_texts, val_texts, test_labels, val_labels = train_test_split(test_texts, test_labels, test_size=.5)

In [17]:
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

In [18]:
import torch

class IMDBdataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [19]:
train_dataset = IMDBdataset(train_encodings, train_labels)
test_dataset = IMDBdataset(test_encodings,test_labels)
val_dataset = IMDBdataset(val_encodings, val_labels)

In [20]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [21]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model = model.to(device=device)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [None]:
model.train()

In [23]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

In [24]:
train_dataset

<__main__.IMDBdataset at 0x7fa7791aa2d0>

In [25]:
optim = AdamW(model.parameters(),lr=5e-5)



In [26]:
from tqdm import tqdm
for epoch in range(3):
    for batch in tqdm(train_dataloader):
        optim.zero_grad()
        input_ids= batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
    print(f"Loss for epoch {epoch} is {loss}")


100%|██████████| 1718/1718 [04:28<00:00,  6.41it/s]


Loss for epoch 0 is 0.4343155026435852


100%|██████████| 1718/1718 [04:28<00:00,  6.39it/s]


Loss for epoch 1 is 0.19126223027706146


100%|██████████| 1718/1718 [04:28<00:00,  6.40it/s]

Loss for epoch 2 is 0.005110918544232845





In [None]:
model.eval()

In [28]:
save_directory = "/content/sample_data/trainer1"

In [29]:
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

In [37]:
from transformers import pipeline
#classifier =pipeline('sentiment-analysis',model=model, tokenizer=tokenizer)
#classifier = pipeline('sentiment-analysis', model="/content/sample_data/trainer")
classifier = pipeline('sentiment-analysis', model="/content/sample_data/trainer1")

In [47]:
classifier("")

[{'label': 'LABEL_0', 'score': 0.9885736703872681}]