In [4]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from torch.optim import AdamW
import time

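Untrained model (baseline: pretrained DistilBERT with a freshly initialized classification head, evaluated on the test set before any fine-tuning):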

In [5]:
# Load the pretrained checkpoint with a 2-class classification head.
# Nothing is fine-tuned in this cell, so the evaluation that follows is a baseline.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# Use the GPU when one is present; otherwise stay on the CPU instead of
# raising on a machine without CUDA.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Load validation data
val_data = pd.read_csv('/kaggle/input/test-data/test.csv')
val_texts = val_data['review'].tolist()
# Encode the sentiment strings as integer class ids (positive -> 1, negative -> 0)
val_labels = val_data['sentiment'].map({'positive': 1, 'negative': 0}).tolist()

# Initialize tokenizer (same checkpoint name as the model)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize data: every review is truncated to at most 512 tokens and padded
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Create torch dataset for validation
class ReviewDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence. It must contain the key ``'input_ids'``, which is used
            to determine the dataset length.
        labels: Optional per-example sequence of class ids. When ``None``,
            items are returned without a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per feature."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test against None explicitly: a truthiness check raises for NumPy
        # arrays, tensors and pandas Series holding more than one element.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.encodings['input_ids'])

val_dataset = ReviewDataset(val_encodings, val_labels)
# shuffle=False keeps predictions in the same order as the labels
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Predict with the model
model.eval()
# Run on whichever device the model already lives on instead of assuming CUDA
device = next(model.parameters()).device
predictions = []
true_labels = []
for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # Labels are only consumed by the metrics below, so they stay on the host
    labels = batch['labels']

    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    # The predicted class is the index of the largest logit in each row
    predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
    predictions.extend(predicted_labels)
    true_labels.extend(labels.numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
conf_matrix = confusion_matrix(true_labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'F1-score: {f1}')
print(f'Confusion matrix:\n {conf_matrix}')

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Accuracy: 0.50325
F1-score: 0.6695493098287044
Confusion matrix:
 [[    0  9935]
 [    0 10065]]


Trained model (fine-tune DistilBERT on the training set, then evaluate the saved model on the test set):

In [6]:
# Start the wall-clock timer. It is read again after the model is saved, so the
# reported duration also covers data loading and tokenization.
start_time = time.time()

# Read the training and test CSV files
train_data = pd.read_csv('/kaggle/input/train-dataset/train.csv')
test_data = pd.read_csv('/kaggle/input/test-data/test.csv')

# Overwrite the sentiment strings with integer class ids
# (positive -> 1, negative -> 0) in both frames.
for frame in (train_data, test_data):
    frame['sentiment'] = frame['sentiment'].map({'positive': 1, 'negative': 0})

# Plain Python lists of review texts and labels; the test set acts as validation
train_texts, train_labels = train_data['review'].tolist(), train_data['sentiment'].tolist()
val_texts, val_labels = test_data['review'].tolist(), test_data['sentiment'].tolist()

# Disabled alternative: read a single 'train.csv', map the labels the same way,
# and carve out the validation set with
# train_test_split(reviews, labels, test_size=0.2).

# Tokenizer for the base checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits are encoded with the same options: truncate to at most 512 tokens and pad
encode_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

# Create torch dataset
class ReviewDataset(Dataset):
    """Dataset serving tokenized reviews together with their labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable sequence.
        labels: Per-example sequence of class ids; it also defines the length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the dict of tensors for example ``idx``."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Create dataloaders
train_dataset = ReviewDataset(train_encodings, train_labels)
val_dataset = ReviewDataset(val_encodings, val_labels)

# Training batches are reshuffled every epoch; validation order is kept fixed
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Choose the device once: GPU when available, otherwise CPU, so the cell does
# not raise on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model = model.to(device)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
for epoch in range(3):  # number of epochs
    model.train()
    for batch in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Labels are passed in so that the loss is available on the output
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the model
model.save_pretrained('sentiment_model_DistilBERT')

# Record end time
end_time = time.time()

print("Time required to fine-tune: ", end_time - start_time)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Time required to fine-tune:  4671.2118899822235


In [9]:
from transformers import DistilBertTokenizerFast

In [10]:
# Load the fine-tuned weights from the directory written by save_pretrained
model = DistilBertForSequenceClassification.from_pretrained('sentiment_model_DistilBERT')
# Use the GPU when one is present; otherwise stay on the CPU instead of
# raising on a machine without CUDA.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Load validation data
val_data = pd.read_csv('/kaggle/input/test-data/test.csv')
val_texts = val_data['review'].tolist()
val_labels = val_data['sentiment'].map({'positive': 1, 'negative': 0}).tolist()  # convert sentiment to numeric

# Initialize tokenizer from the base checkpoint name
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize data: every review is truncated to at most 512 tokens and padded
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Create torch dataset for validation
class ReviewDataset(Dataset):
    """Validation dataset yielding one dict of tensors per review.

    Args:
        encodings: Mapping from feature name to a per-example indexable sequence.
        labels: Per-example sequence of class ids; its length is the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return how many labelled examples the dataset holds."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features and the label of example ``idx`` as tensors."""
        features = dict(
            (field, torch.tensor(values[idx]))
            for field, values in self.encodings.items()
        )
        features['labels'] = torch.tensor(self.labels[idx])
        return features

val_dataset = ReviewDataset(val_encodings, val_labels)
# shuffle=False keeps predictions in the same order as the labels
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Evaluate the model
model.eval()
# Run on whichever device the model already lives on instead of assuming CUDA
device = next(model.parameters()).device
predictions = []
true_labels = []
for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # Labels are only consumed by the metrics below, so they stay on the host
    labels = batch['labels']

    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    # The predicted class is the index of the largest logit in each row
    predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
    predictions.extend(predicted_labels)
    true_labels.extend(labels.numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
conf_matrix = confusion_matrix(true_labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'F1-score: {f1}')
print(f'Confusion matrix:\n {conf_matrix}')

Accuracy: 0.9308
F1-score: 0.9327894327894327
Confusion matrix:
 [[9012  923]
 [ 461 9604]]
