In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/distilbertbaseuncased/config.json
/kaggle/input/distilbertbaseuncased/pytorch_model.bin
/kaggle/input/distilbertbaseuncased/vocab.txt
/kaggle/input/twitter-airline-sentiment/Tweets.csv
/kaggle/input/twitter-airline-sentiment/database.sqlite


In [2]:
!pip install transformers




In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report


In [4]:
# Read the airline tweets and keep only the tweet text and its sentiment label
# (in that column order), then preview the first rows.
tweets_csv = '/kaggle/input/twitter-airline-sentiment/Tweets.csv'
df = pd.read_csv(tweets_csv)[['text', 'airline_sentiment']]
df.head()


Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [5]:
# Integer class ids for the three sentiment labels.
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['airline_sentiment'].map(label_map)

# Series.map leaves NaN for any value that is missing from label_map, which would
# silently turn the column into floats. Fail here with a clear message instead of
# later, when the labels are converted to a tensor and fed to the loss.
unmapped = df.loc[df['label'].isna(), 'airline_sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped airline_sentiment values: {list(unmapped)}")

# Every row is mapped at this point, so the column can be pinned to an integer dtype.
df['label'] = df['label'].astype('int64')


In [6]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load the tokenizer from the local DistilBERT checkpoint directory.
tokenizer = DistilBertTokenizerFast.from_pretrained("/kaggle/input/distilbertbaseuncased")

# Load the model with a 3-class sequence-classification head
# (negative / neutral / positive). The classifier weights are not part of the
# checkpoint, so they are newly initialised and must be trained before use.
model = DistilBertForSequenceClassification.from_pretrained(
    "/kaggle/input/distilbertbaseuncased",
    num_labels=3
)

# NOTE: the tweets are intentionally not tokenized in this cell. It used to encode
# them at max_length=128, but the following cell re-encodes them at max_length=64
# and overwrites `tokens`, `input_ids`, `attention_mask` and `labels`, so that
# first pass was discarded work.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/distilbertbaseuncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Encode every tweet as PyTorch tensors, padded or truncated to exactly 64 tokens.
tokens = tokenizer(
    df['text'].tolist(),
    max_length=64,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# Pull out the two model inputs and build the matching label tensor.
input_ids, attention_mask = tokens['input_ids'], tokens['attention_mask']
labels = torch.tensor(df['label'].values)


In [8]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

# Hold out 20% of the encoded tweets for validation; the fixed random_state makes
# the split repeatable. The three tensors are split with the same row indices.
splits = train_test_split(
    input_ids, attention_mask, labels, test_size=0.2, random_state=42
)
X_train, X_val, mask_train, mask_val, y_train, y_val = splits

# Bundle (input ids, attention mask, label) triples for each partition.
train_dataset = TensorDataset(X_train, mask_train, y_train)
val_dataset = TensorDataset(X_val, mask_val, y_val)

# Only the training batches are shuffled; validation keeps dataset order.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


In [9]:
import torch
from transformers import AdamW

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer: use PyTorch's own AdamW. The transformers.AdamW class is deprecated,
# and its deprecation warning points to torch.optim.AdamW as the replacement.
# eps and weight_decay are set explicitly to the values the transformers
# implementation defaulted to, so the update rule stays the same as before.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
)




In [10]:
from tqdm import tqdm

# Fine-tune the classifier for a fixed number of passes over the training set.
epochs = 3
model.train()

for epoch in range(epochs):
    total_loss = 0
    # The batch tensors get their own names so the loop does not overwrite the
    # notebook-level `input_ids`, `attention_mask` and `labels` (the full encoded
    # dataset). Clobbering them would make a re-run of the train/validation split
    # cell operate on a single leftover batch.
    for batch_ids, batch_mask, batch_labels in tqdm(train_loader):
        batch_ids = batch_ids.to(device)
        batch_mask = batch_mask.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        outputs = model(batch_ids, attention_mask=batch_mask, labels=batch_labels)

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # The reported value is the sum of the per-batch losses for the epoch, not a mean.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")


100%|██████████| 732/732 [00:45<00:00, 16.07it/s]


Epoch 1/3 - Loss: 382.1061


100%|██████████| 732/732 [00:44<00:00, 16.27it/s]


Epoch 2/3 - Loss: 239.5683


100%|██████████| 732/732 [00:44<00:00, 16.27it/s]

Epoch 3/3 - Loss: 150.2135





In [11]:
# Switch to inference mode and collect a predicted class for every validation tweet.
model.eval()

predictions = []
true_labels = []

# Gradients are not needed for evaluation.
with torch.no_grad():
    # The batch tensors get their own names so the loop does not overwrite the
    # notebook-level `input_ids`, `attention_mask` and `labels` tensors.
    for batch_ids, batch_mask, batch_labels in val_loader:
        outputs = model(batch_ids.to(device), attention_mask=batch_mask.to(device))
        # The predicted class is the index of the largest logit in each row.
        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        # The labels are only compared on the host, so they never need to be
        # moved to the device and back.
        true_labels.extend(batch_labels.cpu().numpy())


In [12]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the validation set; the names are listed
# in class-id order (0, 1, 2).
class_names = ['negative', 'neutral', 'positive']
report = classification_report(true_labels, predictions, target_names=class_names)
print(report)


              precision    recall  f1-score   support

    negative       0.89      0.93      0.91      1889
     neutral       0.74      0.59      0.65       580
    positive       0.75      0.85      0.80       459

    accuracy                           0.85      2928
   macro avg       0.80      0.79      0.79      2928
weighted avg       0.84      0.85      0.84      2928



In [13]:
# Persist only the fine-tuned weights (the state dict) to the working directory.
weights_path = "distilbert_sentiment_model.pt"
torch.save(model.state_dict(), weights_path)
