In [2]:
!pip install transformers torch pandas




In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled tweets and hold out 20% of the rows for validation.
# The fixed random_state keeps the split reproducible across runs.
df = pd.read_csv(filepath_or_buffer='labelled_airline_tweet.csv')
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)


In [4]:
# Show the first training row to check which columns are available.
train_df.iloc[:1]

Unnamed: 0.1,Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,token,topic_no,topic_prob,topic_desc
1155,1155,569551241967067136,negative,1.0,Customer Service Issue,1.0,United,,BBickmire,,0,@united really enjoying my Sunday on hold...ov...,,2015-02-22 09:36:12 -0800,Dallas,,"['enjoy', 'holdover', 'client', 'experience']",3,0.242707,Reschedule and Refund


In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_text(text, max_length=128):
    """Tokenize a single string with the module-level BERT tokenizer.

    Args:
        text: The raw string to encode. Pass one string per call; a list
            would be interpreted by the tokenizer as a batch.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The tokenizer's encoding with PyTorch tensors (``return_tensors='pt'``),
        including ``input_ids`` and ``attention_mask``.
    """
    # Calling the tokenizer directly is the supported entry point;
    # `encode_plus` is deprecated in favour of `__call__`.
    return tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
import torch
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset

# Learn the topic -> integer mapping from the training split only, then reuse
# the same mapping for validation so both splits share label ids.
label_encoder = LabelEncoder()

# `.assign` builds new frames instead of writing a column into the slices
# returned by train_test_split, which avoids SettingWithCopyWarning and keeps
# this cell idempotent when re-run.
train_df = train_df.assign(label=label_encoder.fit_transform(train_df['topic_desc']))
val_df = val_df.assign(label=label_encoder.transform(val_df['topic_desc']))

# Number of distinct topics seen by the encoder; used as the classifier's output size.
num_classes = len(label_encoder.classes_)

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one row of a DataFrame per item.

    The frame must provide a ``text`` column (passed to the tokenizer) and a
    ``label`` column (converted to a ``torch.long`` scalar).

    Args:
        dataframe: Source frame; kept by reference on ``self.data``.
        tokenize_fn: Optional callable ``text -> mapping`` whose result has
            ``input_ids`` and ``attention_mask`` tensors. When omitted, the
            module-level ``tokenize_text`` is looked up at item-access time.
    """

    def __init__(self, dataframe, tokenize_fn=None):
        self.data = dataframe
        self.tokenize_fn = tokenize_fn

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized text and label for the row at position ``idx``."""
        # Column-wise positional access: avoids building a full object-dtype
        # row Series (twice) for every sample, as `data.iloc[idx][col]` did.
        text = self.data['text'].iloc[idx]
        label = self.data['label'].iloc[idx]

        # Resolve the default lazily so a redefined `tokenize_text` is picked up.
        tokenize = self.tokenize_fn if self.tokenize_fn is not None else tokenize_text
        encoded = tokenize(text)

        return {
            # Flatten to 1-D so the DataLoader can stack items into a batch.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            # The label tensor is created explicitly as torch.long.
            'labels': torch.tensor(int(label), dtype=torch.long),
        }

# Wrap each split in a dataset, then in a loader that yields batches of 32.
train_dataset = CustomDataset(train_df)
val_dataset = CustomDataset(val_df)

# Only the training loader shuffles; validation keeps the frame's row order.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=32)


In [None]:
# Take the class count from the fitted encoder so it always matches the
# integer label ids the model will be trained on.
num_classes = len(label_encoder.classes_)
num_classes

In [9]:
# Sanity check: pull a single batch from the training loader and print its tensors.
for batch in train_dataloader:
    input_ids, attention_mask, labels = (
        batch[field] for field in ('input_ids', 'attention_mask', 'labels')
    )

    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Labels:", labels)

    # One batch is enough for inspection.
    break

Input IDs: tensor([[ 101, 1030, 6892,  ...,    0,    0,    0],
        [ 101, 1030, 4943,  ...,    0,    0,    0],
        [ 101, 1030, 2142,  ...,    0,    0,    0],
        ...,
        [ 101, 1030, 2142,  ...,    0,    0,    0],
        [ 101, 1030, 3915,  ...,    0,    0,    0],
        [ 101, 1030, 3915,  ...,    0,    0,    0]])
Attention Mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Labels: tensor([5, 5, 5, 5, 4, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 4, 5,
        4, 0, 0, 3, 5, 0, 5, 0])


In [1]:
import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated/removed; use the PyTorch optimizer
from tqdm import tqdm
from transformers import BertForSequenceClassification

NUM_EPOCHS = 5
LEARNING_RATE = 1e-5

# Use the GPU when one is available; otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
model.to(device)

# eps / weight_decay are set to the defaults of the old transformers.AdamW so
# the optimizer update matches what this cell used before the import change.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

for epoch in range(NUM_EPOCHS):
    # Re-enter training mode every epoch: the evaluation step below switches
    # the model to eval mode, which would otherwise disable dropout for all
    # epochs after the first.
    model.train()

    # Create a tqdm progress bar for the training dataloader
    train_dataloader_with_progress = tqdm(
        train_dataloader, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}', leave=False
    )

    for batch in train_dataloader_with_progress:
        # Inputs must live on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Update the progress bar with the current loss
        train_dataloader_with_progress.set_postfix(loss=loss.item(), refresh=True)

    # Evaluation on the validation set (no gradients, dropout disabled).
    model.eval()
    val_loss_sum, val_correct, val_seen = 0.0, 0, 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)

            batch_count = batch['labels'].size(0)
            # Weight by batch size so a smaller final batch does not skew the mean.
            val_loss_sum += outputs.loss.item() * batch_count
            val_correct += (outputs.logits.argmax(dim=-1) == batch['labels']).sum().item()
            val_seen += batch_count

    if val_seen:
        print(
            f'Epoch {epoch + 1}/{NUM_EPOCHS} - '
            f'val_loss: {val_loss_sum / val_seen:.4f} - '
            f'val_acc: {val_correct / val_seen:.4f}'
        )


KeyboardInterrupt: 