# Imports
from IPython.display import HTML          
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_metric
from sklearn.model_selection import train_test_split


In [2]:
# Left align data in pd tables
HTML("""
<style>
    th, td {
        text-align: left !important;
    }
</style>
""")

In [3]:
# Import data and clean 
# Data is from https://www.kaggle.com/datasets/arshkandroo/behavioural-tweets?select=Lonely_Tweets.csv.
# Tweepy API was used to scrape twitter for lonely and normal tweets. The quality is okay but 
# some misclassification. The tweets are already cleaned using the NLTK library according to the
# data collectors.

df_normal = pd.read_csv('Data/Normal_Tweets.csv.xls')
del df_normal['Unnamed: 0']
df_normal.columns.values[0] = 'tweet'
df_normal['label'] = 0                                  # 0 for normal tweets 
df_normal['id'] = range(1, len(df_normal) + 1)

df_lonely = pd.read_csv('Data/Lonely_Tweets.csv.xls')
del df_lonely['Unnamed: 0']
df_lonely.columns.values[0] = 'tweet'
df_lonely['label'] = 1                                  # 1 for lonely tweets
df_lonely['id'] = range(len(df_normal) + 1, len(df_normal) + len(df_lonely) + 1)

df = pd.concat([df_normal, df_lonely])
df['tweet'] = df['tweet'].astype(pd.StringDtype())

In [4]:
# Display summary data
print(f"The data set has {len(df)} tweets in it.")
print()

print(f"There are {df.shape[1]} variables and they are as follows: ")
print(df.dtypes)
print()

print("The first and last five rows of the table are:")
pd.concat([df.head(), df.tail()])


The data set has 18447 tweets in it.

There are 3 variables and they are as follows: 
tweet    string[python]
label             int64
id                int64
dtype: object

The first and last five rows of the table are:


Unnamed: 0,tweet,label,id
0,remember hillary email non secure server,0,1
1,cant avoid demon,0,2
2,por fin la pusieron en spotify losing way de f...,0,3
3,kills,0,4
4,thank introduce important gunsense law make co...,0,5
8517,love son get paternity test person sex doesnt ...,1,18443
8518,tell father dont want bother need talk god cau...,1,18444
8519,lrt leave alone finally found need cry love ha...,1,18445
8520,know something dont need one love want want wa...,1,18446
8521,dont need want meet want several moment side,1,18447


In [5]:
# Split into training and validation set
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Create tensors of tokenized tweets and labels
train_encoding = tokenizer(
    train_df['tweet'].tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

val_encoding = tokenizer(
    val_df['tweet'].tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

train_labels = torch.tensor(train_df['label'].tolist())
val_labels = torch.tensor(val_df['label'].tolist())

train_encoding['labels'] = train_labels
val_encoding['labels'] = val_labels

# Convert tokenized data into usable format 
train_dataset = TensorDataset(train_encoding['input_ids'], train_encoding['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encoding['input_ids'], val_encoding['attention_mask'], val_labels)

In [9]:
print(train_dataset[:1])
print(type(train_dataset))

(tensor([[  101,  3696,  3661, 20713,  3423,  6288,  2280,  2880, 10825, 10153,
         11292,  8415, 27885,  3669,  3351,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]]), tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), tensor([0]))
<class 'torch.utils.data.dataset.TensorDataset'>


In [11]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',           
    num_train_epochs=3,               
    per_device_train_batch_size=8,    
    per_device_eval_batch_size=16,    
    logging_dir='./logs',             
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  
)

In [12]:
trainer.train()

  0%|          | 0/5535 [00:00<?, ?it/s]

TypeError: vars() argument must have __dict__ attribute

In [None]:
# # Tokenize new data
# new_data = ["This is a new tweet", "Another sample tweet"]
# new_encoding = tokenizer(new_data, padding=True, truncation=True, max_length=128, return_tensors='pt')

# # Use the model to predict
# outputs = model(**new_encoding)
# predictions = torch.argmax(outputs.logits, dim=-1)