In [12]:
# Imports
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"


In [2]:
# Import data and clean 
# Data is from https://www.kaggle.com/datasets/arshkandroo/behavioural-tweets?select=Lonely_Tweets.csv.
# Tweepy API was used to scrape twitter for lonely and normal tweets. The quality is okay but 
# some misclassification. The tweets are already cleaned using the NLTK library according to the
# data collectors.

df_normal = pd.read_csv('Data/Normal_Tweets.csv.xls')
del df_normal['Unnamed: 0']
df_normal.columns.values[0] = 'tweet'
df_normal['label'] = 0                                  # 0 for normal tweets 

df_lonely = pd.read_csv('Data/Lonely_Tweets.csv.xls')
del df_lonely['Unnamed: 0']
df_lonely.columns.values[0] = 'tweet'
df_lonely['label'] = 1                                  # 1 for lonely tweets



In [20]:
keywords = ["alone", "lonely", "no one"]
filtered_df = df_lonely[df_lonely['tweet'].str.contains('|'.join(keywords), case=False, na=False)]

df = pd.concat([df_normal, filtered_df])

In [21]:
# Display summary data
print(f"The data set has {len(df)} tweets in it.")
print()

print(f"There are {df.shape[1]} variables and they are as follows: ")
print(df.dtypes)
print()

print("The first and last five rows of the table are:")
pd.concat([df.head(), df.tail()])


The data set has 11240 tweets in it.

There are 3 variables and they are as follows: 
tweet    object
label     int64
id        int64
dtype: object

The first and last five rows of the table are:


Unnamed: 0,tweet,label,id
0,remember hillary email non secure server,0,1
1,cant avoid demon,0,2
2,por fin la pusieron en spotify losing way de f...,0,3
3,kills,0,4
4,thank introduce important gunsense law make co...,0,5
8504,love realize day dont want bother nothing wron...,1,18430
8507,hi kerry believe alone please know thousand pe...,1,18433
8513,even though mean gotta let go yeah dependent y...,1,18439
8514,claire please message ever need talk lonely,1,18440
8519,lrt leave alone finally found need cry love ha...,1,18445


In [22]:
# Split into training and validation set
dataset = Dataset.from_pandas(df)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
test_valid_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

final_dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': test_valid_split['test'],
    'validation': test_valid_split['train']
})

In [23]:
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [24]:
def tokenize_function(examples):
    return tokenizer(examples['tweet'], truncation=True, padding='max_length', max_length=20)

tokenized_dataset = final_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/8992 [00:00<?, ? examples/s]

Map:   0%|          | 0/1124 [00:00<?, ? examples/s]

Map:   0%|          | 0/1124 [00:00<?, ? examples/s]

In [25]:
print(tokenized_dataset['train']['input_ids'])

[[101, 3231, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 3239, 2467, 3608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2613, 15902, 5082, 2279, 3357, 2002, 2015, 2175, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2202, 2835, 11688, 9544, 8231, 3462, 3607, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 11274, 5160, 6271, 2160, 10489, 2553, 2300, 10142, 23408, 2483, 19357, 2618, 102, 0, 0, 0, 0, 0, 0], [101, 4139, 11884, 2610, 2197, 2051, 6583, 14713, 2158, 2387, 5195, 10618, 15882, 102, 0, 0, 0, 0, 0, 0], [101, 2648, 11700, 4843, 2330, 21945, 2396, 14192, 9256, 2404, 2362, 2839, 3465, 2226, 102, 0, 0, 0, 0, 0], [101, 2176, 3586, 2386, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 10107, 14269, 2293, 2490, 10107, 2296, 2978, 7098, 2112, 1054, 2243, 5737, 102, 0, 0, 0, 0, 0, 0], [101, 4299, 3056, 2111, 2052, 4553, 2681, 2894, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 4202, 9170, 2713, 13216, 2544, 2092, 5670, 21182, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [26]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",  # Enables validation at end of each epoch
    disable_tqdm=True,
    log_level="error"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],  # Add validation dataset here
    data_collator=data_collator
)



In [27]:
trainer.train()

{'loss': 0.3762, 'grad_norm': 1.7764368057250977, 'learning_rate': 4.955516014234875e-05, 'epoch': 0.017793594306049824}
{'loss': 0.3848, 'grad_norm': 5.564195156097412, 'learning_rate': 4.911032028469751e-05, 'epoch': 0.03558718861209965}
{'loss': 0.3422, 'grad_norm': 3.934344530105591, 'learning_rate': 4.8665480427046265e-05, 'epoch': 0.05338078291814947}
{'loss': 0.122, 'grad_norm': 4.667408466339111, 'learning_rate': 4.822064056939502e-05, 'epoch': 0.0711743772241993}
{'loss': 0.0758, 'grad_norm': 2.1582083702087402, 'learning_rate': 4.777580071174377e-05, 'epoch': 0.08896797153024912}
{'loss': 0.0711, 'grad_norm': 0.2822377383708954, 'learning_rate': 4.733096085409253e-05, 'epoch': 0.10676156583629894}
{'loss': 0.077, 'grad_norm': 2.163625478744507, 'learning_rate': 4.6886120996441285e-05, 'epoch': 0.12455516014234876}
{'loss': 0.0388, 'grad_norm': 0.20367389917373657, 'learning_rate': 4.644128113879004e-05, 'epoch': 0.1423487544483986}
{'loss': 0.0681, 'grad_norm': 0.208330139517

TrainOutput(global_step=1124, training_loss=0.050312633904151194, metrics={'train_runtime': 159.6917, 'train_samples_per_second': 112.617, 'train_steps_per_second': 7.039, 'train_loss': 0.050312633904151194, 'epoch': 2.0})

In [None]:
# model.save_pretrained("bert_model")
# tokenizer.save_pretrained("bert_model")