In [1]:
# Imports
import os

# Set this before any Hugging Face library is imported: huggingface_hub reads
# the variable when it is first imported, so setting it afterwards is too late
# to silence the hub progress bars.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [2]:
# Import data and clean
# Data is from https://www.kaggle.com/datasets/arshkandroo/behavioural-tweets?select=Lonely_Tweets.csv.
# The Tweepy API was used to scrape Twitter for lonely and normal tweets. The quality is okay, but
# some tweets are misclassified. According to the data collectors, the tweets were already cleaned
# using the NLTK library.

# The data downloaded from Kaggle was saved in a corrupted way, with both csv.xls endings.
# I used os to save them as pure csv files first. The files also contained bad first (index)
# columns, requiring different ways of dropping these columns.

# The text column is renamed with DataFrame.rename rather than by assigning into
# `df.columns.values`: an Index is meant to be immutable, and writing into its
# backing array in place can leave label lookups such as df['tweet'] out of sync.

df_normal = pd.read_csv('Data/Normal_Tweets.csv', index_col=0)
df_normal = df_normal.rename(columns={df_normal.columns[0]: 'tweet'})
df_normal['label'] = 0                                  # 0 for normal tweets

df_lonely = pd.read_csv('Data/Lonely_Tweets.csv', usecols=lambda column: column != 'Unnamed: 0')
df_lonely = df_lonely.rename(columns={df_lonely.columns[0]: 'tweet'})
df_lonely['label'] = 1                                  # 1 for lonely tweets

In [3]:
# Build a crudely filtered dataset. The uploaded data is of low quality, which
# leads to large, fluctuating losses when the model is trained later on.

keywords = ["alone", "lonely", "no one"]

# Keep only the "lonely" tweets that mention at least one keyword
# (case-insensitive; missing tweets count as non-matching).
keyword_pattern = '|'.join(keywords)
mentions_keyword = df_lonely['tweet'].str.contains(keyword_pattern, case=False, na=False)
filtered_df = df_lonely[mentions_keyword]

# Stack the normal tweets on top of the filtered lonely tweets.
df = pd.concat([df_normal, filtered_df])

In [4]:
# Show summary information about the combined data
n_rows, n_cols = df.shape

print(f"The data set has {n_rows} tweets in it.", end="\n\n")

print(f"There are {n_cols} variables and they are as follows: ")
print(df.dtypes, end="\n\n")

print("The first and last five rows of the table are:")
# Left as the last expression so the notebook renders it as a table.
df_preview = pd.concat([df.head(), df.tail()])
df_preview


The data set has 11240 tweets in it.

There are 2 variables and they are as follows: 
tweet    object
label     int64
dtype: object

The first and last five rows of the table are:


Unnamed: 0,tweet,label
0,remember hillary email non secure server,0
1,cant avoid demon,0
2,por fin la pusieron en spotify losing way de f...,0
3,kills,0
4,thank introduce important gunsense law make co...,0
8504,love realize day dont want bother nothing wron...,1
8507,hi kerry believe alone please know thousand pe...,1
8513,even though mean gotta let go yeah dependent y...,1
8514,claire please message ever need talk lonely,1
8519,lrt leave alone finally found need cry love ha...,1


In [5]:
# Transform the data into a Dataset and split it into
# training, validation and test sets (80% / 10% / 10%)

# preserve_index=False keeps the pandas index out of the Dataset altogether, so
# there is no '__index_level_0__' column to strip afterwards. Removing that
# column by name fails when the frame has a default RangeIndex, because the
# column is then never created.
dataset = Dataset.from_pandas(df, preserve_index=False)

# First hold out 20% of the rows, then cut that hold-out in half.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
test_valid_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

final_dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': test_valid_split['test'],
    'validation': test_valid_split['train']
})

In [6]:
# Load the pretrained tokenizer and a BERT sequence classifier with two labels

checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Tokenize the dataset

def tokenize_function(examples):
    """Tokenize the 'tweet' field of a batch of examples.

    Every tweet is truncated or padded to exactly 20 tokens.
    """
    tweets = examples['tweet']
    encoded = tokenizer(tweets, padding='max_length', truncation=True, max_length=20)
    return encoded

# Apply the tokenizer to every split, one batch of examples at a time.
tokenized_dataset = final_dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/8992 [00:00<?, ? examples/s]

Map:   0%|          | 0/1124 [00:00<?, ? examples/s]

Map:   0%|          | 0/1124 [00:00<?, ? examples/s]

In [8]:
# Collator that assembles tokenized examples into padded batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Make sure the logging directory exists before training starts
log_dir = './logs'
os.makedirs(log_dir, exist_ok=True)

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir=log_dir,
    # Training schedule
    num_train_epochs=2,
    per_device_train_batch_size=16,
    # Log every 10 steps; evaluate and save a checkpoint once per epoch
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    log_level="error",
)

# Trainer setup: train on the training split, evaluate on the validation split
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

In [9]:
# Fine-tune the model. The call is left as the cell's last expression so the
# returned TrainOutput (global step, training loss, metrics) is displayed.
trainer.train()

  0%|          | 0/1124 [00:00<?, ?it/s]

{'loss': 0.3562, 'grad_norm': 1.5983589887619019, 'learning_rate': 4.955516014234875e-05, 'epoch': 0.02}
{'loss': 0.3894, 'grad_norm': 2.7363717555999756, 'learning_rate': 4.911032028469751e-05, 'epoch': 0.04}
{'loss': 0.3611, 'grad_norm': 4.005592346191406, 'learning_rate': 4.8665480427046265e-05, 'epoch': 0.05}
{'loss': 0.2566, 'grad_norm': 6.693459987640381, 'learning_rate': 4.822064056939502e-05, 'epoch': 0.07}
{'loss': 0.1117, 'grad_norm': 2.9570164680480957, 'learning_rate': 4.777580071174377e-05, 'epoch': 0.09}
{'loss': 0.0795, 'grad_norm': 0.28621765971183777, 'learning_rate': 4.733096085409253e-05, 'epoch': 0.11}
{'loss': 0.0449, 'grad_norm': 7.924477577209473, 'learning_rate': 4.6886120996441285e-05, 'epoch': 0.12}
{'loss': 0.045, 'grad_norm': 0.09313103556632996, 'learning_rate': 4.644128113879004e-05, 'epoch': 0.14}
{'loss': 0.0411, 'grad_norm': 0.09053602814674377, 'learning_rate': 4.599644128113879e-05, 'epoch': 0.16}
{'loss': 0.1458, 'grad_norm': 0.2764173448085785, 'lea

  0%|          | 0/141 [00:00<?, ?it/s]

{'eval_loss': 0.032578952610492706, 'eval_runtime': 8.3741, 'eval_samples_per_second': 134.224, 'eval_steps_per_second': 16.838, 'epoch': 1.0}
{'loss': 0.0368, 'grad_norm': 0.04511674866080284, 'learning_rate': 2.4644128113879006e-05, 'epoch': 1.01}
{'loss': 0.0817, 'grad_norm': 0.024781858548521996, 'learning_rate': 2.419928825622776e-05, 'epoch': 1.03}
{'loss': 0.0022, 'grad_norm': 0.028053469955921173, 'learning_rate': 2.3754448398576516e-05, 'epoch': 1.05}
{'loss': 0.0015, 'grad_norm': 0.04837877303361893, 'learning_rate': 2.330960854092527e-05, 'epoch': 1.07}
{'loss': 0.0335, 'grad_norm': 0.04722357913851738, 'learning_rate': 2.2864768683274025e-05, 'epoch': 1.09}
{'loss': 0.001, 'grad_norm': 0.0251116082072258, 'learning_rate': 2.2419928825622775e-05, 'epoch': 1.1}
{'loss': 0.0307, 'grad_norm': 0.01736677810549736, 'learning_rate': 2.197508896797153e-05, 'epoch': 1.12}
{'loss': 0.0013, 'grad_norm': 0.0642259418964386, 'learning_rate': 2.1530249110320285e-05, 'epoch': 1.14}
{'loss

  0%|          | 0/141 [00:00<?, ?it/s]

{'eval_loss': 0.030920151621103287, 'eval_runtime': 8.9155, 'eval_samples_per_second': 126.072, 'eval_steps_per_second': 15.815, 'epoch': 2.0}
{'train_runtime': 501.1179, 'train_samples_per_second': 35.888, 'train_steps_per_second': 2.243, 'train_loss': 0.056678998426557436, 'epoch': 2.0}


TrainOutput(global_step=1124, training_loss=0.056678998426557436, metrics={'train_runtime': 501.1179, 'train_samples_per_second': 35.888, 'train_steps_per_second': 2.243, 'total_flos': 184835516390400.0, 'train_loss': 0.056678998426557436, 'epoch': 2.0})

In [10]:
# Save the fine-tuned model and its tokenizer side by side in one directory.
# The tokenizer call stays last so the list of written files is displayed.
save_dir = "bert_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('bert_model/tokenizer_config.json',
 'bert_model/special_tokens_map.json',
 'bert_model/vocab.txt',
 'bert_model/added_tokens.json')

In [34]:
# Classify an example tweet with the model and tokenizer saved in ./bert_model

classifier = pipeline(task="text-classification", model="./bert_model", tokenizer="./bert_model")
tweet = "bananarama is lonely"
result = classifier(tweet)

# The pipeline returns one prediction per input; 'LABEL_0' is the normal class
# (label 0 in the training data) and anything else is reported as lonely.
label = result[0]['label']
verdict = 'not lonely' if label == 'LABEL_0' else 'lonely'
print(f"The model classified the tweet as: {verdict}")

The model classified the tweet as: lonely
