In [1]:
import torch
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
data = pd.read_csv("Final_cleaned_genres_output.csv")

# Keep only the necessary columns
data = data[['CleanedGenre', 'Lyrics', 'Label']]

data = data.dropna(subset=['CleanedGenre', 'Lyrics'])

label_mapping = {'Positive': 0, 'Negative': 1, 'Neutral': 2}
data['label'] = data['Label'].map(label_mapping)

In [3]:
train_genres, test_genres, train_lyrics, test_lyrics, train_labels, test_labels = train_test_split(  
    data['CleanedGenre'], data['Lyrics'], data['label'],   
    test_size=0.2, random_state=42  
) 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_pair(genres, lyrics):  
    encodings = tokenizer(  
        list(genres),  
        list(lyrics),  
        padding=True,  
        truncation=True,  
        max_length=512,  
        return_token_type_ids=True 
    )  
    return encodings  

train_encodings = encode_pair(train_genres, train_lyrics)  
test_encodings = encode_pair(test_genres, test_lyrics)  

train_dataset = Dataset.from_dict({  
    'input_ids': train_encodings['input_ids'],  
    'attention_mask': train_encodings['attention_mask'],  
    'token_type_ids': train_encodings['token_type_ids'],  
    'label': train_labels  
})  

test_dataset = Dataset.from_dict({  
    'input_ids': test_encodings['input_ids'],  
    'attention_mask': test_encodings['attention_mask'],  
    'token_type_ids': test_encodings['token_type_ids'],  
    'label': test_labels  
})  



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [4]:
# Compute class weights for balanced training
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)

class_weights_dict = dict(zip(range(len(class_weights)), class_weights))

# Convert to tensor
device = 'cuda' if torch.cuda.is_available() else 'cpu'
class_weights_tensor = torch.FloatTensor(class_weights).to(device)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=3, problem_type="single_label_classification")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Calculate loss with class weights
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [7]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=16,  
    max_steps=200,                    
    per_device_eval_batch_size=16,
    warmup_steps=20,                  # Reduced warmup steps (10% of max_steps)  
    weight_decay=0.01,  
    logging_dir='./logs',  
    logging_steps=10,  
    evaluation_strategy="steps",      
    eval_steps=50,                    # Evaluate every 50 steps  
    save_strategy="steps",  
    save_steps=50,                    # Save every 50 steps  
    load_best_model_at_end=True,  
    metric_for_best_model="eval_loss",  
    greater_is_better=False,  
    learning_rate=2e-5,  
    lr_scheduler_type="cosine"  

)

early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)



In [8]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[early_stopping]
)

trainer.train()

max_steps is given, it will override any value given in num_train_epochs
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss
50,0.8548,0.687404
100,0.4707,0.562221
150,0.3756,0.603095
200,0.7423,0.673572


TrainOutput(global_step=200, training_loss=0.676328970193863, metrics={'train_runtime': 621.5622, 'train_samples_per_second': 5.148, 'train_steps_per_second': 0.322, 'total_flos': 827228585336832.0, 'train_loss': 0.676328970193863, 'epoch': 4.651162790697675})

In [9]:
results = trainer.evaluate()

# Print the evaluation results
print("Evaluation results:", results)

Evaluation results: {'eval_loss': 0.5622211694717407, 'eval_runtime': 15.1922, 'eval_samples_per_second': 11.124, 'eval_steps_per_second': 0.724, 'epoch': 4.651162790697675}


In [11]:
# Save the trained model and tokenizer
output_dir = './results/content/mymodel2'

# Save the model
trainer.save_model(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)



('./results/content/mymodel2\\tokenizer_config.json',
 './results/content/mymodel2\\special_tokens_map.json',
 './results/content/mymodel2\\vocab.txt',
 './results/content/mymodel2\\added_tokens.json')

In [12]:
def predict_sentiment(genre, lyrics, model, tokenizer):  
    # Tokenize genre and lyrics as separate inputs  
    inputs = tokenizer(  
        genre,  
        lyrics,  
        return_tensors="pt",  
        padding=True,  
        truncation=True,  
        max_length=512,  
        return_token_type_ids=True  # Important for distinguishing genre and lyrics  
    )  
    with torch.no_grad():  
        outputs = model(**inputs)  

    predicted_class = torch.argmax(outputs.logits, dim=1).item()  

    label_mapping = {0: "Positive", 1: "Negative", 2: "Neutral"}  

    predicted_label = label_mapping[predicted_class]  
    return predicted_label  


In [15]:

# Load the fine-tuned BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('results/content/mymodel2')  # or the path where your fine-tuned model is saved
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Input for testing (example)
genre_input = "Pop"  # Example genre
# lyrics_input = "I've been reading books of old	The legends and the myths	Achilles and his gold	Hercules and his gifts	Spiderman's control	And Batman with his fists	And clearly I don't see myself upon that listBut she said, where'd you wanna go?	How much you wanna risk?	I'm not looking for somebody	With some superhuman gifts	Some superhero	Some fairytale bliss	Just something I can turn to	Somebody I can kissI want something just like this	Doo-doo-doo, doo-doo-doo	Doo-doo-doo, doo-doo	Doo-doo-doo, doo-doo-doo	Oh, I want something just like this	Doo-doo-doo, doo-doo-doo	Doo-doo-doo, doo-doo	Doo-doo-doo, doo-doo-dooOh, I want something just like this	I want something just like thisI've been reading books of old	The legends and the myths	The testaments they told	The moon and its eclipse	And Superman unrolls	A suit before he lifts	But I'm not the kind of person that it fitsShe said, where'd you wanna go?	How much you wanna risk?	I'm not looking for somebody	With some superhuman gifts	Some superhero	Some fairytale bliss	Just something I can turn to	Somebody I can missI want something just like this	I want something just like thisOh, I want something just like this	Doo-doo-doo, doo-doo-doo	Doo-doo-doo, doo-doo	Doo-doo-doo, doo-doo-doo	Oh, I want something just like this	Doo-doo-doo, doo-doo-doo	Doo-doo-doo, doo-doo	Doo-doo-doo, doo-doo-dooWhere'd you wanna go?	How much you wanna risk?	I'm not looking for somebody	With some superhuman gifts	Some superhero	Some fairytale bliss	Just something I can turn to	Somebody I can kiss	I want something just like thisOh, I want something just like this	Oh, I want something just like this	Oh, I want something just like this"   # Example lyrics

lyrics_input ="no converse fake part bitches independent bitches part want paper part bitches flavored part part part part Ayy! part Bang shit hood one time Lil bitch back popping Tell ugly bitch move away need options Broke? fix pockets profit Quarter million switching part Bet bitch move old part 405 gun part Ayy still trona make plate Rich poor night choose fate Style top style night Five years rich night Drove Beamers Fig night Pushed Porsches Broadway dogging different hoes night Got chain worth Rolls night Got engine back top Nigga driving like bomb no converse fake part bitches independent bitches part want paper part bitches flavored part part part part Okay okay okay okay okay okay (That part Beggars cannot choosers bitch not Chipotle Nigga attitude feel like O'Shea Walkin' living legend man feel like Kobe left strip club got glitter Wifey going kill female OJ not feel man not okay Four Seasons take shower new clothes reloaded Rich night still eating catfish bitch not really bad catfish walk Saks Fifth paparazzi backflows lay mattress Blow back til backless Thick already established got done Yeah! Okay okay okay okay (That part Beggars cannot choosers bitch not Chipotle (That part Nigga attitude feel like O'Shea (That part Walkin' living legend man feel like Kobe (That part no converse fake part bitches independent bitches part want paper part bitches flavored part part part part Ayy! part Bang shit hood one time Lil bitch back popping Tell ugly bitch move away need options Broke? fix pockets profit million made still not part girl got matching part get slowed lose part XO thing go straight Need bitch go ways Style top style night Since lounging wanted ball night pistol drawls night broke sauce night Got Chevy side side Hundred spokes data dates Got chopper stand put homies beside no converse fake part bitches independent bitches part want paper part bitches flavored part part part part Ayy! part part part Walkin' living legend man feel like Kobe (That part dropped 60 man feel like Kobe Lamar man feel like Kobe Pippen wedding man feel like Jordan Trippin' wedding not say shit night listening close though listening hoes though would not listen flow though Listen Goat Listen young night 'Go though freestyle knew nights night ScHoolboy Q uh Top Dawg call Top Dawg Get night phone Top Dawg phone Ayy Hah!"
# Predict the sentiment label
predicted_label = predict_sentiment(genre_input, lyrics_input, model, tokenizer)

# Print the predicted label
print(f"Predicted Label: {predicted_label}")

Predicted Label: Negative
