In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import re
from tqdm import tqdm

In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('spotify_dataset.csv')

# Display the first few rows to understand its structure
print(df.head())

   Unnamed: 0                track_id                 artists  \
0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2              To Begin Again          57       210826     False   


In [3]:
# Step 3: Fine-tune a BERT model for sequence classification
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [4]:
# Preprocess textual data
def preprocess_text(df, text_columns):
    for col in text_columns:
        df[col] = df[col].fillna('unknown')  # Fill missing values with 'unknown'
        df[col] = df[col].astype(str).str.lower()  # Convert to lowercase and ensure string
        df[col] = df[col].apply(lambda x: re.sub(r'[\W_]+', ' ', x))  # Remove special characters
    df['combined_text'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)
    return df

In [5]:
# Fill missing values with a placeholder string, such as 'Unknown'
df['track_genre'].fillna('Unknown', inplace=True)

# Check unique genres
unique_genres = df['track_genre'].unique()

# Create a mapping from genre to an integer
genre_to_int = {genre: i for i, genre in enumerate(unique_genres)}

# Add a new column to the dataframe with the encoded numerical values
df['track_genre_encoded'] = df['track_genre'].map(genre_to_int)

# Now df has an additional column 'track_genre_encoded' with numerical values representing the genres
print(df[['track_genre', 'track_genre_encoded']].head())

  track_genre  track_genre_encoded
0    acoustic                    0
1    acoustic                    0
2    acoustic                    0
3    acoustic                    0
4    acoustic                    0


In [6]:
genre_labels = {genre: idx for idx, genre in enumerate(df['track_genre'].unique())}

In [7]:
genre_labels

{'acoustic': 0,
 'afrobeat': 1,
 'alt-rock': 2,
 'alternative': 3,
 'ambient': 4,
 'anime': 5,
 'black-metal': 6,
 'bluegrass': 7,
 'blues': 8,
 'brazil': 9,
 'breakbeat': 10,
 'british': 11,
 'cantopop': 12,
 'chicago-house': 13,
 'children': 14,
 'chill': 15,
 'classical': 16,
 'club': 17,
 'comedy': 18,
 'country': 19,
 'dance': 20,
 'dancehall': 21,
 'death-metal': 22,
 'deep-house': 23,
 'detroit-techno': 24,
 'disco': 25,
 'disney': 26,
 'drum-and-bass': 27,
 'dub': 28,
 'dubstep': 29,
 'edm': 30,
 'electro': 31,
 'electronic': 32,
 'emo': 33,
 'folk': 34,
 'forro': 35,
 'french': 36,
 'funk': 37,
 'garage': 38,
 'german': 39,
 'gospel': 40,
 'goth': 41,
 'grindcore': 42,
 'groove': 43,
 'grunge': 44,
 'guitar': 45,
 'happy': 46,
 'hard-rock': 47,
 'hardcore': 48,
 'hardstyle': 49,
 'heavy-metal': 50,
 'hip-hop': 51,
 'honky-tonk': 52,
 'house': 53,
 'idm': 54,
 'indian': 55,
 'indie-pop': 56,
 'indie': 57,
 'industrial': 58,
 'iranian': 59,
 'j-dance': 60,
 'j-idol': 61,
 'j-pop

In [8]:
text_columns = ['artists', 'album_name', 'track_name']
data = preprocess_text(df, text_columns)

# Preprocess numerical features
numerical_columns = ['danceability', 'energy', 'loudness', 'speechiness',
                     'acousticness', 'liveness', 'instrumentalness', 'valence', 'tempo']

scaler = MinMaxScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Split data into train and validation sets
train_texts, test_texts, train_labels, test_labels = train_test_split(data['combined_text'], df['track_genre_encoded'], test_size=0.2)

In [9]:
type(data['combined_text'])

pandas.core.series.Series

In [10]:
input_text_list = ["".join(map(str, row)) for row in data['combined_text'].values]

In [11]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

class SongDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # You might use 'track_genre' as labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)



tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')    
    

labels = df['track_genre_encoded']  # Convert genres to numerical labels

encodings = tokenizer(input_text_list, truncation=True, padding=True, max_length=512)
dataset = SongDataset(encodings, labels)

# Fine-tune BERT (simplified example)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(labels)))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [12]:
# Factorize genres and keep the unique genre names
labels, unique_genres = df['track_genre'].factorize()

# Create a mapping from numerical labels back to genre names
genre_mapping = {index: genre for index, genre in enumerate(unique_genres)}

In [None]:
# Split the dataset into training and validation sets
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2)

# Create a DataLoader for both training and validation sets
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(genre_to_int))

import torch

# Check if a GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Now you can send the model to the specified device
model.to(device)


model.to(device)  # Send the model to the GPU if available

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the learning rate scheduler
epochs = 3
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader) * epochs)

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss, total_accuracy = 0, 0
    
    # Train the data for one epoch
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    
    # Validation loop
    model.eval()
    total_eval_accuracy = 0
    
    for batch in tqdm(val_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():        
            outputs = model(**batch)
        
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = batch['labels'].to('cpu').numpy()
        
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    
    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Training loss: {avg_train_loss}")
    print(f"Validation accuracy: {avg_val_accuracy}")


In [13]:
# Load the saved model state dictionary with mapping to CPU
model.load_state_dict(torch.load('mymodel1', map_location=torch.device('cpu')), strict=False)

_IncompatibleKeys(missing_keys=['bert.embeddings.position_ids'], unexpected_keys=[])

In [14]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
def recommend_song(query):
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    
    
    genre_predicted = genre_mapping[predictions.item()]
    
    # Filter the DataFrame for the predicted genre and choose the top 5 popular songs
    recommended_songs = df[df['track_genre'] == genre_predicted].sort_values(by='popularity', ascending=False).head(5)
    
    # Return the top 5 songs as a list of tuples (track_name, artist)
    return [(row['track_name'], row['artists']) for index, row in recommended_songs.iterrows()]

# Interactive loop to get recommendations
while True:
    query = input("Enter your music preference or 'exit' to quit: ")
    if query.lower() == 'exit':
        break
    
    top_songs = recommend_song(query)
    print("\nHere are some top 5 recommendations for you:")
    for i, (song, artist) in enumerate(top_songs, start=1):
        #print(f"{i}. {song} by {artist}")
        print(f"{i}. \033[1m{song}\033[0m by {artist}\n")



Enter your music preference or 'exit' to quit: pop songs

Here are some top 5 recommendations for you:
1. [1mi wanna be yours[0m by arctic monkeys

2. [1mbillie eilish [0m by armani white

3. [1mi love you so[0m by the walters

4. [1mdo i wanna know [0m by arctic monkeys

5. [1m505[0m by arctic monkeys

Enter your music preference or 'exit' to quit: pop songs of billie eilish

Here are some top 5 recommendations for you:
1. [1mone kiss with dua lipa [0m by calvin harris dua lipa

2. [1mnumb[0m by marshmello khalid

3. [1mbad decisions with bts snoop dogg [0m by benny blanco bts snoop dogg

4. [1mbelly dancer[0m by imanbek byor

5. [1meverything i wanted[0m by billie eilish

