DATA LOADING AND CLEANING

In [1]:
import numpy as np
import pandas as pd
import zipfile

In [2]:
zip_path = "D:/LLM_Based_Recommendation_System/music.zip"

# Read the dataset CSV straight out of the zip archive into `df`.
with zipfile.ZipFile(zip_path, 'r') as z:
    print(z.namelist())

    # Pick the first CSV entry instead of blindly taking entry 0, which could
    # be a directory entry or some other file stored in the archive.
    csv_names = [name for name in z.namelist() if name.lower().endswith(".csv")]
    if not csv_names:
        raise FileNotFoundError(f"No CSV file found inside {zip_path}")
    csv_filename = csv_names[0]

    # Close the member handle deterministically once pandas has read it.
    with z.open(csv_filename) as csv_file:
        df = pd.read_csv(csv_file)

['dataset.csv']


In [3]:
# Preview the first three rows of the loaded frame
df.head(3)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic


In [4]:
# Per-column NaN tally, and the same tally as a percentage of all rows
missing_count = df.isna().sum()
missing_percent = missing_count.div(len(df)).mul(100)

# Two-column report indexed by the original column names
missing_summary = missing_count.to_frame('missing_count').assign(
    missing_percent=missing_percent
)

print(missing_summary)

                  missing_count  missing_percent
Unnamed: 0                    0         0.000000
track_id                      0         0.000000
artists                       1         0.000877
album_name                    1         0.000877
track_name                    1         0.000877
popularity                    0         0.000000
duration_ms                   0         0.000000
explicit                      0         0.000000
danceability                  0         0.000000
energy                        0         0.000000
key                           0         0.000000
loudness                      0         0.000000
mode                          0         0.000000
speechiness                   0         0.000000
acousticness                  0         0.000000
instrumentalness              0         0.000000
liveness                      0         0.000000
valence                       0         0.000000
tempo                         0         0.000000
time_signature      

In [5]:
# Treat empty strings ("") as missing, then discard every row that has
# a missing value in any column
df = df.replace("", np.nan).dropna()

In [6]:
# Discard every row whose 'track_id' appears more than once
# (all copies of a repeated id are removed, not just the extras)
id_counts = df['track_id'].value_counts()
df = df.loc[~df['track_id'].map(id_counts).gt(1)]
df['track_id'].value_counts()

track_id
2hETkH7cOfqmz3LqZDHZf5    1
1iJBSr7s7jYXzM8EGcbK5b    1
6lfxq3CG4xtTiEg7opyCyx    1
5vjLSffimiIP26QG5WcN2K    1
0IktbUcnAGrvD03AWnz3Q8    1
                         ..
2sYFi9xVSZ56WHKSY2fN1K    1
7lLKxcNeJtDTWVRKHovLEC    1
25UzeaV47eDT44Fovve6xQ    1
4oa14QBfWRDfJy2agySy0L    1
2gRKq9rIC5i1zuxp06zJWH    1
Name: count, Length: 73099, dtype: int64

In [7]:
# Per-column count of missing (NaN) values in df
df.isna().sum()

Unnamed: 0          0
track_id            0
artists             0
album_name          0
track_name          0
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

In [8]:
# Keep only the first 10,000 rows of df for the cells that follow
# NOTE(review): presumably to limit embedding/upsert cost — confirm the intended sample size
df = df.head(10000)

In [9]:
import os
import time

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Get API key
load_dotenv()  # Loads variables from .env into environment
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Fail here with a clear message instead of deep inside the client
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the environment or to a .env file"
    )

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "music-list"

# Create the index if it doesn't exist.
# list_indexes() yields index descriptions rather than plain strings, so the
# membership test has to run against the index names.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding model's output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# A freshly created index is not usable immediately; wait until it reports
# ready before connecting to it.
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

# Connect to the index
index = pc.Index(index_name)

# Load embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Small & fast model


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def format_track_data(row):
    """Render one track record as newline-separated "Label: value" text.

    `row` is any mapping-like object indexable by column name (for example a
    pandas Series or a dict). The returned string is the text that gets
    embedded; it has no trailing newline.
    """
    labelled_values = [
        ("Track Name", row['track_name']),
        ("Artist(s)", row['artists']),
        ("Album Name", row['album_name']),
        ("Genre", row['track_genre']),
        ("Popularity", row['popularity']),
        # The explicit flag is spelled out as Yes/No rather than True/False
        ("Explicit", 'Yes' if row['explicit'] else 'No'),
        ("Duration (ms)", row['duration_ms']),
        ("Danceability", row['danceability']),
        ("Energy", row['energy']),
        ("Loudness", row['loudness']),
        ("Speechiness", row['speechiness']),
        ("Acousticness", row['acousticness']),
        ("Instrumentalness", row['instrumentalness']),
        ("Liveness", row['liveness']),
        ("Valence", row['valence']),
        ("Tempo", row['tempo']),
        ("Key", row['key']),
        ("Mode", row['mode']),
        ("Time Signature", row['time_signature']),
    ]
    return "\n".join(f"{label}: {value}" for label, value in labelled_values)

# Convert each track into an embedding and store it in Pinecone.
# Rows are embedded and upserted in batches, so the model runs once per batch
# and the index receives one request per batch instead of one per track.
UPSERT_BATCH_SIZE = 100  # vectors per upsert request
METADATA_FIELDS = (
    "track_name",
    "artists",
    "album_name",
    "track_genre",
    "popularity",
    "explicit",
    "duration_ms",
)

for batch_start in range(0, len(df), UPSERT_BATCH_SIZE):
    # One plain dict per row, keyed by column name
    batch_rows = df.iloc[batch_start:batch_start + UPSERT_BATCH_SIZE].to_dict("records")

    # Encode the whole batch in a single model call, then convert to lists
    batch_texts = [format_track_data(row) for row in batch_rows]
    batch_embeddings = embedding_model.encode(batch_texts).tolist()

    # (id, values, metadata) tuples; `track_id` is the unique vector ID
    vectors = [
        (
            str(row["track_id"]),
            embedding,
            {field: row[field] for field in METADATA_FIELDS},
        )
        for row, embedding in zip(batch_rows, batch_embeddings)
    ]
    index.upsert(vectors=vectors)


ProtocolError: Failed to connect; did you specify the correct index name?