In [2]:
#Install & Import Libraries
import pandas as pd

# Load only required columns
df = pd.read_csv(
    "/content/Spotify Million Song Dataset_exported.csv",
    usecols=["artist", "song", "text"]
)

# Drop missing lyrics
df.dropna(subset=["text"], inplace=True)

# Sample for faster training
df = df.sample(50000, random_state=42)

df.reset_index(drop=True, inplace=True)
print(df.shape)


(50000, 3)


In [3]:
# feature Extraction
from sklearn.preprocessing import LabelEncoder

artist_encoder = LabelEncoder()
df["artist_id"] = artist_encoder.fit_transform(df["artist"])

num_artists = df["artist_id"].nunique()
print("Number of artists:", num_artists)


Number of artists: 642


In [4]:
# Train & Testing data split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    df["text"].astype(str).to_numpy(),
    df["artist_id"].to_numpy(),
    test_size=0.2,
    random_state=42
)


In [5]:
#model training using Tensorflow
import tensorflow as tf

max_tokens = 30000
sequence_length = 200

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=sequence_length
)

# Adapt only on training data
vectorizer.adapt(X_train)

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1,), dtype=tf.string),
    vectorizer,
    tf.keras.layers.Embedding(max_tokens, 64),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(num_artists, activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [6]:
# Epoch wise training
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=128
)


Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.0031 - loss: 6.3251 - val_accuracy: 0.0059 - val_loss: 6.1178
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.0077 - loss: 6.0547 - val_accuracy: 0.0111 - val_loss: 5.9563
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.0148 - loss: 5.8361 - val_accuracy: 0.0230 - val_loss: 5.7785
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.0252 - loss: 5.6049 - val_accuracy: 0.0262 - val_loss: 5.6389
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.0363 - loss: 5.4092 - val_accuracy: 0.0343 - val_loss: 5.5553


In [13]:
# use TF-IDF matrix to predict artist and song
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf

# Build TF-IDF matrix for song retrieval (run once)
tfidf = TfidfVectorizer(max_features=20000, stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["text"])

def predict_artist_song():
    # Take user input
    lyrics = input("Enter lyrics snippet: ")

    # Wrap in a TensorFlow string tensor
    lyrics_tensor = tf.constant([lyrics])

    # Predict artist
    pred = model.predict(lyrics_tensor)
    artist_id = pred.argmax()
    artist = artist_encoder.inverse_transform([artist_id])[0]

    # Filter dataset to only that artist
    artist_df = df[df["artist"] == artist]

    # Predict song using TF-IDF similarity
    query_vec = tfidf.transform([lyrics])
    artist_vecs = tfidf_matrix[artist_df.index]
    sims = cosine_similarity(query_vec, artist_vecs)
    best_idx = artist_df.index[sims.argmax()]
    song = df.loc[best_idx, "song"]

    # Print results
    print("\nPredicted Artist:", artist)
    print("Predicted Song:", song)


In [22]:
#output
predict_artist_song()


Enter lyrics snippet: Look at her face it's a wonderful face

Predicted Artist: Foreigner
Predicted Song: Face To Face


In [23]:
# Use 1000 random samples from the dataset to test accuracy
test_df = df.sample(1000, random_state=42)


In [24]:
# Accuracy check Function
from sklearn.metrics import accuracy_score

def evaluate_model(test_df):
    y_true = test_df["artist"].tolist()
    y_pred = []

    for lyrics in test_df["text"]:
        # Transform input
        query_vec = tfidf.transform([lyrics])
        sims = cosine_similarity(query_vec, tfidf_matrix)
        best_idx = sims.argmax()
        pred_artist = df.loc[best_idx, "artist"]
        y_pred.append(pred_artist)

    acc = accuracy_score(y_true, y_pred)
    print(f"Artist Accuracy: {acc*100:.2f}%")


In [25]:
#model accuracy check
evaluate_model(test_df)


Artist Accuracy: 99.70%
