In [1]:
import pandas as pd
import re
import numpy as np
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tr21\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

  if not hasattr(np, "object"):


In [4]:
import joblib

In [5]:
# English stop words from the NLTK corpus, stored as a set so the membership
# test in the cleaning helpers is a hash lookup rather than a list scan.
STOPWORDS = set(stopwords.words("english"))

In [6]:
# Read the raw song table and normalise header spelling (trim + lowercase)
# so the column selection below does not depend on the CSV's exact casing.
df = pd.read_csv("songs.csv").rename(columns=lambda name: name.strip().lower())

# Keep only the fields used later, drop rows missing any of them, and expose
# the song-title column under the shorter name 'title'.
df = (
    df[['artist', 'song title', 'lyric']]
    .dropna()
    .rename(columns={'song title': 'title'})
)

In [7]:

def clean_text(text, stop_words=None, ascii_only=True):
    """Normalise raw lyric text into a space-separated string of content words.

    The text is lower-cased, split into runs of letters, and every token that
    is a stop word or only one character long is discarded.

    Parameters
    ----------
    text : object
        Raw text; converted with ``str()`` first, so non-string values are
        accepted.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to the module-level ``STOPWORDS``.
    ascii_only : bool, default True
        ``True`` keeps the historical behaviour: only ``a``-``z`` count as
        letters, so accented or non-Latin characters act as separators
        (``"dünyamı"`` becomes ``"nyam"``). ``False`` treats every Unicode
        letter as part of a word, which preserves non-English lyrics. Use
        the same setting for the training data and for query snippets.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    if stop_words is None:
        stop_words = STOPWORDS
    # Tokens are maximal runs of letters; digits, punctuation, underscores and
    # whitespace all act as separators.
    letters = r"[a-z]+" if ascii_only else r"[^\W\d_]+"
    tokens = re.findall(letters, str(text).lower())
    return " ".join(t for t in tokens if len(t) > 1 and t not in stop_words)

In [8]:

# Store a cleaned copy of every lyric next to the raw text, then show two
# rows as a quick sanity check.
df['lyric_clean'] = df['lyric'].map(clean_text)
print("Data loaded and cleaned. Sample:")
print(df.head(2))

Data loaded and cleaned. Sample:
         artist              title  \
0  ...AAAARRGHH  _Gecenin_G__lgesi   
1  ...AAAARRGHH        _Son___afak   

                                               lyric  \
0  Kara bulutlar sardı yine dünyamı\r\nKış yerleş...   
1  Dolunay parlak görünmüyor bu gece\r\nBenim top...   

                                         lyric_clean  
0  kara bulutlar sard yine nyam yerle ti ruhuma n...  
1  dolunay parlak nm yor bu gece benim toprak dol...  


In [9]:
# Encode each artist name as an integer class id for the classifier.
le = LabelEncoder()
le.fit(df['artist'])
df['artist_label'] = le.transform(df['artist'])
num_classes = le.classes_.size
print(f"Number of unique artists: {num_classes}")

Number of unique artists: 7457


In [10]:
# Hold out 20% of the songs for testing; the fixed seed keeps the split
# identical across runs.
split = train_test_split(
    df['lyric_clean'],
    df['artist_label'],
    test_size=0.2,
    random_state=42,
)
train_texts, test_texts, train_labels, test_labels = split

In [11]:
MAX_WORDS = 20000   # passed to the tokenizer as num_words and to the embedding as input_dim
MAX_LEN = 100       # length every token sequence is brought to by pad_sequences

# The vocabulary is learned from the training split only.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)


def _to_padded_ids(texts):
    """Convert texts to integer id sequences, padded at the end to MAX_LEN."""
    return pad_sequences(
        tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN, padding='post'
    )


X_train = _to_padded_ids(train_texts)
X_test = _to_padded_ids(test_texts)

y_train = np.array(train_labels)
y_test = np.array(test_labels)

print("Tokenization and padding done.")
print("X_train shape:", X_train.shape)

Tokenization and padding done.
X_train shape: (159993, 100)


In [12]:
# Artist classifier: embedding -> LSTM -> dropout -> softmax over all artists.
# The input shape is declared with an explicit Input layer instead of the
# Embedding(input_length=...) argument, which Keras 3 deprecates and ignores.
# Declaring the input also builds the model immediately, so summary() can
# report output shapes and parameter counts before training.
model = Sequential([
    tf.keras.Input(shape=(MAX_LEN,), dtype="int32"),
    Embedding(input_dim=MAX_WORDS, output_dim=128),
    LSTM(128),
    Dropout(0.3),
    Dense(num_classes, activation='softmax'),
])

# Labels are integer class ids (not one-hot), hence the sparse loss variant.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [13]:
# Train for a fixed number of epochs, holding back 10% of the training data
# for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.1,
)

print("Training complete.")

Epoch 1/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 148ms/step - accuracy: 0.0053 - loss: 8.3699 - val_accuracy: 0.0081 - val_loss: 8.0499
Epoch 2/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m332s[0m 148ms/step - accuracy: 0.0095 - loss: 7.8357 - val_accuracy: 0.0108 - val_loss: 7.7017
Epoch 3/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 148ms/step - accuracy: 0.0143 - loss: 7.4762 - val_accuracy: 0.0164 - val_loss: 7.4609
Epoch 4/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 149ms/step - accuracy: 0.0200 - loss: 7.1421 - val_accuracy: 0.0205 - val_loss: 7.2201
Epoch 5/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 149ms/step - accuracy: 0.0280 - loss: 6.8309 - val_accuracy: 0.0261 - val_loss: 7.0649
Training complete.


In [14]:
# Score the held-out split; evaluate() yields the loss followed by the
# compiled metric (accuracy).
test_scores = model.evaluate(X_test, y_test)
loss, acc = test_scores
print(f"Test Accuracy (Artist Prediction): {acc:.4f}")

[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 27ms/step - accuracy: 0.0259 - loss: 7.0540
Test Accuracy (Artist Prediction): 0.0259


In [15]:
# Index every cleaned lyric with uni- and bi-gram TF-IDF features so that
# snippets can later be matched against songs by cosine similarity.
tfidf_vectorizer = TfidfVectorizer(max_features=200000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(df['lyric_clean'])

# Row i of df_meta describes row i of X_tfidf (index reset to 0..n-1).
df_meta = df.loc[:, ['artist', 'title']].reset_index(drop=True)
print("TF-IDF indexing done.")

TF-IDF indexing done.


In [16]:
def clean_input(snippet):
    """Clean a query snippet exactly the way the training lyrics were cleaned.

    Delegates to ``clean_text`` instead of repeating its regexes, so the
    preprocessing applied at inference time cannot drift from the one used
    to build the tokenizer vocabulary and the TF-IDF index.

    Parameters
    ----------
    snippet : object
        Raw query text; anything ``str()`` can convert.

    Returns
    -------
    str
        Space-separated content words (possibly empty).
    """
    return clean_text(snippet)


In [17]:
def predict_artist(snippet):
    """Return the artist name the classifier scores highest for a snippet.

    The snippet is cleaned, converted to token ids with the fitted tokenizer
    and padded to ``MAX_LEN`` before being passed to the model.
    """
    cleaned = clean_input(snippet)
    padded = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=MAX_LEN,
        padding='post',
    )
    probabilities = model.predict(padded, verbose=0)
    best_class = np.argmax(probabilities)
    # Map the integer class id back to the original artist string.
    (artist,) = le.inverse_transform([best_class])
    return artist

In [18]:
def predict_artist_and_song(snippet):
    """Guess the artist of a snippet, then pick that artist's closest song.

    Returns a dict with keys ``"artist"`` and ``"song"``; ``"song"`` is
    ``None`` when the predicted artist has no rows in ``df_meta``.
    """
    predicted = predict_artist(snippet)
    is_artist = df_meta['artist'] == predicted
    candidates = df_meta[is_artist].index

    if len(candidates) > 0:
        # Compare the snippet only against the predicted artist's songs.
        query_vec = tfidf_vectorizer.transform([clean_input(snippet)])
        scores = cosine_similarity(query_vec, X_tfidf[candidates]).ravel()
        winner = candidates[np.argmax(scores)]
        return {
            "artist": df_meta.loc[winner, "artist"],
            "song": df_meta.loc[winner, "title"],
        }

    return {"artist": predicted, "song": None}

In [19]:
def top_k_songs(snippet, k=5):
    """Rank the predicted artist's songs by TF-IDF similarity to a snippet.

    Parameters
    ----------
    snippet : str
        Free-text lyric fragment.
    k : int, default 5
        Maximum number of songs to return.

    Returns
    -------
    list of dict
        Up to ``k`` entries ordered from most to least similar, each with the
        keys ``"artist"``, ``"song"`` and ``"score"``. The list is empty when
        ``k`` is not positive or the predicted artist has no rows in
        ``df_meta``.
    """
    # Guard first: argsort()[-k:] with k == 0 slices [-0:], i.e. the WHOLE
    # array, and a negative k drops the lowest entries instead of selecting
    # the top ones. Neither is a valid "top k".
    if k <= 0:
        return []

    artist_pred = predict_artist(snippet)
    artist_songs_idx = df_meta[df_meta['artist'] == artist_pred].index
    # Same guard as predict_artist_and_song: nothing to rank without songs.
    if len(artist_songs_idx) == 0:
        return []

    snippet_vec = tfidf_vectorizer.transform([clean_input(snippet)])
    sims = cosine_similarity(snippet_vec, X_tfidf[artist_songs_idx]).ravel()
    # Ascending argsort -> take the last k positions -> reverse for best-first.
    top_idx = sims.argsort()[-k:][::-1]

    results = []
    for idx in top_idx:
        # idx is a position within the artist's subset; map it back to the
        # row label shared by df_meta and X_tfidf.
        real_idx = artist_songs_idx[idx]
        results.append({
            "artist": df_meta.loc[real_idx, "artist"],
            "song": df_meta.loc[real_idx, "title"],
            "score": float(sims[idx]),
        })
    return results

In [21]:
# Smoke test: run the full artist + song lookup on one well-known lyric line.
snippet = "hello from the other side i must have called a thousand times"
result = predict_artist_and_song(snippet)

print("\nSingle Prediction:")
for label, key in (("Predicted Artist:", "artist"), ("Predicted Song:", "song")):
    print(label, result[key])



Single Prediction:
Predicted Artist: UNHOLY GRAVE
Predicted Song: Extreme Stupidities
