In [1]:
!pip install kaggle




In [2]:
import pandas as pd
import random
from google.colab import userdata
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [3]:
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')
!kaggle datasets download -d carlosgdcj/genius-song-lyrics-with-language-information


Downloading genius-song-lyrics-with-language-information.zip to /content
100% 3.03G/3.04G [00:35<00:00, 149MB/s]
100% 3.04G/3.04G [00:36<00:00, 90.6MB/s]


In [4]:
!unzip genius-song-lyrics-with-language-information.zip


Archive:  genius-song-lyrics-with-language-information.zip
  inflating: song_lyrics.csv         


In [5]:
# Down-sample while reading: the callable returns True for every line that must
# be skipped, so only line 0 (the header) and every 100th line after it are
# loaded, i.e. about 1% of the file (the original note put this at ~50 000 lines).
def _skip_unless_hundredth(line_number):
    return line_number % 100 != 0


df = pd.read_csv("song_lyrics.csv", skiprows=_skip_unless_hundredth)

In [22]:
# Keep only songs with a real genre tag (not 'misc') written in English, and
# reduce the frame to the three columns used for now: title, lyrics and tag.
print(df.index)
df = df[df['tag'] != 'misc']
# The 'language' column is dropped by the column selection below, so this guard
# lets the cell be run again on an already-reduced frame without a KeyError.
if 'language' in df.columns:
    df = df[df['language'] == 'en']
df = df[['title', 'lyrics', 'tag']]
df = df.reset_index(drop=True)
# Shuffle the rows with a fixed seed: the later train/test split is seeded too,
# so a fresh Restart & Run All now yields the same split and the same accuracy.
df = df.sample(frac=1, random_state=42)
print(df)
# If needed: inspect / convert column dtypes
# df.info()
# df["colonne"].dtype
# df["colonne"] = df["colonne"].astype(str)



Int64Index([24718, 12816, 20450,  4158,  7747, 30447, 27574, 30756, 11817,
            13665,
            ...
            20020, 21675,  4677,  7784, 22054, 26287, 13149, 13505,  1621,
            22415],
           dtype='int64', length=32100)
                        title  \
25840                    More   
2048        Cool and Bendable   
17666               1 by 1 XO   
13181         Cherokee Louise   
11559                Valhalla   
...                       ...   
19303           Up Like Trump   
22768           I Wont Get In   
18417        Travis Touchdown   
94     Trail Of Broken Hearts   
3068             Bonjojo Bars   

                                                  lyrics   tag  
25840  I just wanna talk about you. A little bit more...   pop  
2048   [Verse 1]\nI would like something special\nI w...   pop  
17666  Oh you think you got me\nCause my love will ne...   rap  
13181  [Verse 1]\nCherokee Louise is hiding in this t...  rock  
11559  Sleet, waves, lightning an

In [23]:

# Features are the raw lyrics text (X); the label to predict is the genre tag (Y).
X, Y = df['lyrics'], df['tag']

# Hold out 20% of the songs for evaluation; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)



In [24]:
# Genre classifier: unigram word counts -> scaling -> logistic regression.
# with_mean=False scales without centering (CountVectorizer yields a sparse matrix).
scaler = StandardScaler(with_mean=False)
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 1)),
    scaler,
    LogisticRegression(max_iter=3000),
)

# Fit on the training split, then measure accuracy on the held-out split.
accuracy = model.fit(X_train, Y_train).score(X_test, Y_test)
print("Accuracy:", accuracy)

Accuracy: 0.5404984423676013


In [53]:
from sklearn.metrics import accuracy_score
# Spot-check the trained model on a handful of songs read from the CSV with a
# different stride than the one used to build the training frame.
def _skip_for_spot_check(line_number):
    return line_number % 977 != 0  # Change here to test different values


df_test = pd.read_csv("song_lyrics.csv", skiprows=_skip_for_spot_check, nrows=10)

# Same filtering as for `df`: drop 'misc', keep English when the language column
# is present, and retain only title / lyrics / tag with a fresh 0..n-1 index.
keep_rows = df_test['tag'] != 'misc'
if 'language' in df_test.columns:
    keep_rows = keep_rows & (df_test['language'] == 'en')
df_test = df_test.loc[keep_rows, ['title', 'lyrics', 'tag']].reset_index(drop=True)

for song_name, song_lyrics, song_tag in df_test.itertuples(index=False, name=None):
    print("Song:", song_name)
    print("Tag:", song_tag)
    # predict_proba expects a collection of documents, hence the one-element list
    probabilities = model.predict_proba([song_lyrics])

    # Show only the classes whose probability is not negligible
    print("Distribution of Probabilities:")
    for class_label, probability in zip(model.classes_, probabilities[0]):
        if probability > 0.0001:
            print(f"{class_label}: {probability:.4f}")

    # Most likely class for this song, compared against the dataset's tag
    max_prob_index = probabilities.argmax()
    predicted_class = model.classes_[max_prob_index]
    if predicted_class != song_tag:
        print(f'Model failed to predict. Actual tag is {song_tag}, predicted tag is {predicted_class}')
    print()


Song: Verbal Intercourse
tag: rap
Distribution of Probabilities:
rap: 1.0000

Song: Friends and Neighbors
tag: rap
Distribution of Probabilities:
pop: 0.8131
rap: 0.0821
rb: 0.1048
Model failed to predict. Actual tag is rap, predicted tag is pop

Song: The Rabbit Hole
tag: rap
Distribution of Probabilities:
pop: 0.9994
rap: 0.0006
Model failed to predict. Actual tag is rap, predicted tag is pop

Song: Ok Youre Right
tag: rap
Distribution of Probabilities:
pop: 0.0002
rap: 0.9982
rb: 0.0016

Song: Move Back
tag: rap
Distribution of Probabilities:
rap: 1.0000

Song: Beneath the Surface
tag: rap
Distribution of Probabilities:
rap: 1.0000

Song: 7A3 Will Rock You
tag: rap
Distribution of Probabilities:
rap: 1.0000

Song: Lights Out Partys Over
tag: rap
Distribution of Probabilities:
rap: 1.0000

Song: Its On All Day
tag: rap
Distribution of Probabilities:
pop: 0.9963
rap: 0.0037
Model failed to predict. Actual tag is rap, predicted tag is pop

Song: Cameo Afro
tag: rap
Distribution of Prob