<a href="https://colab.research.google.com/github/sbarreto10/data-science-2022/blob/main/SPOTIFY%20DATASET%20(TP3)/75_06_TP_3_MULTILABEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import OneHotEncoder
pd.options.mode.chained_assignment = None
!pip install scikit-multilearn

import string
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from tqdm import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **LECTURA**

In [None]:
# Read the train and test splits from their parquet files.
trainDf, testDf = (
    pd.read_parquet(parquetPath)
    for parquetPath in (
        "/content/drive/MyDrive/tp3/train.parquet",
        "/content/drive/MyDrive/tp3/test.parquet",
    )
)

In [None]:
# Relabel the first row tagged "Children's Music" (straight apostrophe) as
# "Children’s Music" (typographic apostrophe).
# A single .loc assignment is used instead of chained indexing
# (trainDf["genre"][idx] = ...), which only works when pandas happens to write
# through the intermediate Series and is a silent no-op under Copy-on-Write.
childrenRows = trainDf.index[(trainDf["genre"] == "Children's Music").to_numpy()]
if len(childrenRows) > 0:  # nothing to relabel (e.g. the cell was already run)
    trainDf.loc[childrenRows[0], "genre"] = "Children’s Music"

In [None]:
def multi_label(df, fitting=()):
    """Collapse one-row-per-(track, genre) data into one row per track with 0/1 genre columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "track_name", "artist" and "genre".
    fitting : iterable of str, optional
        Genre names that must exist as columns in the result; any that are
        absent from ``df`` are added as all-zero columns.

    Returns
    -------
    (pandas.DataFrame, pandas.Index)
        The de-duplicated frame (first row kept per (track_name, artist) pair,
        with an added "id" column and one indicator column per genre), and the
        index of genre column names.

    The input frame is left untouched: all work happens on a copy.
    """
    # Work on a copy so the caller's frame does not silently gain an "id" column.
    df = df.copy()
    # zip builds the (track_name, artist) key without row-wise apply, which is
    # slow and returns a DataFrame instead of a Series on an empty frame.
    df["id"] = list(zip(df["track_name"], df["artist"]))

    # One marker per (id, genre) occurrence; pivoting spreads genres into columns.
    membership = df[["id", "genre"]].copy()
    membership["1"] = 1
    piv = membership.pivot_table(columns="genre", index="id", values="1", fill_value=0)
    for g in fitting:
        if g not in piv.columns:
            piv[g] = 0

    df = df.drop_duplicates(subset=["id"], keep='first')

    return df.merge(piv, on="id"), piv.columns

In [None]:
# Build the multi-label frames. The test frame is aligned to the training
# genres so that every training genre exists as a column in it.
trainDf, trainGenres = multi_label(trainDf)
testDf, testGenreColumns = multi_label(testDf, fitting=trainGenres)
genreList = testGenreColumns.tolist()

# **SPLIT**

In [None]:
# Hold out 20% of the artists (not of the rows) for validation, so that no
# artist appears in both the training and the validation frame.
random.seed(3)
artistList = trainDf.artist.unique().tolist()
validationArtists = random.sample(artistList, int(0.2*len(artistList)))
heldOutMask = trainDf["artist"].isin(validationArtists)
trainDf, valDf = trainDf[~heldOutMask], trainDf[heldOutMask]

# **PREPROCESAMIENTO**

In [None]:
# Lookup collections used by the lyric feature helpers:
# English + Spanish stopwords, and the individual punctuation / digit characters.
stopwordsSpEn = {*stopwords.words('english'), *stopwords.words('spanish')}
sPunctuations = [*string.punctuation]
sDigits = [*string.digits]

In [None]:
def str_type_count(x, tset):
    """Count the whitespace-separated tokens of ``x`` that are members of ``tset``."""
    return sum(1 for token in x.split() if token in tset)

def digit_count(x):
    """Count the characters of ``x`` that appear in the module-level ``sDigits``."""
    total = 0
    for ch in x:
        if ch in sDigits:
            total += 1
    return total

def word_max(x):
    """Length of the longest token produced by ``word_tokenize(x)`` (0 if there are none)."""
    tokens = word_tokenize(x)
    if not tokens:
        return 0
    return max(map(len, tokens))

def word_min(x):
    """Length of the shortest non-stopword token in ``x``.

    Tokens come from ``word_tokenize``; those found in ``stopwordsSpEn`` are
    ignored (the comparison is case-sensitive, as the tokens are not lowercased).

    Returns 0 when no token survives the filter. This covers both an empty text
    and a text made up only of stopwords; the latter used to raise ValueError
    because ``min`` was called on an empty list.
    """
    lengths = [len(w) for w in word_tokenize(x) if w not in stopwordsSpEn]
    return min(lengths, default=0)

def word_mean(x):
    """Mean token length over ``word_tokenize(x)`` (0 if there are no tokens)."""
    tokens = word_tokenize(x)
    if not tokens:
        return 0
    return np.mean(list(map(len, tokens)))

def sent_max(x):
    """Character length of the longest sentence from ``sent_tokenize(x)`` (0 if none)."""
    sentences = sent_tokenize(x)
    if not sentences:
        return 0
    return max(map(len, sentences))

def sent_min(x):
    """Character length of the shortest sentence from ``sent_tokenize(x)`` (0 if none)."""
    sentences = sent_tokenize(x)
    if not sentences:
        return 0
    return min(map(len, sentences))

def sent_mean(x):
    """Mean character length of the sentences from ``sent_tokenize(x)`` (0 if none)."""
    sentences = sent_tokenize(x)
    if not sentences:
        return 0
    return np.mean(list(map(len, sentences)))

def preprocess(df):
    """Drop unused columns, impute nulls and derive lyric / mean-encoding features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns dropped below plus "lyric", "language",
        "s-label", "mode", "key", "time_signature" and "a_popularity".

    Returns
    -------
    pandas.DataFrame
        A new frame (``drop`` returns a copy; the input is not modified) with
        the imputed columns and the added feature columns.

    Every statistic used here (the "s-label" mean and the per-category
    "a_popularity" means) is computed from ``df`` itself.
    NOTE(review): when this is called once per split, validation/test rows are
    encoded with their own statistics rather than the training ones; confirm
    that this is intended.
    """
    # Drop identifiers and columns that are not used as features.
    df = df.drop(columns = ["id","track_name","did","artist","a_genres","a_songs"])

    # Null imputation.
    s_labelMean = df["s-label"].mean()
    df["lyric"] = df["lyric"].fillna("").astype(str)
    # Missing language (None or NaN) becomes the catch-all code "ot".
    df["language"] = df["language"].map(lambda x: "ot" if pd.isna(x) else x)
    df["s-label"] = df["s-label"].fillna(s_labelMean)
    # Binary flag: 1 for "Major", 0 for anything else.
    df["mode"] = (df["mode"] == "Major").astype(int)

    # Features derived from the lyrics.
    df["lyricCharCount"] = df.lyric.map(len)
    df["lyricWordCount"] = df.lyric.map(lambda x: len(word_tokenize(x)))
    df["lyricUniqueWordCount"] = df.lyric.map(lambda x: len(set(x.split())))
    df["lyricSentenceCount"] = df.lyric.map(lambda x: len(sent_tokenize(x)))
    df["lyricUniqueSentenceCount"] = df.lyric.map(lambda x: len(set(sent_tokenize(x))))
    df["lyricDigitCount"] = df.lyric.map(digit_count)
    df["lyricStopwordCount"] = df.lyric.map(lambda x: str_type_count(x, stopwordsSpEn))
    df["lyricPunctuationCount"] = df.lyric.map(lambda x: str_type_count(x, sPunctuations))
    df["lyricLongestWordLen"] = df.lyric.map(word_max)
    df["lyricShortestWordLen"] = df.lyric.map(word_min)
    df["lyricWordLenMean"] = df.lyric.map(word_mean)
    df["lyricLongestSentenceLen"] = df.lyric.map(sent_max)
    df["lyricShortestSentenceLen"] = df.lyric.map(sent_min)
    df["lyricSentenceLenMean"] = df.lyric.map(sent_mean)

    # Mean encoding of a_popularity with respect to each categorical column.
    # Only a_popularity is aggregated: averaging the whole frame is wasted work
    # and fails on text columns in pandas versions that no longer drop them.
    meanEncodedColumns = {
        "language": "a_popularityMeanByLanguage",
        "key": "a_popularityMeanByKey",
        "time_signature": "a_popularityMeanByTimeSignature",
        "mode": "a_popularityMeanByMode",
    }
    for catCol, encodedCol in meanEncodedColumns.items():
        groupMeans = df.groupby(catCol)["a_popularity"].mean()
        # Mapping through the Series yields NaN for a null category
        # instead of raising KeyError.
        df[encodedCol] = df[catCol].map(groupMeans)

    return df

In [None]:
# Run the same preprocessing on each split independently.
trainDf, valDf, testDf = map(preprocess, (trainDf, valDf, testDf))

In [None]:
def normalized(column):
    """Standardize a column to zero mean and unit (sample) standard deviation.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to standardize.

    Returns
    -------
    pandas.Series
        ``(column - mean) / std``. When the standard deviation is zero or
        undefined, ``0 * column`` is returned instead.
    """
    colStd = column.std()
    # std() is NaN when fewer than two values are available; treating that like a
    # constant column avoids turning the whole feature into NaN.
    if pd.isna(colStd) or colStd == 0:
        return 0 * column
    return (column - column.mean()) / colStd

def normalize_df(df, featureList):
    """Standardize every column named in ``featureList`` in place and return ``df``.

    Each column is replaced by the result of ``normalized`` applied to it, so
    the statistics used are those of ``df`` itself.
    """
    for featureName in featureList:
        standardized = normalized(df[featureName])
        df[featureName] = standardized
    return df

def one_hot_encode(df, catCols):
    """Append one-hot indicator columns for ``catCols`` to ``df`` and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    catCols : list of str
        Names of the categorical columns to encode. The original columns are kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with one new column per (column, category) pair,
        named by the encoder (e.g. "<column>_<category>").

    The encoder is fitted on ``df`` itself, so the set of indicator columns
    depends on the categories present in this particular frame.
    """
    # The default (sparse) output is densified explicitly rather than passing
    # `sparse=False`, whose keyword was renamed across scikit-learn versions.
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded = encoder.fit_transform(df[catCols])
    # get_feature_names_out replaces the removed get_feature_names and matches
    # the vectorizer call used later in this notebook.
    encFts = list(encoder.get_feature_names_out(catCols))
    df[encFts] = encoded.toarray()
    return df

In [None]:
# One-hot encode the categorical columns of each split (each frame is encoded
# independently by one_hot_encode).
catCols = ["language","time_signature","key"]
trainDf, valDf, testDf = (
    one_hot_encode(splitDf, catCols) for splitDf in (trainDf, valDf, testDf)
)



In [None]:
def fix_nan_lyrics(df):
    """Replace every non-string entry of the "lyric" column with an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "lyric" column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Map over the single column instead of applying a function to every full
    # row. isinstance keeps str subclasses, which an exact type comparison
    # would have blanked.
    df["lyric"] = df["lyric"].map(lambda v: v if isinstance(v, str) else "")
    return df

In [None]:
# Make sure the lyric column holds only strings in every split.
trainDf, valDf, testDf = map(fix_nan_lyrics, (trainDf, valDf, testDf))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the 50 most frequent non-stopword terms, with the
# vocabulary learned from the training lyrics only.
# stop_words is passed as a list: scikit-learn documents it as 'english', a
# list or None, and stopwordsSpEn is a set.
countIDF = CountVectorizer(lowercase=True, stop_words=sorted(stopwordsSpEn), max_features=50)
countIDF.fit(trainDf.lyric)
ftNames = ["word"+w.capitalize() for w in countIDF.get_feature_names_out()]

# Add one "word<Term>" count column per vocabulary term to every split (in place).
for df in [trainDf, valDf, testDf]:
    wordMatrix = countIDF.transform(df.lyric)
    # toarray() gives a plain ndarray, whereas todense() returns np.matrix.
    df[ftNames] = pd.DataFrame(wordMatrix.toarray(), columns=ftNames, index=df.index)

In [None]:
# Model features: every training column except the genre targets and the
# identifier, raw-text and raw categorical columns listed here.
nonFeatureCols = set(genreList) | {"id","genre","lyric","language","time_signature","key"}
featureList = [col for col in trainDf if col not in nonFeatureCols]

In [None]:
# Some categories present in trainDf are absent from valDf or testDf.
# Since one-hot encoding did not create those columns there, add them filled with zeros.
for splitDf in (valDf, testDf):
    for f in featureList:
        if f not in splitDf:
            splitDf[f] = 0

In [None]:
# Standardize the feature columns of each split (each frame uses its own statistics).
trainDf, valDf, testDf = (
    normalize_df(splitDf, featureList) for splitDf in (trainDf, valDf, testDf)
)

In [None]:
# Feature matrices (X_*) and multi-label genre targets (y_*) for each split.
X_train, X_val, X_test = (splitDf[featureList] for splitDf in (trainDf, valDf, testDf))
y_train, y_val, y_test = (splitDf[genreList] for splitDf in (trainDf, valDf, testDf))

# **ESTIMADOR MULTILABEL**

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Baseline multi-label model: a binary-logistic XGBoost classifier wrapped in
# MultiOutputClassifier, fitted on the training split before any tuning.
xgbEstimator = XGBClassifier(objective='binary:logistic', seed=10)
multilabelModel = MultiOutputClassifier(xgbEstimator)
# Last expression of the cell: the fitted model is also what the cell displays.
multilabelModel.fit(X_train,y_train)

MultiOutputClassifier(estimator=XGBClassifier(seed=10))

In [None]:
#@title TUNEO DEL MODELO
# Randomized search (12 candidates, 3-fold CV, subset accuracy) over the
# hyperparameters of the XGBoost estimator wrapped by the multi-output model.
hpar_space = [
    {
        "estimator": [xgbEstimator],
        # `learning_rate` is the scikit-learn wrapper's name for xgboost's eta.
        # The grid starts at 0.025 instead of 0: with a zero learning rate the
        # added trees contribute nothing, so that candidate cannot learn.
        "estimator__learning_rate": np.arange(1, 20) * 0.025,
        "estimator__max_depth": np.arange(3, 10),
        "estimator__n_estimators": np.arange(3, 15),
        "estimator__min_child_weight": np.arange(0, 10)
    },
]

search = RandomizedSearchCV(multilabelModel, hpar_space, cv=3, n_iter=12, scoring='accuracy', verbose=10, random_state=420)
result = search.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3; 1/12] START estimator=XGBClassifier(seed=10), estimator__eta=0.325, estimator__max_depth=5, estimator__min_child_weight=2, estimator__n_estimators=4
[CV 1/3; 1/12] END estimator=XGBClassifier(seed=10), estimator__eta=0.325, estimator__max_depth=5, estimator__min_child_weight=2, estimator__n_estimators=4;, score=0.083 total time=   5.7s
[CV 2/3; 1/12] START estimator=XGBClassifier(seed=10), estimator__eta=0.325, estimator__max_depth=5, estimator__min_child_weight=2, estimator__n_estimators=4
[CV 2/3; 1/12] END estimator=XGBClassifier(seed=10), estimator__eta=0.325, estimator__max_depth=5, estimator__min_child_weight=2, estimator__n_estimators=4;, score=0.144 total time=   5.4s
[CV 3/3; 1/12] START estimator=XGBClassifier(seed=10), estimator__eta=0.325, estimator__max_depth=5, estimator__min_child_weight=2, estimator__n_estimators=4
[CV 3/3; 1/12] END estimator=XGBClassifier(seed=10), estimator__eta=0.325, estimator__m

In [None]:
# Keep the best configuration found by the search and fit it on the training split.
# NOTE(review): RandomizedSearchCV is documented to refit best_estimator_ by default
# (refit=True) and the search above does not override it, so this second fit
# looks redundant; confirm before removing it.
multilabelModel = result.best_estimator_
multilabelModel.fit(X_train,y_train)

MultiOutputClassifier(estimator=XGBClassifier(eta=0.0, max_depth=9,
                                              min_child_weight=6,
                                              n_estimators=14, seed=10))

In [None]:
# Score of the tuned model on the held-out validation artists.
bestValScore = multilabelModel.score(X_val, y_val)
print(f"Mejor score de validación: {bestValScore!s}")

Mejor score de validación: 0.1653517422748192


In [None]:
# Score of the tuned model on the test split.
bestTestScore = multilabelModel.score(X_test, y_test)
print(f"Score en test: {bestTestScore!s}")

Score en test: 0.15163398692810456
