<a href="https://colab.research.google.com/github/sbarreto10/data-science-2022/blob/main/SPOTIFY%20DATASET%20(TP3)/75_06_TP_3_ENSAMBLE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LECTURA**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
pd.options.mode.chained_assignment = None

import string
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
# Combined English + Spanish stopword set, used for membership tests on lyric tokens.
stopwordsSpEn = set(stopwords.words('english')) | set(stopwords.words('spanish'))
# Individual punctuation / digit characters, kept as lists of one-character strings.
sPunctuations = [*string.punctuation]
sDigits = [*string.digits]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the train and test splits from the mounted Drive folder.
trainDf, testDf = (
    pd.read_parquet(path)
    for path in (
        "/content/drive/MyDrive/tp3/train.parquet",
        "/content/drive/MyDrive/tp3/test.parquet",
    )
)

# **SPLIT**

In [None]:
# Hold out 20% of the artists (not of the rows) for validation, so that no artist
# appears on both sides of the split. The seed keeps the sample reproducible.
random.seed(3)
artistList = trainDf.artist.unique().tolist()
validationArtists = random.sample(artistList, int(0.2*len(artistList)))
# valDf must be taken first: trainDf is overwritten on the next line.
valDf = trainDf.query("artist in @validationArtists")
trainDf = trainDf.query("artist not in @validationArtists")

# **PREPROCESAMIENTO**

In [None]:
# Genres present in the training split, in order of first appearance.
genreList = trainDf.genre.unique().tolist()

In [None]:
def str_type_count(x, tset):
    """Count the whitespace-separated tokens of ``x`` that are members of ``tset``."""
    return sum(1 for token in x.split() if token in tset)

def digit_count(x):
    """Count the characters of ``x`` that appear in ``sDigits``."""
    return sum(1 for ch in x if ch in sDigits)

def word_max(x):
    """Return the length of the longest ``word_tokenize`` token in ``x`` (0 if none)."""
    lengths = [len(token) for token in word_tokenize(x)]
    if not lengths:
        return 0
    return max(lengths)

def word_min(x):
    """Return the length of the shortest non-stopword token in ``x``.

    Tokens come from ``word_tokenize``; tokens found in ``stopwordsSpEn`` are ignored.
    Returns 0 when there are no tokens at all, and also when every token is a
    stopword (that case previously raised ``ValueError`` from ``min()`` on an
    empty list).
    """
    lengths = [len(w) for w in word_tokenize(x) if w not in stopwordsSpEn]
    return min(lengths) if lengths else 0

def word_mean(x):
    """Return the mean ``word_tokenize`` token length of ``x`` (0 if there are no tokens)."""
    lengths = [len(token) for token in word_tokenize(x)]
    return np.mean(lengths) if lengths else 0

def sent_max(x):
    """Return the character length of the longest ``sent_tokenize`` sentence in ``x`` (0 if none)."""
    sentences = sent_tokenize(x)
    if len(sentences) == 0:
        return 0
    return max(map(len, sentences))

def sent_min(x):
    """Return the character length of the shortest ``sent_tokenize`` sentence in ``x`` (0 if none)."""
    sentences = sent_tokenize(x)
    if len(sentences) == 0:
        return 0
    return min(map(len, sentences))

def sent_mean(x):
    """Return the mean character length of the ``sent_tokenize`` sentences in ``x`` (0 if none)."""
    lengths = [len(sentence) for sentence in sent_tokenize(x)]
    if not lengths:
        return 0
    return np.mean(lengths)

def preprocess(df):
    """Clean a track DataFrame and derive lyric and mean-encoding features.

    Steps, in order:
      1. Drop the identifier / artist columns and normalise the genre label
         "Children's Music" (straight apostrophe) to "Children’s Music".
      2. Impute missing values: empty string for ``lyric``, ``"ot"`` for a
         ``None`` ``language``, the column mean for ``s-label``; encode ``mode``
         as 1 for "Major" and 0 otherwise.
      3. Add count / length features computed from ``lyric``.
      4. Add the mean of ``a_popularity`` per ``language``, ``key``,
         ``time_signature`` and ``mode`` (computed on ``df`` itself).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns referenced below (``track_name``, ``did``,
        ``artist``, ``a_genres``, ``a_songs``, ``s-label``, ``lyric``,
        ``language``, ``mode``, ``key``, ``time_signature``, ``a_popularity``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra feature columns; the input is not modified.
    """
    # FIXES AND DROPS
    # drop() returns a new frame, so every assignment below leaves the caller's df untouched.
    df = df.drop(columns = ["track_name","did","artist","a_genres","a_songs"])
    if "genre" in df.columns:
        # Relabel every affected row (not just the first one); a no-op when none match.
        df["genre"] = df["genre"].replace("Children's Music", "Children’s Music")

    # NULL IMPUTATION
    s_labelMean = df["s-label"].mean()
    df["lyric"] = df["lyric"].fillna("").astype(str)
    df["language"] = df["language"].map(lambda x: "ot" if x is None else x)
    df["s-label"] = df["s-label"].fillna(s_labelMean)
    df["mode"] = df["mode"].map(lambda m: int(m=="Major"))

    # FEATURES DERIVED FROM THE LYRICS
    df["lyricCharCount"] = df.lyric.map(len)
    df["lyricWordCount"] = df.lyric.map(lambda x: len(word_tokenize(x)))
    df["lyricUniqueWordCount"] = df.lyric.map(lambda x: len(set(x.split())))
    df["lyricSentenceCount"] = df.lyric.map(lambda x: len(sent_tokenize(x)))
    df["lyricUniqueSentenceCount"] = df.lyric.map(lambda x: len(set(sent_tokenize(x))))
    df["lyricDigitCount"] = df.lyric.map(digit_count)
    df["lyricStopwordCount"] = df.lyric.map(lambda x: str_type_count(x, stopwordsSpEn))
    df["lyricPunctuationCount"] = df.lyric.map(lambda x: str_type_count(x, sPunctuations))
    df["lyricLongestWordLen"] = df.lyric.map(word_max)
    df["lyricShortestWordLen"] = df.lyric.map(word_min)
    df["lyricWordLenMean"] = df.lyric.map(word_mean)
    df["lyricLongestSentenceLen"] = df.lyric.map(sent_max)
    df["lyricShortestSentenceLen"] = df.lyric.map(sent_min)
    df["lyricSentenceLenMean"] = df.lyric.map(sent_mean)

    # Mean encoding of a_popularity with respect to the categorical columns.
    # Only a_popularity is aggregated: averaging every column is wasteful and
    # raises on non-numeric columns (e.g. lyric) in pandas >= 2.0.
    for col, suffix in (
        ("language", "Language"),
        ("key", "Key"),
        ("time_signature", "TimeSignature"),
        ("mode", "Mode"),
    ):
        df["a_popularityMeanBy" + suffix] = df.groupby(col)["a_popularity"].transform("mean")

    return df

In [None]:
# Run the same preprocessing independently on each split.
trainDf, valDf, testDf = map(preprocess, (trainDf, valDf, testDf))

# **ESTANDARIZACIÓN Y ENCODING**

In [None]:
def normalized(column):
    """Standardise ``column`` to zero mean and unit standard deviation.

    A column whose standard deviation is exactly 0 is returned as ``0 * column``
    to avoid dividing by zero.
    """
    colStd = column.std()
    if colStd == 0:
        return 0 * column
    centered = column - column.mean()
    return centered / colStd

In [None]:
def normalize_df(df, featureList):
    """Replace each column named in ``featureList`` with its ``normalized`` version.

    The frame is modified in place and also returned.
    """
    for feature in featureList:
        standardized = normalized(df[feature])
        df[feature] = standardized
    return df

In [None]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode(df, catCols):
    """Append one-hot indicator columns for ``catCols`` to ``df``.

    An encoder is fitted on ``df[catCols]`` itself and one column is added per
    (column, category) pair, named by ``get_feature_names_out``. The frame is
    modified in place and also returned; the original categorical columns are kept.

    Version-independent calls are used on purpose: ``get_feature_names`` and
    the ``sparse=`` constructor argument were removed from newer scikit-learn
    releases, so the default sparse result is densified explicitly instead.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded = encoder.fit_transform(df[catCols])
    encFts = list(encoder.get_feature_names_out(catCols))
    df[encFts] = encoded.toarray()
    return df

In [None]:
# Categorical columns to expand; each split is encoded with its own encoder.
catCols = ["language","time_signature","key"]
trainDf, valDf, testDf = (
    one_hot_encode(frame, catCols) for frame in (trainDf, valDf, testDf)
)



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the 50 most frequent non-stopword terms of the training lyrics.
# stop_words is passed as a list: newer scikit-learn versions validate the parameter
# type and reject a set. Sorting keeps the order deterministic.
countIDF = CountVectorizer(lowercase=True, stop_words=sorted(stopwordsSpEn), max_features=50)
countIDF.fit(trainDf.lyric)
ftNames = ["word"+w.capitalize() for w in countIDF.get_feature_names_out()]

# The vocabulary is learned on train only and then applied to every split.
for df in [trainDf, valDf, testDf]:
    wordMatrix = countIDF.transform(df.lyric)
    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
    df[ftNames] = pd.DataFrame(wordMatrix.toarray(), columns=ftNames, index=df.index)

In [None]:
# Model inputs: every training column except the target, the raw text and the
# categorical columns that were one-hot encoded.
featureList = [
    col for col in trainDf.columns
    if col not in ("genre", "lyric", "language", "time_signature", "key")
]

In [None]:
# Some categories in trainDf do not appear in valDf or testDf. Their one-hot
# columns were therefore never created for those splits, so add them as all-zero columns.
for f in featureList:
    for frame in (valDf, testDf):
        if f not in frame:
            frame[f] = 0

In [None]:
# Standardise the feature columns of each split using that split's own mean and std.
trainDf, valDf, testDf = (
    normalize_df(frame, featureList) for frame in (trainDf, valDf, testDf)
)

In [None]:
# Keep only the rows whose genre is listed in genreList (the genres seen in training).
valDf = valDf.query("genre in @genreList")
testDf = testDf.query("genre in @genreList")

In [None]:
# Feature matrices and string-typed genre targets for each split.
X_train, X_val, X_test = (frame[featureList] for frame in (trainDf, valDf, testDf))
y_train, y_val, y_test = (frame.genre.astype(str) for frame in (trainDf, valDf, testDf))

# **MODELOS DE ML**

In [None]:
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import top_k_accuracy_score
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Gradient-boosted trees with fixed hyperparameters, trained on the string genre labels.
xgbModel = XGBClassifier(
    eta=0.025,
    eval_metric='mlogloss',
    max_depth=9,
    min_child_weight=7,
    objective='multi:softprob',
    seed=10,
    subsample=0.825,
)
xgbModel.fit(X_train, y_train)

XGBClassifier(eta=0.025, eval_metric='mlogloss', max_depth=9,
              min_child_weight=7, objective='multi:softprob', seed=10,
              subsample=0.825)

In [None]:
# Index the genres numerically for KNN. The indices follow the SORTED label order
# rather than genreList's order of appearance: predict_proba columns follow the
# estimator's classes_, and the ensemble below adds the KNN and XGBoost probability
# matrices column by column using xgbModel.classes_ as labels. Sorted indices keep
# column i of both matrices on the same genre (classes_ is expected to be the sorted
# unique training labels -- verify against xgbModel.classes_ if library versions change).
classToNum = { g: i for i, g in enumerate(sorted(y_train.unique())) }
y_train_num = y_train.map(classToNum)
knnModel = KNeighborsClassifier(leaf_size=3, metric='manhattan', n_neighbors=13, p=1,
                     weights='distance')
knnModel.fit(X_train, y_train_num)

KNeighborsClassifier(leaf_size=3, metric='manhattan', n_neighbors=13, p=1,
                     weights='distance')

# **ENSAMBLE DE MODELOS**

In [None]:
# Average the class probabilities of both models and score top-2 accuracy on validation.
xgbValidProbs, knnValidProbs = (
    model.predict_proba(X_val) for model in (xgbModel, knnModel)
)
ensembleValScore = top_k_accuracy_score(
    y_val,
    (xgbValidProbs+knnValidProbs)/2,
    k=2,
    labels=xgbModel.classes_,
)

In [None]:
# Report the validation score of the ensemble.
print(f"Score del ensamble (validación): {ensembleValScore}")

Score del ensamble (validación): 0.43429844097995546


In [None]:
# Average the class probabilities of both models and score top-2 accuracy on test.
xgbTestProbs, knnTestProbs = (
    model.predict_proba(X_test) for model in (xgbModel, knnModel)
)
ensembleTestScore = top_k_accuracy_score(
    y_test,
    (xgbTestProbs+knnTestProbs)/2,
    k=2,
    labels=xgbModel.classes_,
)

In [None]:
# Report the test score of the ensemble.
print(f"Score del ensamble (test): {ensembleTestScore}")

Score del ensamble (test): 0.3957345971563981
