In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from matplotlib.colors import ListedColormap
from sklearn import linear_model, metrics
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [2]:
from google.colab import drive
drive.mount('/content/drive')
dataset = pd.read_csv('/content/drive/My Drive/spotify_data_limpo.csv', encoding='latin1')

dataset.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_mins,popularity_category
0,1,Jason Mraz,93 Million Miles,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,4,3.60645,Alto
1,2,Joshua Hyslop,Do Not Let Me Go,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,4,2.649333,Alto
2,4,Andrew Belle,Sky's Still Blue,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,4,4.072,Alto
3,5,Chris Smither,What They Say,48,2012,acoustic,0.566,0.57,2,-6.42,1,0.0329,0.688,2e-06,0.0943,0.96,83.403,4,2.770667,Baixo
4,6,Matt Wertz,Walking in a Winter Wonderland,48,2012,acoustic,0.575,0.606,9,-8.197,1,0.03,0.0119,0.0,0.0675,0.364,121.083,4,2.53845,Baixo


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 648913 entries, 0 to 648912
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           648913 non-null  int64  
 1   artist_name          648911 non-null  object 
 2   track_name           648912 non-null  object 
 3   popularity           648913 non-null  int64  
 4   year                 648913 non-null  int64  
 5   genre                648913 non-null  object 
 6   danceability         648913 non-null  float64
 7   energy               648913 non-null  float64
 8   key                  648913 non-null  int64  
 9   loudness             648913 non-null  float64
 10  mode                 648913 non-null  int64  
 11  speechiness          648913 non-null  float64
 12  acousticness         648913 non-null  float64
 13  instrumentalness     648913 non-null  float64
 14  liveness             648913 non-null  float64
 15  valence          

In [4]:
# Definindo os valores de X e Y para o nosso modelo

X = dataset[['year', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'duration_mins']].select_dtypes(include='number')
Y = dataset['popularity_category']

Y.describe()


Unnamed: 0,popularity_category
count,648913
unique,2
top,Baixo
freq,611736


In [5]:
# Normalizando o dataset de treino e realizando a separação entre dados de teste e dados de treino.

scaler = StandardScaler()
model_X = scaler.fit(X)
X_scaled = model_X.transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, random_state=42, test_size=0.3, shuffle=True)

print(len(Y_train))

454239


# Testando diferentes algoritmos

In [6]:
# Define o máximo de CPUS (Jobs) a serem usados nos modelos abaixo.
import multiprocessing
n_cpus = multiprocessing.cpu_count()
print(n_cpus)

2


In [7]:
# Mantém um dicionário de modelos e acurácia entre eles
model_with_accuracy = {}

## Regressão logística

In [8]:
lg = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000, n_jobs=n_cpus)
lg.fit(X_train, Y_train)

prediction = lg.predict(X_test)

accuracy = metrics.accuracy_score(Y_test, prediction) * 100
print("Acurácia do algoritmo de Regressão Logística (%)", accuracy)
print("Confusion matrix", confusion_matrix(Y_test, prediction))
print("Classification report", classification_report(Y_test, prediction))

model_with_accuracy['lg'] = {'model': lg, 'accuracy': accuracy}

Acurácia do algoritmo de Regressão Logística (%) 94.33309019180784
Confusion matrix [[     9  11005]
 [    27 183633]]
Classification report               precision    recall  f1-score   support

        Alto       0.25      0.00      0.00     11014
       Baixo       0.94      1.00      0.97    183660

    accuracy                           0.94    194674
   macro avg       0.60      0.50      0.49    194674
weighted avg       0.90      0.94      0.92    194674



## KNN

In [9]:
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=n_cpus)
knn.fit(X_train, Y_train)
prediction = knn.predict(X_test)

accuracy = metrics.accuracy_score(Y_test, prediction) * 100
print("Acurácia do algoritmo de Regressão Logística (%)", accuracy)
print("Confusion matrix", confusion_matrix(Y_test, prediction))
print("Classification report", classification_report(Y_test, prediction))

model_with_accuracy['knn'] =  {'model': knn, 'accuracy': accuracy}

Acurácia do algoritmo de Regressão Logística (%) 93.12440284783793
Confusion matrix [[   888  10126]
 [  3259 180401]]
Classification report               precision    recall  f1-score   support

        Alto       0.21      0.08      0.12     11014
       Baixo       0.95      0.98      0.96    183660

    accuracy                           0.93    194674
   macro avg       0.58      0.53      0.54    194674
weighted avg       0.91      0.93      0.92    194674



## Random Forest

In [None]:
rfc = RandomForestClassifier(n_estimators=100, n_jobs=n_cpus)
rfc.fit(X_train, Y_train)

prediction = rfc.predict(X_test)

accuracy = metrics.accuracy_score(Y_test, prediction) * 100
print("Acurácia do algoritmo de Regressão Logística (%)", accuracy)
print("Confusion matrix", confusion_matrix(Y_test, prediction))
print("Classification report", classification_report(Y_test, prediction))

model_with_accuracy['rfc'] =  {'model': rfc, 'accuracy': accuracy}

## Árvore de Decisão

In [19]:
dtc = DecisionTreeClassifier(random_state=42, max_depth=1)
dtc.fit(X_train, Y_train)

prediction = dtc.predict(X_test)

accuracy = metrics.accuracy_score(Y_test, prediction) * 100
print("Acurácia do algoritmo de Regressão Logística (%)", accuracy)
print("Confusion matrix", confusion_matrix(Y_test, prediction))
print("Classification report", classification_report(Y_test, prediction))

model_with_accuracy['dtc'] =  {'model': dtc, 'accuracy': accuracy}

Acurácia do algoritmo de Regressão Logística (%) 94.34233641883354
Confusion matrix [[     0  11014]
 [     0 183660]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report               precision    recall  f1-score   support

        Alto       0.00      0.00      0.00     11014
       Baixo       0.94      1.00      0.97    183660

    accuracy                           0.94    194674
   macro avg       0.47      0.50      0.49    194674
weighted avg       0.89      0.94      0.92    194674



  _warn_prf(average, modifier, msg_start, len(result))


## Salva o modelo final em um arquivo .pkl

In [20]:
max_accuracy = 0
for m in model_with_accuracy:
  if model_with_accuracy[m]['accuracy'] > max_accuracy:

    max_accuracy = model_with_accuracy[m]['accuracy']
    # Salvar o modelo que melhor deu resultado para fazer o dump com pickle.
    model = model_with_accuracy[m]['model']

print(model, max_accuracy)

DecisionTreeClassifier(max_depth=1, random_state=42) 94.34233641883354


In [22]:
# Salvando o modelo treinado final.

# Here you can replace pickle with joblib or cloudpickle
from pickle import dump
with open("model.pkl", "wb") as f:
    dump(model, f, protocol=5)