In [69]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import mlrose_hiive as mlrose

In [10]:
DEFAULT_SEED = 42
df = pd.read_csv('spotify-2023.csv', encoding='latin-1')
# Convert values to numbers
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'], errors='coerce')
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'], errors='coerce')
df['target_chart_inclusion'] = (df['in_spotify_charts'] > 0).astype(int)
selected_features = ['danceability_%', 'energy_%', 'acousticness_%', 'valence_%','bpm', 'streams', 'artist_count', 'released_year']
processed_data = df[selected_features + ['target_chart_inclusion']].dropna()
# Split data
X = processed_data[selected_features]
y = processed_data['target_chart_inclusion']

In [14]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    print(f"  Training dataset index: {train_index}")
    print(f"  Test dataset index: {test_index}")

Fold 0:
  Training dataset index: [   0    1    2 ... 4515 4516 4519]
  Test dataset index: [   8   14   17   19   23   26   29   32   33   43   45   51   61   69
   70   73   80   84   93   95   96  109  113  120  122  132  134  139
  144  149  150  151  152  157  166  168  170  175  177  179  180  184
  188  191  192  196  199  203  205  211  214  220  227  229  238  239
  240  252  254  270  274  279  287  289  290  291  296  297  298  305
  308  309  314  315  318  322  330  332  350  351  356  360  366  367
  371  376  387  393  402  408  410  414  415  416  426  429  432  433
  438  443  445  450  452  457  461  463  465  468  471  478  486  490
  494  497  505  506  511  518  530  534  538  544  551  555  561  564
  568  584  589  594  596  598  599  602  621  625  626  642  643  655
  668  670  676  677  680  683  691  693  705  718  720  721  731  733
  734  744  746  751  755  759  764  776  783  787  789  790  794  798
  800  802  805  807  810  811  812  829  833  838  842 

In [22]:
X_treino = X.iloc[train_index]
X_teste = X.iloc[test_index]
y_treino = y.iloc[train_index]
y_teste = y.iloc[test_index]

In [73]:
import mlrose_hiive as mlrose
import numpy as np
import matplotlib.pyplot as plt
import time
import itertools

DEFAULT_SEED = 42
df = pd.read_csv('spotify-2023.csv', encoding='latin-1')
# Convert values to numbers
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'], errors='coerce')
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'], errors='coerce')
df['target_chart_inclusion'] = (df['in_spotify_charts'] > 0).astype(int)
selected_features = ['danceability_%', 'energy_%', 'acousticness_%', 'valence_%','bpm', 'streams', 'artist_count', 'released_year']
processed_data = df[selected_features + ['target_chart_inclusion']].dropna()
# Split data
X = processed_data[selected_features]
y = processed_data['target_chart_inclusion']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=DEFAULT_SEED, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) 

start = time.process_time()
lista_precision = []
# Definir a função de fitness (objetivo a minimizar)
def fitness_function(weights):
    mlp = MLPClassifier(hidden_layer_sizes=(64,), max_iter=1, warm_start=True)

    mlp.fit(X_train_scaled, y_train)

    lista = np.random.randn(*mlp.coefs_[0].shape)
    pesos = weights[0:512]#np.random.rand(512).tolist()  # Converte para lista padrão do Python
    a = 0
    for i in range(0,len(lista)):
        for j in range(0,len(lista[i])):
            lista[i][j] = pesos[a]
            a = a + 1
    lista_saida = np.random.randn(*mlp.coefs_[1].shape)
    pesos_saida = weights[512:576]#np.random.rand(64).tolist()  # Converte para lista padrão do Python
    a = 0
    for i in range(0,len(lista_saida)):
        lista_saida[j] = pesos_saida[a]
        a = a + 1    
    test_accuracy = mlp.score(X_test_scaled, y_test)
    
    # Alterar os pesos manualmente
    mlp.coefs_[0] = lista#np.asarray([weights[0:64]], dtype=np.float32)  # Modificar pesos
    mlp.coefs_[1] = lista_saida#np.asarray([weights[64:128]], dtype=np.float32)  # Modificar pesos da saída

    #test_accuracy = mlp.score(X_test_scaled, y_test)
    y_pred = mlp.predict(X_test_scaled)
    precision = precision_score(y_test, y_pred) 
    return precision


# Criar problema contínuo de otimização
problem = mlrose.ContinuousOpt(length=576,
                               fitness_fn=mlrose.CustomFitness(fitness_function), 
                               maximize=True, 
                               min_val=-10,max_val=10)  # Restrições dos valores

max_iters_values = [50, 100, 200]   # Número máximo de iterações
restarts_values = [0, 5, 10, 20]       # Número de reinicializações
max_attempts_values = [10, 50, 100]    # Número de tentativas sem melhoria antes de parar

# Criar combinações de parâmetros
param_grid = list(itertools.product(max_iters_values, restarts_values, max_attempts_values))

# Variáveis para armazenar a melhor configuração
best_fitness = -np.inf
best_params = None
results = []

# Rodar Grid Search
for max_iters, restarts, max_attempts in param_grid:
    best_state, best_fitness_hc, _ = mlrose.random_hill_climb(problem, 
                                                              max_iters=max_iters, 
                                                              restarts=restarts, 
                                                              max_attempts=max_attempts, 
                                                              random_state=42)
    results.append((max_iters, restarts, max_attempts, best_fitness_hc))

    # Atualizar melhor configuração
    if best_fitness_hc > best_fitness:
        best_fitness = best_fitness_hc
        best_params = (max_iters, restarts, max_attempts)    
display("Melhores parâmetros para Hill Climbing são"+str(best_params))

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Indo...'

'Melhores parâmetros para Hill Climbing são(200, 5, 50)'

In [74]:
#Hill Climbing
#GridSearch usando a base normal
#Calcula tudo pegando esses parâmetros encontrados e usa o Kfold

import mlrose_hiive as mlrose
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import KFold

lista_convergências_hc = []
lista_fitness_hc = []
lista_tempos_hc = []


DEFAULT_SEED = 42
df = pd.read_csv('spotify-2023.csv', encoding='latin-1')
# Convert values to numbers
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'], errors='coerce')
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'], errors='coerce')
df['target_chart_inclusion'] = (df['in_spotify_charts'] > 0).astype(int)
selected_features = ['danceability_%', 'energy_%', 'acousticness_%', 'valence_%','bpm', 'streams', 'artist_count', 'released_year']
processed_data = df[selected_features + ['target_chart_inclusion']].dropna()
# Split data
X = processed_data[selected_features]
y = processed_data['target_chart_inclusion']

kf = KFold(n_splits=10, shuffle=True, random_state=42)
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    start = time.process_time()

    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]    
    start = time.process_time()

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)  
    
    # Definir a função de fitness (objetivo a minimizar)
    def fitness_function(weights):
        mlp = MLPClassifier(hidden_layer_sizes=(64,), max_iter=1, warm_start=True)
        
        mlp.fit(X_train_scaled, y_train)
    
        lista = np.random.randn(*mlp.coefs_[0].shape)
        pesos = weights[0:512]#np.random.rand(512).tolist()  # Converte para lista padrão do Python
        a = 0
        for i in range(0,len(lista)):
            for j in range(0,len(lista[i])):
                lista[i][j] = pesos[a]
                a = a + 1
        lista_saida = np.random.randn(*mlp.coefs_[1].shape)
        pesos_saida = weights[512:576]#np.random.rand(64).tolist()  # Converte para lista padrão do Python
        a = 0
        for i in range(0,len(lista_saida)):
            lista_saida[j] = pesos_saida[a]
            a = a + 1    
        test_accuracy = mlp.score(X_test_scaled, y_test)
        
        # Alterar os pesos manualmente
        mlp.coefs_[0] = lista
        mlp.coefs_[1] = lista_saida
    
        test_accuracy = mlp.score(X_test_scaled, y_test)
        y_pred = mlp.predict(X_test_scaled)
        precision = precision_score(y_test, y_pred) 
        return precision
        
#        return test_accuracy
    
    
    # Criar problema contínuo de otimização
    problem = mlrose.ContinuousOpt(length=576,
                                   fitness_fn=mlrose.CustomFitness(fitness_function), 
                                   maximize=True, 
                                   min_val=0,max_val=1)  # Restrições dos valores
    
    # Definir os parâmetros do Hill Climbing
    np.random.seed(42)
    initial_state = np.random.uniform(-1, 1, size=576)
    
    # Rodar o algoritmo Hill Climbing
    best_state, best_fitness, fitness_curve  = mlrose.random_hill_climb(problem, 
                                                           max_attempts=50, 
                                                           max_iters=200, 
                                                           restarts=5,  # Adiciona reinicializações para evitar mínimos locais
                                                           init_state=initial_state, curve=True,
                                                           random_state=42)
    end = time.process_time()
    tempo = end - start  
    lista_tempos_hc.append(tempo)
    fitness_curve_list = []
    fitness_curve_list_2 = []
    z = 1
    for i in fitness_curve:
        fitness_curve_list.append(i[0])
        fitness_curve_list_2.append(z)
        z = z + 1  
    # Exibir os resultados
    #print(f"Melhor solução encontrada: {best_state}")
    lista_fitness_hc.append(best_fitness)
    lista_convergências_hc.append(fitness_curve)

Fold 0:
Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:
Fold 6:
Fold 7:
Fold 8:
Fold 9:


In [75]:
DEFAULT_SEED = 42
df = pd.read_csv('spotify-2023.csv', encoding='latin-1')
# Convert values to numbers
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'], errors='coerce')
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'], errors='coerce')
df['target_chart_inclusion'] = (df['in_spotify_charts'] > 0).astype(int)
selected_features = ['danceability_%', 'energy_%', 'acousticness_%', 'valence_%','bpm', 'streams', 'artist_count', 'released_year']
processed_data = df[selected_features + ['target_chart_inclusion']].dropna()
# Split data
X = processed_data[selected_features]
y = processed_data['target_chart_inclusion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=DEFAULT_SEED, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) 



def fitness_function(weights):

    mlp = MLPClassifier(hidden_layer_sizes=(64,), max_iter=1, warm_start=True)

    mlp.fit(X_train_scaled, y_train)

    lista = np.random.randn(*mlp.coefs_[0].shape)
    pesos = weights[0:512]#np.random.rand(512).tolist()  # Converte para lista padrão do Python
    a = 0
    for i in range(0,len(lista)):
        for j in range(0,len(lista[i])):
            lista[i][j] = pesos[a]
            a = a + 1
    lista_saida = np.random.randn(*mlp.coefs_[1].shape)
    pesos_saida = weights[512:576]#np.random.rand(64).tolist()  # Converte para lista padrão do Python
    a = 0
    for i in range(0,len(lista_saida)):
        lista_saida[j] = pesos_saida[a]
        a = a + 1    
    test_accuracy = mlp.score(X_test_scaled, y_test)
    
    # Alterar os pesos manualmente
    mlp.coefs_[0] = lista#np.asarray([weights[0:64]], dtype=np.float32)  # Modificar pesos
    mlp.coefs_[1] = lista_saida#np.asarray([weights[64:128]], dtype=np.float32)  # Modificar pesos da saída

#    test_accuracy = mlp.score(X_test_scaled, y_test)
    y_pred = mlp.predict(X_test_scaled)
    precision = precision_score(y_test, y_pred) 
    return precision    
#    return test_accuracy

# Criar problema de otimização (13 pesos no total)
problem = mlrose.ContinuousOpt(length=576, fitness_fn=mlrose.CustomFitness(fitness_function), maximize=True)

# Configurar Simulated Annealing
schedule = mlrose.GeomDecay(init_temp=10, decay=0.95, min_temp=0.01)

init_temp_values = [10, 50, 100]  # Temperatura inicial
decay_values = [0.90, 0.95]    # Taxa de resfriamento
min_temp_values = [0.01, 0.001]      # Temperatura mínima
max_iters_values = [50, 100] # Iterações máximas
max_attempts_values = [10, 50, 100]  # Tentativas sem melhoria antes de parar

# Criar combinações de parâmetros
param_grid = list(itertools.product(init_temp_values, decay_values, min_temp_values, max_iters_values, max_attempts_values))

# Variáveis para armazenar a melhor configuração
best_fitness = -np.inf
best_params = None
results = []

# Rodar Grid Search
for init_temp, decay, min_temp, max_iters, max_attempts in param_grid:
    schedule = mlrose.GeomDecay(init_temp=init_temp, decay=decay, min_temp=min_temp)
    
    best_state, best_fitness_sa, _ = mlrose.simulated_annealing(problem, 
                                                                schedule=schedule, 
                                                                max_iters=max_iters, 
                                                                max_attempts=max_attempts, 
                                                                random_state=42)
    
    results.append((init_temp, decay, min_temp, max_iters, max_attempts, best_fitness_sa))

    # Atualizar melhor configuração
    if best_fitness_sa > best_fitness:
        best_fitness = best_fitness_sa
        best_params = (init_temp, decay, min_temp, max_iters, max_attempts)
display(best_params)

(10, 0.9, 0.01, 50, 10)

In [77]:
#Simulated Annealing
#GridSearch usando a base normal
#Calcula tudo pegando esses parâmetros encontrados e usa o Kfold
import mlrose_hiive as mlrose
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import KFold

lista_convergências_sa = []
lista_fitness_sa = []
lista_tempos_sa = []


DEFAULT_SEED = 42
df = pd.read_csv('spotify-2023.csv', encoding='latin-1')
# Convert values to numbers
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'], errors='coerce')
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'], errors='coerce')
df['target_chart_inclusion'] = (df['in_spotify_charts'] > 0).astype(int)
selected_features = ['danceability_%', 'energy_%', 'acousticness_%', 'valence_%','bpm', 'streams', 'artist_count', 'released_year']
processed_data = df[selected_features + ['target_chart_inclusion']].dropna()
# Split data
X = processed_data[selected_features]
y = processed_data['target_chart_inclusion']

kf = KFold(n_splits=10, shuffle=True, random_state=42)
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    start = time.process_time()

    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]    
    start = time.process_time()

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)  

    # Definir função de fitness (erro da rede neural)
    def fitness_function(weights):
    
        mlp = MLPClassifier(hidden_layer_sizes=(64,), max_iter=1, warm_start=True)
    
        mlp.fit(X_train_scaled, y_train)
    
        lista = np.random.randn(*mlp.coefs_[0].shape)
        pesos = weights[0:512]#np.random.rand(512).tolist()  # Converte para lista padrão do Python
        a = 0
        for i in range(0,len(lista)):
            for j in range(0,len(lista[i])):
                lista[i][j] = pesos[a]
                a = a + 1
        lista_saida = np.random.randn(*mlp.coefs_[1].shape)
        pesos_saida = weights[512:576]#np.random.rand(64).tolist()  # Converte para lista padrão do Python
        a = 0
        for i in range(0,len(lista_saida)):
            lista_saida[j] = pesos_saida[a]
            a = a + 1    
        test_accuracy = mlp.score(X_test_scaled, y_test)
        
        # Alterar os pesos manualmente
        mlp.coefs_[0] = lista#np.asarray([weights[0:64]], dtype=np.float32)  # Modificar pesos
        mlp.coefs_[1] = lista_saida#np.asarray([weights[64:128]], dtype=np.float32)  # Modificar pesos da saída
    
        test_accuracy = mlp.score(X_test_scaled, y_test)
        y_pred = mlp.predict(X_test_scaled)
        precision = precision_score(y_test, y_pred) 
        return precision        
#        return test_accuracy
    
    # Criar problema de otimização (13 pesos no total)
    problem = mlrose.ContinuousOpt(length=576, fitness_fn=mlrose.CustomFitness(fitness_function), maximize=True)
    
    # Configurar Simulated Annealing
    schedule = mlrose.GeomDecay(init_temp=best_params[0], decay=best_params[1], min_temp=best_params[2])
    
    # Rodar Simulated Annealing para encontrar os melhores pesos
    best_weights, best_fitness, fitness_curve = mlrose.simulated_annealing(problem, schedule=schedule, max_iters=best_params[3], max_attempts=best_params[4],random_state=42, curve = True)
    
    end = time.process_time()
    tempo = end - start  # will print the time spent on this process in seconds
    # Definir a função de fitness personalizada (OneMax)
    lista_tempos_sa.append(tempo)
    #print("Melhores pesos encontrados:", best_weights)
    lista_fitness_sa.append(best_fitness) 
    lista_convergências_sa.append(fitness_curve)
    fitness_curve_list = []
    fitness_curve_list_2 = []
    z = 1
    for i in fitness_curve:
        fitness_curve_list.append(i[0])
        fitness_curve_list_2.append(z)
        z = z + 1    

Fold 0:
Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:
Fold 6:
Fold 7:
Fold 8:
Fold 9:


In [93]:
#Algoritmo Genético
#GridSearch usando a base normal, mas sem ser o GridSerch convencional. Testa algumas combinações apenas.
#Calcula tudo pegando esses parâmetros encontrados e usa o Kfold
from joblib import Parallel, delayed
DEFAULT_SEED = 42
df = pd.read_csv('spotify-2023.csv', encoding='latin-1')
# Convert values to numbers
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'], errors='coerce')
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'], errors='coerce')
df['target_chart_inclusion'] = (df['in_spotify_charts'] > 0).astype(int)
selected_features = ['danceability_%', 'energy_%', 'acousticness_%', 'valence_%','bpm', 'streams', 'artist_count', 'released_year']
processed_data = df[selected_features + ['target_chart_inclusion']].dropna()
# Split data
X = processed_data[selected_features]
y = processed_data['target_chart_inclusion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=DEFAULT_SEED, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) 


def fitness_function(weights):

    mlp = MLPClassifier(hidden_layer_sizes=(64,), max_iter=1, warm_start=True)

    mlp.fit(X_train_scaled, y_train)

    lista = np.random.randn(*mlp.coefs_[0].shape)
    pesos = weights[0:512]#np.random.rand(512).tolist()  # Converte para lista padrão do Python
    a = 0
    for i in range(0,len(lista)):
        for j in range(0,len(lista[i])):
            lista[i][j] = pesos[a]
            a = a + 1
    lista_saida = np.random.randn(*mlp.coefs_[1].shape)
    pesos_saida = weights[512:576]#np.random.rand(64).tolist()  # Converte para lista padrão do Python
    a = 0
    for i in range(0,len(lista_saida)):
        lista_saida[j] = pesos_saida[a]
        a = a + 1    
    test_accuracy = mlp.score(X_test_scaled, y_test)
    
    # Alterar os pesos manualmente
    mlp.coefs_[0] = lista#np.asarray([weights[0:64]], dtype=np.float32)  # Modificar pesos
    mlp.coefs_[1] = lista_saida#np.asarray([weights[64:128]], dtype=np.float32)  # Modificar pesos da saída

    #test_accuracy = mlp.score(X_test_scaled, y_test)
    y_pred = mlp.predict(X_test_scaled)
    precision = precision_score(y_test, y_pred) 
    return precision        
    
    return test_accuracy

fitness = mlrose.CustomFitness(fitness_function)
problem = mlrose.DiscreteOpt(length=576, fitness_fn=fitness, maximize=True, max_val=2)

pop_size_values = [10, 50]        # Tamanho da população
mutation_prob_values = [0.05, 0.1]  # Probabilidade de mutação
max_iters_values = [50, 100]     # Número máximo de iterações
max_attempts_values = [10, 50]      # Número de gerações sem melhora antes de parar

# Criar combinações de parâmetros
param_grid = list(itertools.product(pop_size_values, mutation_prob_values, max_iters_values, max_attempts_values))

# Variáveis para armazenar a melhor configuração
best_fitness = -np.inf
best_params = None
results = []

# Função para rodar uma única configuração
def evaluate_params(params):
    pop_size, mutation_prob, max_iters, max_attempts = params
    best_state, best_fitness_ga, _ = mlrose.genetic_alg(problem, 
                                                        pop_size=pop_size, 
                                                        mutation_prob=mutation_prob, 
                                                        max_iters=max_iters, 
                                                        max_attempts=max_attempts, 
                                                        random_state=42)
    return (pop_size, mutation_prob, max_iters, max_attempts, best_fitness_ga)

# Rodar Grid Search em paralelo
results_parallel = Parallel(n_jobs=-1)(delayed(evaluate_params)(params) for params in param_grid)

# Encontrar melhor resultado
best_result = max(results_parallel, key=lambda x: x[4])  # Índice 4 é o fitness
best_params = best_result[:4]

In [97]:
import mlrose_hiive as mlrose
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import KFold

lista_convergências_ga = []
lista_fitness_ga = []
lista_tempos_ga = []


DEFAULT_SEED = 42
df = pd.read_csv('spotify-2023.csv', encoding='latin-1')
# Convert values to numbers
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'], errors='coerce')
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'], errors='coerce')
df['target_chart_inclusion'] = (df['in_spotify_charts'] > 0).astype(int)
selected_features = ['danceability_%', 'energy_%', 'acousticness_%', 'valence_%','bpm', 'streams', 'artist_count', 'released_year']
processed_data = df[selected_features + ['target_chart_inclusion']].dropna()
# Split data
X = processed_data[selected_features]
y = processed_data['target_chart_inclusion']

kf = KFold(n_splits=10, shuffle=True, random_state=42)
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    start = time.process_time()

    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]    
    start = time.process_time()

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)  

    def fitness_function(weights):
    
        mlp = MLPClassifier(hidden_layer_sizes=(64,), max_iter=1, warm_start=True)
    
        mlp.fit(X_train_scaled, y_train)
    
        lista = np.random.randn(*mlp.coefs_[0].shape)
        pesos = weights[0:512]#np.random.rand(512).tolist()  # Converte para lista padrão do Python
        a = 0
        for i in range(0,len(lista)):
            for j in range(0,len(lista[i])):
                lista[i][j] = pesos[a]
                a = a + 1
        lista_saida = np.random.randn(*mlp.coefs_[1].shape)
        pesos_saida = weights[512:576]#np.random.rand(64).tolist()  # Converte para lista padrão do Python
        a = 0
        for i in range(0,len(lista_saida)):
            lista_saida[j] = pesos_saida[a]
            a = a + 1    
        test_accuracy = mlp.score(X_test_scaled, y_test)
        
        # Alterar os pesos manualmente
        mlp.coefs_[0] = lista#np.asarray([weights[0:64]], dtype=np.float32)  # Modificar pesos
        mlp.coefs_[1] = lista_saida#np.asarray([weights[64:128]], dtype=np.float32)  # Modificar pesos da saída
    
        #y_pred = mlp.predict(X_test_scaled)
        #precision = precision_score(y_test, y_pred) 
        #return precision        
        
        return test_accuracy
    
    # Criar problema contínuo de otimização
    problem = mlrose.ContinuousOpt(length=576, 
                                   fitness_fn=mlrose.CustomFitness(fitness_function), 
                                   maximize=True,  # Queremos minimizar
                                   min_val=-1,max_val=1)  # Restrições dos valores
    
    # Configurar os parâmetros do Algoritmo Genético
    population_size = best_params[0]  # Número de indivíduos na população
    mutation_prob = best_params[1]    # Probabilidade de mutação
    max_attempts = best_params[3]      # Tentativas sem melhoria antes de parar
    max_iters = best_params[2]    # Número máximo de iterações
    
    # Rodar o Algoritmo Genético
    best_state, best_fitness, fitness_curve = mlrose.genetic_alg(problem, 
                                                                 pop_size=population_size, 
                                                                 mutation_prob=mutation_prob, 
                                                                 max_attempts=max_attempts, 
                                                                 max_iters=max_iters, 
                                                                 curve=True, 
                                                                 random_state=42)
    
    end = time.process_time()
    tempo = end - start  # will print the time spent on this process in seconds
    lista_tempos_ga.append(tempo)
    lista_fitness_ga.append(best_fitness)
    lista_convergências_ga.append(fitness_curve)

Fold 0:
Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:
Fold 6:
Fold 7:
Fold 8:
Fold 9:


In [108]:
for i in lista_convergências_sa:
    lista_temp = i
    lista = []
    for j in i:
        lista.append(j[0])
    maximo = max(lista)
    p = 0
    for j in lista:
        p = p + 1
        if j == maximo:
            display(p)
            break

27

16

11

26

11

29

35

45

25

44

In [53]:
import pandas as pd
import math
def teste_T(l1,l2):
    m1 = sum(l1)/len(l1)
    m2 = sum(l2)/len(l2)
    tam_1 = len(l1)
    tam_2 = len(l2)
    df = pd.DataFrame([l1,l2]).T
    df.columns = ['l1','l2']
    df['dif'] = df['l1']-df['l2']
    m_dif = df['dif'].mean()
    dp_dif = df['dif'].std()
    s = dp_dif/math.sqrt(tam_1)
    t = m_dif/s
    g_liberdade = tam_1-1
    return t,g_liberdade

In [101]:
a = lista_fitness_sa
b = lista_fitness_ga
display(teste_T(a,b))
from scipy import stats
display(stats.ttest_rel(a,b)[0])

(-0.3541809773996573, 9)

-0.3541809773996572

In [104]:
for i in lista_convergências_ga:
    display(round(i,4))

TypeError: type numpy.ndarray doesn't define __round__ method

In [105]:
for i in lista_convergências_ga:
    display(i,4)

array([[5.20833333e-01, 2.20000000e+01],
       [5.20833333e-01, 3.30000000e+01],
       [5.20833333e-01, 4.40000000e+01],
       [5.20833333e-01, 5.50000000e+01],
       [5.20833333e-01, 6.60000000e+01],
       [5.41666667e-01, 7.80000000e+01],
       [4.47916667e-01, 9.00000000e+01],
       [4.58333333e-01, 1.02000000e+02],
       [5.31250000e-01, 1.14000000e+02],
       [5.52083333e-01, 1.26000000e+02],
       [5.52083333e-01, 1.37000000e+02],
       [5.52083333e-01, 1.48000000e+02],
       [5.52083333e-01, 1.59000000e+02],
       [5.62500000e-01, 1.71000000e+02],
       [5.00000000e-01, 1.83000000e+02],
       [5.31250000e-01, 1.95000000e+02],
       [5.31250000e-01, 2.06000000e+02],
       [5.31250000e-01, 2.17000000e+02],
       [5.31250000e-01, 2.28000000e+02],
       [4.68750000e-01, 2.40000000e+02],
       [4.68750000e-01, 2.51000000e+02],
       [4.58333333e-01, 2.63000000e+02],
       [5.20833333e-01, 2.75000000e+02],
       [5.20833333e-01, 2.86000000e+02],
       [5.520833

4

array([[3.85416667e-01, 2.30000000e+01],
       [4.68750000e-01, 3.50000000e+01],
       [5.20833333e-01, 4.70000000e+01],
       [6.14583333e-01, 5.90000000e+01],
       [6.14583333e-01, 7.00000000e+01],
       [6.14583333e-01, 8.10000000e+01],
       [6.14583333e-01, 9.20000000e+01],
       [6.14583333e-01, 1.03000000e+02],
       [6.14583333e-01, 1.14000000e+02],
       [6.14583333e-01, 1.25000000e+02],
       [6.14583333e-01, 1.36000000e+02],
       [6.14583333e-01, 1.47000000e+02],
       [6.14583333e-01, 1.58000000e+02],
       [6.14583333e-01, 1.69000000e+02],
       [6.14583333e-01, 1.80000000e+02],
       [4.89583333e-01, 1.92000000e+02],
       [3.85416667e-01, 2.04000000e+02],
       [3.64583333e-01, 2.16000000e+02],
       [5.20833333e-01, 2.28000000e+02],
       [4.16666667e-01, 2.40000000e+02],
       [4.16666667e-01, 2.52000000e+02],
       [5.93750000e-01, 2.64000000e+02],
       [5.93750000e-01, 2.75000000e+02],
       [5.93750000e-01, 2.86000000e+02],
       [5.937500

4

array([[5.47368421e-01, 2.30000000e+01],
       [5.47368421e-01, 3.40000000e+01],
       [5.47368421e-01, 4.50000000e+01],
       [5.15789474e-01, 5.70000000e+01],
       [5.78947368e-01, 6.90000000e+01],
       [4.21052632e-01, 8.10000000e+01],
       [5.89473684e-01, 9.30000000e+01],
       [5.89473684e-01, 1.04000000e+02],
       [5.89473684e-01, 1.15000000e+02],
       [5.89473684e-01, 1.26000000e+02],
       [5.89473684e-01, 1.37000000e+02],
       [6.00000000e-01, 1.49000000e+02],
       [6.00000000e-01, 1.60000000e+02],
       [6.00000000e-01, 1.71000000e+02],
       [3.68421053e-01, 1.83000000e+02],
       [5.05263158e-01, 1.95000000e+02],
       [5.05263158e-01, 2.06000000e+02],
       [5.68421053e-01, 2.18000000e+02],
       [5.68421053e-01, 2.29000000e+02],
       [5.68421053e-01, 2.40000000e+02],
       [6.00000000e-01, 2.52000000e+02],
       [6.00000000e-01, 2.63000000e+02],
       [6.00000000e-01, 2.74000000e+02],
       [6.21052632e-01, 2.86000000e+02],
       [6.421052

4

array([[4.94736842e-01, 2.20000000e+01],
       [4.94736842e-01, 3.40000000e+01],
       [4.52631579e-01, 4.60000000e+01],
       [4.52631579e-01, 5.70000000e+01],
       [7.05263158e-01, 6.90000000e+01],
       [7.05263158e-01, 8.00000000e+01],
       [7.05263158e-01, 9.10000000e+01],
       [7.05263158e-01, 1.02000000e+02],
       [7.05263158e-01, 1.13000000e+02],
       [7.05263158e-01, 1.24000000e+02],
       [7.05263158e-01, 1.35000000e+02],
       [7.05263158e-01, 1.46000000e+02],
       [7.05263158e-01, 1.57000000e+02],
       [7.05263158e-01, 1.68000000e+02],
       [7.05263158e-01, 1.79000000e+02],
       [7.05263158e-01, 1.90000000e+02],
       [7.05263158e-01, 2.01000000e+02],
       [7.05263158e-01, 2.12000000e+02],
       [7.05263158e-01, 2.23000000e+02],
       [7.05263158e-01, 2.34000000e+02],
       [7.05263158e-01, 2.45000000e+02],
       [7.05263158e-01, 2.56000000e+02],
       [7.05263158e-01, 2.67000000e+02],
       [7.05263158e-01, 2.78000000e+02],
       [7.052631

4

array([[5.15789474e-01, 2.30000000e+01],
       [5.15789474e-01, 3.40000000e+01],
       [5.15789474e-01, 4.50000000e+01],
       [5.78947368e-01, 5.70000000e+01],
       [5.78947368e-01, 6.80000000e+01],
       [4.10526316e-01, 8.00000000e+01],
       [5.15789474e-01, 9.20000000e+01],
       [5.15789474e-01, 1.03000000e+02],
       [3.89473684e-01, 1.15000000e+02],
       [6.21052632e-01, 1.27000000e+02],
       [6.21052632e-01, 1.38000000e+02],
       [6.21052632e-01, 1.49000000e+02],
       [6.21052632e-01, 1.60000000e+02],
       [6.21052632e-01, 1.71000000e+02],
       [6.10526316e-01, 1.83000000e+02],
       [6.10526316e-01, 1.94000000e+02],
       [6.10526316e-01, 2.05000000e+02],
       [6.10526316e-01, 2.16000000e+02],
       [6.10526316e-01, 2.27000000e+02],
       [6.10526316e-01, 2.38000000e+02],
       [6.10526316e-01, 2.49000000e+02],
       [6.10526316e-01, 2.60000000e+02],
       [6.10526316e-01, 2.71000000e+02],
       [6.10526316e-01, 2.82000000e+02],
       [4.736842

4

array([[5.78947368e-01, 2.20000000e+01],
       [5.15789474e-01, 3.40000000e+01],
       [5.15789474e-01, 4.50000000e+01],
       [4.73684211e-01, 5.70000000e+01],
       [5.05263158e-01, 6.90000000e+01],
       [5.78947368e-01, 8.10000000e+01],
       [5.78947368e-01, 9.20000000e+01],
       [5.78947368e-01, 1.03000000e+02],
       [5.78947368e-01, 1.14000000e+02],
       [5.36842105e-01, 1.26000000e+02],
       [5.36842105e-01, 1.37000000e+02],
       [4.73684211e-01, 1.49000000e+02],
       [4.73684211e-01, 1.60000000e+02],
       [5.78947368e-01, 1.72000000e+02],
       [5.78947368e-01, 1.83000000e+02],
       [5.78947368e-01, 1.94000000e+02],
       [5.78947368e-01, 2.05000000e+02],
       [5.78947368e-01, 2.16000000e+02],
       [4.84210526e-01, 2.28000000e+02],
       [4.73684211e-01, 2.40000000e+02],
       [5.05263158e-01, 2.52000000e+02],
       [5.36842105e-01, 2.64000000e+02],
       [5.36842105e-01, 2.75000000e+02],
       [4.94736842e-01, 2.87000000e+02],
       [4.631578

4

array([[5.36842105e-01, 2.20000000e+01],
       [5.36842105e-01, 3.30000000e+01],
       [5.36842105e-01, 4.40000000e+01],
       [6.10526316e-01, 5.60000000e+01],
       [6.10526316e-01, 6.70000000e+01],
       [6.10526316e-01, 7.80000000e+01],
       [6.10526316e-01, 8.90000000e+01],
       [6.10526316e-01, 1.00000000e+02],
       [6.10526316e-01, 1.11000000e+02],
       [6.10526316e-01, 1.22000000e+02],
       [6.10526316e-01, 1.33000000e+02],
       [6.10526316e-01, 1.44000000e+02],
       [6.10526316e-01, 1.55000000e+02],
       [6.10526316e-01, 1.66000000e+02],
       [6.10526316e-01, 1.77000000e+02],
       [6.10526316e-01, 1.88000000e+02],
       [6.10526316e-01, 1.99000000e+02],
       [6.10526316e-01, 2.10000000e+02],
       [6.10526316e-01, 2.21000000e+02],
       [6.10526316e-01, 2.32000000e+02],
       [6.10526316e-01, 2.43000000e+02],
       [6.10526316e-01, 2.54000000e+02],
       [6.10526316e-01, 2.65000000e+02],
       [6.10526316e-01, 2.76000000e+02],
       [6.105263

4

array([[4.52631579e-01, 2.30000000e+01],
       [4.52631579e-01, 3.40000000e+01],
       [5.36842105e-01, 4.60000000e+01],
       [5.36842105e-01, 5.70000000e+01],
       [4.42105263e-01, 6.90000000e+01],
       [5.57894737e-01, 8.10000000e+01],
       [5.57894737e-01, 9.20000000e+01],
       [5.57894737e-01, 1.03000000e+02],
       [5.57894737e-01, 1.14000000e+02],
       [5.57894737e-01, 1.25000000e+02],
       [5.57894737e-01, 1.36000000e+02],
       [5.57894737e-01, 1.47000000e+02],
       [4.73684211e-01, 1.59000000e+02],
       [6.10526316e-01, 1.71000000e+02],
       [6.10526316e-01, 1.82000000e+02],
       [6.10526316e-01, 1.93000000e+02],
       [6.10526316e-01, 2.04000000e+02],
       [6.10526316e-01, 2.15000000e+02],
       [6.10526316e-01, 2.26000000e+02],
       [6.10526316e-01, 2.37000000e+02],
       [6.10526316e-01, 2.48000000e+02],
       [6.10526316e-01, 2.59000000e+02],
       [6.10526316e-01, 2.70000000e+02],
       [6.10526316e-01, 2.81000000e+02],
       [6.105263

4

array([[4.84210526e-01, 2.30000000e+01],
       [4.84210526e-01, 3.40000000e+01],
       [4.84210526e-01, 4.50000000e+01],
       [5.78947368e-01, 5.70000000e+01],
       [5.78947368e-01, 6.80000000e+01],
       [4.00000000e-01, 8.00000000e+01],
       [4.84210526e-01, 9.20000000e+01],
       [4.84210526e-01, 1.03000000e+02],
       [4.52631579e-01, 1.15000000e+02],
       [6.00000000e-01, 1.27000000e+02],
       [6.00000000e-01, 1.38000000e+02],
       [6.00000000e-01, 1.49000000e+02],
       [6.00000000e-01, 1.60000000e+02],
       [6.00000000e-01, 1.71000000e+02],
       [6.00000000e-01, 1.82000000e+02],
       [6.00000000e-01, 1.93000000e+02],
       [6.00000000e-01, 2.04000000e+02],
       [6.00000000e-01, 2.15000000e+02],
       [6.00000000e-01, 2.26000000e+02],
       [6.00000000e-01, 2.37000000e+02],
       [6.00000000e-01, 2.48000000e+02],
       [6.00000000e-01, 2.59000000e+02],
       [6.00000000e-01, 2.70000000e+02],
       [5.05263158e-01, 2.82000000e+02],
       [4.631578

4

array([[4.73684211e-01, 2.20000000e+01],
       [5.15789474e-01, 3.40000000e+01],
       [4.42105263e-01, 4.60000000e+01],
       [4.42105263e-01, 5.70000000e+01],
       [4.84210526e-01, 6.90000000e+01],
       [4.84210526e-01, 8.00000000e+01],
       [4.73684211e-01, 9.20000000e+01],
       [4.73684211e-01, 1.03000000e+02],
       [5.05263158e-01, 1.15000000e+02],
       [4.63157895e-01, 1.27000000e+02],
       [4.63157895e-01, 1.38000000e+02],
       [5.26315789e-01, 1.50000000e+02],
       [5.57894737e-01, 1.62000000e+02],
       [5.26315789e-01, 1.74000000e+02],
       [5.68421053e-01, 1.86000000e+02],
       [5.78947368e-01, 1.98000000e+02],
       [5.78947368e-01, 2.09000000e+02],
       [5.78947368e-01, 2.20000000e+02],
       [5.78947368e-01, 2.31000000e+02],
       [5.78947368e-01, 2.42000000e+02],
       [5.78947368e-01, 2.53000000e+02],
       [5.78947368e-01, 2.64000000e+02],
       [5.78947368e-01, 2.75000000e+02],
       [5.78947368e-01, 2.86000000e+02],
       [5.789473

4

In [103]:
lista_fitness_hc

[0.75, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7058823529411765, 1.0, 1.0]

In [106]:
lista_convergências_ga[0]

array([[5.20833333e-01, 2.20000000e+01],
       [5.20833333e-01, 3.30000000e+01],
       [5.20833333e-01, 4.40000000e+01],
       [5.20833333e-01, 5.50000000e+01],
       [5.20833333e-01, 6.60000000e+01],
       [5.41666667e-01, 7.80000000e+01],
       [4.47916667e-01, 9.00000000e+01],
       [4.58333333e-01, 1.02000000e+02],
       [5.31250000e-01, 1.14000000e+02],
       [5.52083333e-01, 1.26000000e+02],
       [5.52083333e-01, 1.37000000e+02],
       [5.52083333e-01, 1.48000000e+02],
       [5.52083333e-01, 1.59000000e+02],
       [5.62500000e-01, 1.71000000e+02],
       [5.00000000e-01, 1.83000000e+02],
       [5.31250000e-01, 1.95000000e+02],
       [5.31250000e-01, 2.06000000e+02],
       [5.31250000e-01, 2.17000000e+02],
       [5.31250000e-01, 2.28000000e+02],
       [4.68750000e-01, 2.40000000e+02],
       [4.68750000e-01, 2.51000000e+02],
       [4.58333333e-01, 2.63000000e+02],
       [5.20833333e-01, 2.75000000e+02],
       [5.20833333e-01, 2.86000000e+02],
       [5.520833