In [2]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

from sklearn.decomposition import PCA

In [3]:
# Load the general dataset; the path is relative to the notebook's working directory.
data_general_path = '../../data/data_general.csv'
df_data_general = pd.read_csv(data_general_path)

In [4]:
# Keep only rows whose GAMES_PLAYED_TOTAL is positive. The explicit .copy()
# makes the result an independent frame, so the columns added to it in later
# cells are not written onto a slice of the loaded data (the pattern pandas
# flags with SettingWithCopyWarning).
has_played = df_data_general['GAMES_PLAYED_TOTAL'] > 0
df_data_general = df_data_general[has_played].copy()

In [5]:
def define_games_played(number):
    """Map a games-played total to an ordinal bucket label.

    Buckets (upper bounds inclusive):
        0: number < 50
        1: 50 <= number <= 100
        2: 100 < number <= 200
        3: 200 < number <= 500
        4: number > 500

    The thresholds are contiguous, so fractional values such as 100.5 or
    200.5 land in the neighbouring bucket instead of falling through to 4.
    Integer inputs receive the same label as with the previous bounds.

    Args:
        number: Numeric games-played total (int or float).

    Returns:
        int: Bucket label in the range 0-4. A value that compares false
        against every threshold (e.g. NaN) yields 4.
    """
    if number < 50:
        return 0
    # Each test below only needs an upper bound: reaching it means every
    # lower bucket has already been ruled out.
    if number <= 100:
        return 1
    if number <= 200:
        return 2
    if number <= 500:
        return 3
    return 4

In [6]:
# Summary statistics of the GAMES_PLAYED_TOTAL column.
games_played_total = df_data_general['GAMES_PLAYED_TOTAL']
games_played_total.describe()

count    203247.000000
mean        206.336000
std         339.742731
min           1.000000
25%          44.000000
50%          98.000000
75%         225.000000
max       11426.000000
Name: GAMES_PLAYED_TOTAL, dtype: float64

In [7]:
# Derive the ordinal target by bucketing every GAMES_PLAYED_TOTAL value.
games_played_total = df_data_general['GAMES_PLAYED_TOTAL']
df_data_general['games_played_label'] = games_played_total.apply(define_games_played)

In [8]:
# Spearman rank correlation between the candidate features, the raw games
# total and the derived bucket label.
corr_columns = [
    'Cluster',
    'INITIAL_AMOUNT',
    'AVG_BET',
    'GAMES_PLAYED_TOTAL',
    'games_played_label',
]
df_data_general[corr_columns].corr(method='spearman')

Unnamed: 0,Cluster,INITIAL_AMOUNT,AVG_BET,GAMES_PLAYED_TOTAL,games_played_label
Cluster,1.0,0.035309,0.106898,0.06573,0.063414
INITIAL_AMOUNT,0.035309,1.0,0.529688,0.16477,0.159056
AVG_BET,0.106898,0.529688,1.0,-0.124074,-0.128114
GAMES_PLAYED_TOTAL,0.06573,0.16477,-0.124074,1.0,0.974328
games_played_label,0.063414,0.159056,-0.128114,0.974328,1.0


In [9]:
# Standardise the three input columns, project them onto two principal
# components and store the projection as the PCA1 / PCA2 columns.
# NOTE(review): the scaler and the PCA are fitted on the full frame, before the
# train/test split; if PCA1/PCA2 are ever used as model inputs, fit them on the
# training rows only.
pca_source_columns = ['Cluster', 'INITIAL_AMOUNT', 'AVG_BET']
std_scaler = StandardScaler()
pca = PCA(n_components=2)
features = std_scaler.fit_transform(df_data_general[pca_source_columns])
components = pca.fit_transform(features)
for position, column_name in enumerate(['PCA1', 'PCA2']):
    df_data_general[column_name] = components[:, position]

In [32]:
# Model inputs and target. The PCA projection is an alternative feature set:
# X = df_data_general[['PCA1', 'PCA2']]
feature_columns = ['Cluster', 'INITIAL_AMOUNT', 'AVG_BET']
X = df_data_general[feature_columns]
y = df_data_general['games_played_label']

# 70 % of the rows go to training; the seed keeps the split reproducible.
split_parts = train_test_split(X, y, train_size=0.7, random_state=54)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Disabled hyper-parameter search, kept for reference. The whole cell is
# commented out, so nothing here runs. It describes a StandardScaler + KNN
# pipeline tuned with GridSearchCV over n_neighbors = 1..10, using 2-fold
# cross-validation on the training split and refitting the best model.
# pipe = Pipeline([
#     ("scaler", StandardScaler()),
#     ("knn", KNeighborsClassifier())
# ])

# param_grid = {
#     "knn__n_neighbors": range(1, 11)
# }

# grid = GridSearchCV(pipe, param_grid=param_grid, cv=2, refit=True)

# grid.fit(X_train, y_train)

In [None]:
# Disabled together with the grid search: these lines would print the best
# parameter set and the best score found by `grid`.
# NOTE(review): the output stored under this cell presumably comes from an
# earlier run made while the code was still active.
# print("Mejor k:", grid.best_params_)
# print("Mejor score:", grid.best_score_)

Mejor k: {'knn__n_neighbors': 10}
Mejor score: 0.42993702204228523


In [None]:
# KNN classifier preceded by standardisation of the input columns.
# NOTE(review): the stored grid-search output reports n_neighbors=10 as the
# best value, while this pipeline uses 5 — confirm which one is intended.
knn_steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
]
knn = Pipeline(steps=knn_steps)

knn.fit(X_train, y_train)

In [None]:
# Predicted bucket label for every row of the test split.
y_hat = knn.predict(X=X_test)

In [25]:
# Per-class precision, recall and F1 of the KNN predictions on the test split.
report_text = classification_report(y_test, y_hat)
print(report_text)

              precision    recall  f1-score   support

           0       0.48      0.61      0.53     11608
           1       0.37      0.39      0.38      9239
           2       0.33      0.30      0.31      8557
           3       0.33      0.19      0.24      7432
           4       0.84      0.90      0.87      3814

    accuracy                           0.44     40650
   macro avg       0.47      0.48      0.47     40650
weighted avg       0.43      0.44      0.43     40650



In [26]:
# Majority-class baseline: the accuracy obtained by always predicting the most
# frequent label of the test split.
unique, counts = np.unique(y_test, return_counts=True)
majority_count = counts.max()
total_count = counts.sum()
baseline = majority_count / total_count
print("Baseline (clase mayoritaria):", baseline)

Baseline (clase mayoritaria): 0.28555965559655594


In [None]:
# One-sample z-test for a proportion: does the KNN test accuracy differ from
# the majority-class baseline?
from statsmodels.stats.proportion import proportions_ztest

n_total = len(y_test)
n_correct = (y_test == y_hat).sum()

# Derive the baseline from the current test split instead of pasting a number
# printed by an earlier run; a pasted literal silently goes stale whenever the
# data or the split changes.
class_labels, class_counts = np.unique(y_test, return_counts=True)
baseline = class_counts.max() / class_counts.sum()

# `value` is the proportion under the null hypothesis; the test is two-sided
# by default.
stat, pval = proportions_ztest(n_correct, n_total, value=baseline)
print("p-value:", pval)

p-value: 0.0
