In [None]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import roc_curve, RocCurveDisplay, roc_auc_score, \
                            auc, confusion_matrix, accuracy_score, \
                            classification_report

from sklearn.preprocessing import label_binarize
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Load the general dataset from the CSV file (path is relative to the notebook's working directory).
df_data_general = pd.read_csv(filepath_or_buffer='../../data/data_general.csv')

In [None]:
# Feature matrix: the five predictor columns listed below; target: the 'Cluster' column.
X = df_data_general.loc[:, ['AVG_BET', 'INITIAL_AMOUNT', 'GAMES_PLAYED_TOTAL',
                            'GAMES_WON_TOTAL', 'Rango_Edad_le']]
y = df_data_general.loc[:, 'Cluster']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=45, test_size=0.3
)

In [None]:
# Module-level standardizer with the default settings spelled out.
# NOTE(review): scikit-learn Pipelines fit the step objects they are given in
# place, so any pipelines that reuse this single instance share its fitted state.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

# KNN

### Datos crudos

In [None]:
# Baseline: k-nearest neighbours (k=4) on the raw features, no preprocessing step.
# Pipeline.fit returns the fitted pipeline itself, so it can be chained onto the constructor.
knn_pipeline_crudo = Pipeline(steps=[
    ('clf', KNeighborsClassifier(n_neighbors=4)),
]).fit(X_train, y_train)

# Predict the held-out split, then print the per-class report followed by overall accuracy.
knn_crudo_y_pred = knn_pipeline_crudo.predict(X_test)
print(
    classification_report(y_test, knn_crudo_y_pred),
    accuracy_score(y_test, knn_crudo_y_pred),
    sep='\n',
)

### Solo STD

In [None]:
# KNN (k=4) on standardized features.
# A fresh StandardScaler is built here rather than reusing the module-level
# `scaler`: Pipeline fits the step objects it receives in place, so a shared
# instance would be refit (and its fitted state overwritten) by every other
# pipeline that contains it.
knn_pipeline_std = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', KNeighborsClassifier(n_neighbors=4))
    ])
knn_pipeline_std.fit(X_train, y_train)

knn_std_y_pred = knn_pipeline_std.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, knn_std_y_pred))
print(accuracy_score(y_test, knn_std_y_pred))

### Solo PCA

In [None]:
# KNN (k=4) on the first two principal components of the unscaled features.
# random_state pins the projection in case PCA's 'auto' solver selects the
# randomized SVD (depends on data shape and scikit-learn version); the exact
# solvers ignore it. 45 matches the seed used for the train/test split.
knn_pipeline_pca = Pipeline([
        ('pca', PCA(n_components=2, random_state=45)),
        ('clf', KNeighborsClassifier(n_neighbors=4))
    ])
knn_pipeline_pca.fit(X_train, y_train)

knn_pca_y_pred = knn_pipeline_pca.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, knn_pca_y_pred))
print(accuracy_score(y_test, knn_pca_y_pred))

### PCA y STD

In [None]:
# KNN (k=4) on two principal components computed from standardized features.
# - A fresh StandardScaler avoids sharing fitted state with other pipelines
#   (Pipeline fits its step objects in place).
# - random_state pins PCA if the 'auto' solver picks the randomized SVD; the
#   exact solvers ignore it. 45 matches the train/test split seed.
knn_pipeline_pca_std = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2, random_state=45)),
        ('clf', KNeighborsClassifier(n_neighbors=4))
    ])
knn_pipeline_pca_std.fit(X_train, y_train)

knn_pca_std_y_pred = knn_pipeline_pca_std.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, knn_pca_std_y_pred))
print(accuracy_score(y_test, knn_pca_std_y_pred))

### Selección de variables más importantes

In [None]:
# KNN (k=4) on the two features with the highest ANOVA F-score (f_classif),
# selected after standardization.
# A fresh StandardScaler is used instead of the module-level `scaler` so this
# pipeline does not share fitted state with the others (Pipeline fits its
# step objects in place).
knn_pipeline_select = Pipeline([
        ('scaler', StandardScaler()),
        ('select', SelectKBest(score_func=f_classif, k=2)),
        ('clf', KNeighborsClassifier(n_neighbors=4))
    ])
knn_pipeline_select.fit(X_train, y_train)

knn_select_y_pred = knn_pipeline_select.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, knn_select_y_pred))
print(accuracy_score(y_test, knn_select_y_pred))

# Ridge Classifier

### Datos crudos

In [None]:
# Baseline: ridge classifier with default settings on the raw features.
# Pipeline.fit returns the fitted pipeline itself, so it can be chained onto the constructor.
ridge_pipeline_crudo = Pipeline(steps=[
    ('clf', RidgeClassifier()),
]).fit(X_train, y_train)

# Predict the held-out split, then print the per-class report followed by overall accuracy.
ridge_crudo_y_pred = ridge_pipeline_crudo.predict(X_test)
print(
    classification_report(y_test, ridge_crudo_y_pred),
    accuracy_score(y_test, ridge_crudo_y_pred),
    sep='\n',
)

### Solo STD

In [None]:
# Ridge classifier on standardized features.
# A fresh StandardScaler is built here rather than reusing the module-level
# `scaler`: Pipeline fits the step objects it receives in place, so a shared
# instance would be refit by every other pipeline that contains it.
ridge_pipeline_std = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RidgeClassifier())
    ])
ridge_pipeline_std.fit(X_train, y_train)

ridge_std_y_pred = ridge_pipeline_std.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, ridge_std_y_pred))
print(accuracy_score(y_test, ridge_std_y_pred))

### Solo PCA

In [None]:
# Ridge classifier on the first two principal components of the unscaled features.
# random_state pins the projection in case PCA's 'auto' solver selects the
# randomized SVD (depends on data shape and scikit-learn version); the exact
# solvers ignore it. 45 matches the seed used for the train/test split.
ridge_pipeline_pca = Pipeline([
        ('pca', PCA(n_components=2, random_state=45)),
        ('clf', RidgeClassifier())
    ])
ridge_pipeline_pca.fit(X_train, y_train)

ridge_pca_y_pred = ridge_pipeline_pca.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, ridge_pca_y_pred))
print(accuracy_score(y_test, ridge_pca_y_pred))

### PCA y STD

In [None]:
# Ridge classifier on two principal components computed from standardized features.
# - A fresh StandardScaler avoids sharing fitted state with other pipelines
#   (Pipeline fits its step objects in place).
# - random_state pins PCA if the 'auto' solver picks the randomized SVD; the
#   exact solvers ignore it. 45 matches the train/test split seed.
ridge_pipeline_pca_std = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2, random_state=45)),
        ('clf', RidgeClassifier())
    ])
ridge_pipeline_pca_std.fit(X_train, y_train)

ridge_pca_std_y_pred = ridge_pipeline_pca_std.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, ridge_pca_std_y_pred))
print(accuracy_score(y_test, ridge_pca_std_y_pred))

### Selección de variables más importantes

In [None]:
# Ridge classifier on the two features with the highest ANOVA F-score
# (f_classif), selected after standardization.
# A fresh StandardScaler is used instead of the module-level `scaler` so this
# pipeline does not share fitted state with the others (Pipeline fits its
# step objects in place).
ridge_pipeline_select = Pipeline([
        ('scaler', StandardScaler()),
        ('select', SelectKBest(score_func=f_classif, k=2)),
        ('clf', RidgeClassifier())
    ])
ridge_pipeline_select.fit(X_train, y_train)

ridge_select_y_pred = ridge_pipeline_select.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, ridge_select_y_pred))
print(accuracy_score(y_test, ridge_select_y_pred))

# Gaussian Naive Bayes

### Datos crudos

In [None]:
# Baseline: Gaussian naive Bayes with default settings on the raw features.
# Pipeline.fit returns the fitted pipeline itself, so it can be chained onto the constructor.
gaussian_pipeline_crudo = Pipeline(steps=[
    ('clf', GaussianNB()),
]).fit(X_train, y_train)

# Predict the held-out split, then print the per-class report followed by overall accuracy.
gaussian_crudo_y_pred = gaussian_pipeline_crudo.predict(X_test)
print(
    classification_report(y_test, gaussian_crudo_y_pred),
    accuracy_score(y_test, gaussian_crudo_y_pred),
    sep='\n',
)

### Solo STD

In [None]:
# Gaussian naive Bayes on standardized features.
# A fresh StandardScaler is built here rather than reusing the module-level
# `scaler`: Pipeline fits the step objects it receives in place, so a shared
# instance would be refit by every other pipeline that contains it.
gaussian_pipeline_std = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', GaussianNB())
    ])
gaussian_pipeline_std.fit(X_train, y_train)

gaussian_std_y_pred = gaussian_pipeline_std.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, gaussian_std_y_pred))
print(accuracy_score(y_test, gaussian_std_y_pred))

### Solo PCA

In [None]:
# Gaussian naive Bayes on the first two principal components of the unscaled features.
# random_state pins the projection in case PCA's 'auto' solver selects the
# randomized SVD (depends on data shape and scikit-learn version); the exact
# solvers ignore it. 45 matches the seed used for the train/test split.
gaussian_pipeline_pca = Pipeline([
        ('pca', PCA(n_components=2, random_state=45)),
        ('clf', GaussianNB())
    ])
gaussian_pipeline_pca.fit(X_train, y_train)

gaussian_pca_y_pred = gaussian_pipeline_pca.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, gaussian_pca_y_pred))
print(accuracy_score(y_test, gaussian_pca_y_pred))

### PCA y STD

In [None]:
# Gaussian naive Bayes on two principal components computed from standardized features.
# - A fresh StandardScaler avoids sharing fitted state with other pipelines
#   (Pipeline fits its step objects in place).
# - random_state pins PCA if the 'auto' solver picks the randomized SVD; the
#   exact solvers ignore it. 45 matches the train/test split seed.
gaussian_pipeline_pca_std = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2, random_state=45)),
        ('clf', GaussianNB())
    ])
gaussian_pipeline_pca_std.fit(X_train, y_train)

gaussian_pca_std_y_pred = gaussian_pipeline_pca_std.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, gaussian_pca_std_y_pred))
print(accuracy_score(y_test, gaussian_pca_std_y_pred))

### Selección de variables más importantes

In [None]:
# Gaussian naive Bayes on the two features with the highest ANOVA F-score
# (f_classif), selected after standardization.
# A fresh StandardScaler is used instead of the module-level `scaler` so this
# pipeline does not share fitted state with the others (Pipeline fits its
# step objects in place).
gaussian_pipeline_select = Pipeline([
        ('scaler', StandardScaler()),
        ('select', SelectKBest(score_func=f_classif, k=2)),
        ('clf', GaussianNB())
    ])
gaussian_pipeline_select.fit(X_train, y_train)

gaussian_select_y_pred = gaussian_pipeline_select.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, gaussian_select_y_pred))
print(accuracy_score(y_test, gaussian_select_y_pred))

# Random Forest

### Datos crudos

In [None]:
# Baseline: random forest with default hyperparameters on the raw features.
# random_state is fixed because the forest's bootstrap sampling and feature
# sub-sampling are random; without a seed the reported scores change on every
# run. 45 matches the seed used for the train/test split.
random_pipeline_crudo = Pipeline([('clf', RandomForestClassifier(random_state=45))])
random_pipeline_crudo.fit(X_train, y_train)

random_crudo_y_pred = random_pipeline_crudo.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, random_crudo_y_pred))
print(accuracy_score(y_test, random_crudo_y_pred))

### Solo STD

In [None]:
# Random forest on standardized features.
# - A fresh StandardScaler avoids sharing fitted state with other pipelines
#   (Pipeline fits its step objects in place).
# - random_state makes the forest reproducible across runs; 45 matches the
#   train/test split seed.
random_pipeline_std = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=45))
    ])
random_pipeline_std.fit(X_train, y_train)

random_std_y_pred = random_pipeline_std.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, random_std_y_pred))
print(accuracy_score(y_test, random_std_y_pred))

### Solo PCA

In [None]:
# Random forest on the first two principal components of the unscaled features.
# Both stochastic steps are seeded (45, same as the train/test split) so the
# cell reproduces its scores: the forest always uses randomness, and PCA does
# when its 'auto' solver selects the randomized SVD.
random_pipeline_pca = Pipeline([
        ('pca', PCA(n_components=2, random_state=45)),
        ('clf', RandomForestClassifier(random_state=45))
    ])
random_pipeline_pca.fit(X_train, y_train)

random_pca_y_pred = random_pipeline_pca.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, random_pca_y_pred))
print(accuracy_score(y_test, random_pca_y_pred))

### PCA y STD

In [None]:
# Random forest on two principal components computed from standardized features.
# - A fresh StandardScaler avoids sharing fitted state with other pipelines
#   (Pipeline fits its step objects in place).
# - Both stochastic steps are seeded (45, same as the train/test split): the
#   forest always uses randomness, and PCA does when its 'auto' solver
#   selects the randomized SVD.
random_pipeline_pca_std = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2, random_state=45)),
        ('clf', RandomForestClassifier(random_state=45))
    ])
random_pipeline_pca_std.fit(X_train, y_train)

random_pca_std_y_pred = random_pipeline_pca_std.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, random_pca_std_y_pred))
print(accuracy_score(y_test, random_pca_std_y_pred))

### Selección de variables más importantes

In [None]:
# Random forest on the two features with the highest ANOVA F-score
# (f_classif), selected after standardization.
# - A fresh StandardScaler avoids sharing fitted state with other pipelines
#   (Pipeline fits its step objects in place).
# - random_state makes the forest reproducible across runs; 45 matches the
#   train/test split seed.
random_pipeline_select = Pipeline([
        ('scaler', StandardScaler()),
        ('select', SelectKBest(score_func=f_classif, k=2)),
        ('clf', RandomForestClassifier(random_state=45))
    ])
random_pipeline_select.fit(X_train, y_train)

random_select_y_pred = random_pipeline_select.predict(X_test)

# Per-class report first, then overall accuracy on the held-out split.
print(classification_report(y_test, random_select_y_pred))
print(accuracy_score(y_test, random_select_y_pred))