In [3]:
import sys
import os

# Ruta al proyecto raíz donde está la carpeta 'scripts'
project_root = os.path.abspath('../../')  # o usa la ruta absoluta si prefieres
sys.path.append(project_root)

In [4]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import roc_curve, RocCurveDisplay, roc_auc_score, \
                            auc, confusion_matrix, accuracy_score, \
                            classification_report

from sklearn.preprocessing import label_binarize
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sqlalchemy.orm import sessionmaker
from sklearn.preprocessing import LabelEncoder

from datetime import datetime
from dotenv import load_dotenv
from sqlalchemy import create_engine

import joblib

In [5]:
df_data_general = pd.read_csv('../../data/data_general.csv')

In [6]:
X = df_data_general[['AVG_BET','INITIAL_AMOUNT', 'GAMES_PLAYED_TOTAL', 'GAMES_WON_TOTAL', 'Rango_Edad_le']]
y = df_data_general['Cluster']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)

In [7]:
scaler = StandardScaler()

In [8]:
knn_pipeline_std = Pipeline([
        ('scaler', scaler),
        ('clf', KNeighborsClassifier(n_neighbors=4))
    ])
knn_pipeline_std.fit(X_train, y_train)

knn_std_y_pred = knn_pipeline_std.predict(X_test)

print(classification_report(y_test, knn_std_y_pred))
print(accuracy_score(y_test, knn_std_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     33587
           1       0.98      0.97      0.97      5625
           2       1.00      1.00      1.00     24953
           3       0.99      0.95      0.97       697

    accuracy                           1.00     64862
   macro avg       0.99      0.98      0.98     64862
weighted avg       1.00      1.00      1.00     64862

0.9953902130677438


In [9]:
query = """ 
BEGIN
    DECLARE @TotalRows INT;
    DECLARE @SampleSize INT;
    DECLARE @Q1 FLOAT, @Q3 FLOAT, @IQR FLOAT;
    DECLARE @LowerBound FLOAT, @UpperBound FLOAT;

    -- Contar el total de filas
    SELECT @TotalRows = COUNT(*)
    FROM dbo.BI_GAME_SESSIONS gs
    INNER JOIN dbo.BI_CARDS crd ON crd.CARD_ID = gs.CARD_ID
    INNER JOIN dbo.BI_PLAYERS pl ON pl.PLAYER_ID = crd.PLAYER_ID
    WHERE gs.INITIAL_TIME >= '2024-01-01';

    -- Calcular tamaño de muestra
    SET @SampleSize = CEILING(@TotalRows * 0.01);

    -- Calcular Q1 y Q3 usando PERCENTILE_CONT
    WITH OrderedValues AS (
        SELECT CAST(gs.INITIAL_AMOUNT AS FLOAT) AS INITIAL_AMOUNT
        FROM dbo.BI_GAME_SESSIONS gs
        INNER JOIN dbo.BI_CARDS crd ON crd.CARD_ID = gs.CARD_ID
        INNER JOIN dbo.BI_PLAYERS pl ON pl.PLAYER_ID = crd.PLAYER_ID
        WHERE gs.INITIAL_TIME >= '2024-01-01'
    )
  

    -- Seleccionar muestra filtrando outliers
    SELECT TOP (@SampleSize)
           pl.PLAYER_ID,
           pl.DOB,
           gs.AVG_BET,
           gs.INITIAL_AMOUNT,
           gs.GAMES_PLAYED_TOTAL,
           gs.GAMES_WON_TOTAL
    FROM dbo.BI_GAME_SESSIONS gs
    INNER JOIN dbo.BI_CARDS crd ON crd.CARD_ID = gs.CARD_ID
    INNER JOIN dbo.BI_PLAYERS pl ON pl.PLAYER_ID = crd.PLAYER_ID
    WHERE gs.INITIAL_TIME >= '2024-01-01'
    ORDER BY CHECKSUM(NEWID());
END
"""

In [15]:
load_dotenv()


server = os.getenv('SERVER')
username = os.getenv('SQL_USERNAME')
password = os.getenv('SQL_PASSWORD')

## Neza ##
database = 'ewise_BI_All'
connection_string = f'mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server'
engine = create_engine(connection_string)
Session = sessionmaker(bind=engine)
session = Session()
df = pd.read_sql_query(query, engine)

In [16]:
df['DOB'] = pd.to_datetime(df['DOB'])
hoy = datetime.now()
df['Edad'] = hoy.year - df['DOB'].dt.year
df['Edad'] -= ((hoy.month < df['DOB'].dt.month) | 
               ((hoy.month == df['DOB'].dt.month) & 
                (hoy.day < df['DOB'].dt.day)))

bins = [18, 25, 35, 45, 55, 65, 100]  # 100 es un valor arbitrario para cubrir edades mayores a 65
labels = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']

# Crear la columna 'Rango_Edad'
df['Rango_Edad'] = pd.cut(df['Edad'], bins=bins, labels=labels, right=False)

# le_edad = LabelEncoder()
# df['Rango_Edad_le'] = le_edad.fit_transform(df['Rango_Edad'])

In [17]:
mapping = dict(zip(df_data_general['Rango_Edad'], df_data_general['Rango_Edad_le']))

print(mapping)

df['Rango_Edad_le'] = df['Rango_Edad'].map(mapping)

{'18-24': 0, '45-54': 3, '55-64': 4, '65+': 5, '25-34': 1, '35-44': 2, nan: 6}


In [18]:
df = df.dropna()

In [19]:
X = df[['AVG_BET','INITIAL_AMOUNT', 'GAMES_PLAYED_TOTAL', 'GAMES_WON_TOTAL', 'Rango_Edad_le']]
y_hat_test = knn_pipeline_std.predict(X)

In [20]:
df['Prediction'] = y_hat_test

In [21]:
df.groupby('Prediction').agg({
    'Edad': 'mean',
    'AVG_BET': 'mean',
    'INITIAL_AMOUNT': 'mean',
    'GAMES_PLAYED_TOTAL': 'mean',
    'GAMES_WON_TOTAL': 'mean'
})

Unnamed: 0_level_0,Edad,AVG_BET,INITIAL_AMOUNT,GAMES_PLAYED_TOTAL,GAMES_WON_TOTAL
Prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,65.293702,3.091198,286.164233,135.386717,41.185849
1,58.802817,3.201119,352.214463,777.16679,249.4255
2,42.207706,3.740381,233.976135,132.899681,37.322389
3,59.141104,2.315951,276.364908,2171.294479,715.803681


In [22]:
df_data_general.groupby('Cluster').agg({
    'Edad': 'mean',
    'AVG_BET': 'mean',
    'INITIAL_AMOUNT': 'mean',
    'GAMES_PLAYED_TOTAL': 'mean',
    'GAMES_WON_TOTAL': 'mean'
})

Unnamed: 0_level_0,Edad,AVG_BET,INITIAL_AMOUNT,GAMES_PLAYED_TOTAL,GAMES_WON_TOTAL
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,65.646886,3.9762,262.245005,114.626366,33.531705
1,59.242393,4.042296,329.413599,775.925176,238.116946
2,43.062897,5.163098,269.169138,113.728336,33.551481
3,59.227312,3.728691,318.534131,2281.816335,734.659896


In [23]:
joblib.dump(knn_pipeline_std, '../../models/knn_pipeline_foliattiGeneral_v0.pkl')

['../../models/knn_pipeline_foliattiGeneral_v0.pkl']

In [24]:
df.head(1)

Unnamed: 0,PLAYER_ID,DOB,AVG_BET,INITIAL_AMOUNT,GAMES_PLAYED_TOTAL,GAMES_WON_TOTAL,Edad,Rango_Edad,Rango_Edad_le,Prediction
0,2B99B7E5-C944-4150-940B-77C52D182F56,1996-12-28,4.4,273.57,81.0,37.0,28,25-34,1,2
