In [14]:
import sys
import os

# Path to the project root, where the 'scripts' folder lives.
# It is resolved relative to the current working directory; use an absolute
# path here instead if you prefer.
project_root = os.path.abspath('../../')
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [15]:
from datetime import datetime
from urllib.parse import quote

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from sklearn.metrics import roc_curve, RocCurveDisplay, roc_auc_score, \
                            auc, confusion_matrix, accuracy_score, \
                            classification_report
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [16]:
# Reference dataset read from the project's data folder. Further down it is
# used to recover the Rango_Edad -> Rango_Edad_le encoding.
df_data_general = pd.read_csv('../../data/data_general.csv')

In [17]:
# T-SQL batch that draws a random sample of game sessions started on or after
# 2024-01-01, joined to their card and player rows:
#   1. counts the joined rows and stores CEILING(1% of that count) in @SampleSize;
#   2. returns TOP (@SampleSize) rows ordered by CHECKSUM(NEWID()), i.e. in a
#      random order, so every execution returns a different sample.
# NOTE(review): @Q1/@Q3/@IQR/@LowerBound/@UpperBound and the OrderedValues CTE
# are declared but never referenced, so no outlier filtering is applied despite
# what the in-query comments say — confirm whether that filter was intended.
query = """ 
BEGIN
    DECLARE @TotalRows INT;
    DECLARE @SampleSize INT;
    DECLARE @Q1 FLOAT, @Q3 FLOAT, @IQR FLOAT;
    DECLARE @LowerBound FLOAT, @UpperBound FLOAT;

    -- Contar el total de filas
    SELECT @TotalRows = COUNT(*)
    FROM dbo.BI_GAME_SESSIONS gs
    INNER JOIN dbo.BI_CARDS crd ON crd.CARD_ID = gs.CARD_ID
    INNER JOIN dbo.BI_PLAYERS pl ON pl.PLAYER_ID = crd.PLAYER_ID
    WHERE gs.INITIAL_TIME >= '2024-01-01';

    -- Calcular tamaño de muestra
    SET @SampleSize = CEILING(@TotalRows * 0.01);

    -- Calcular Q1 y Q3 usando PERCENTILE_CONT
    WITH OrderedValues AS (
        SELECT CAST(gs.INITIAL_AMOUNT AS FLOAT) AS INITIAL_AMOUNT
        FROM dbo.BI_GAME_SESSIONS gs
        INNER JOIN dbo.BI_CARDS crd ON crd.CARD_ID = gs.CARD_ID
        INNER JOIN dbo.BI_PLAYERS pl ON pl.PLAYER_ID = crd.PLAYER_ID
        WHERE gs.INITIAL_TIME >= '2024-01-01'
    )
  

    -- Seleccionar muestra filtrando outliers
    SELECT TOP (@SampleSize)
           pl.PLAYER_ID,
           pl.DOB,
           pl.GENDER,
           gs.AVG_BET,
           gs.BET_TOTAL,
           gs.INITIAL_AMOUNT,
           gs.INITIAL_PROMO_AMOUNT,
           gs.FINAL_AMOUNT,
           gs.FINAL_PROMO_AMOUNT,
           gs.MACHINE_ID,
           gs.WIN_TOTAL,
           gs.GAMES_PLAYED_TOTAL,
           gs.GAMES_WON_TOTAL,
           gs.TIME_ON_DEVICE_SEC
    FROM dbo.BI_GAME_SESSIONS gs
    INNER JOIN dbo.BI_CARDS crd ON crd.CARD_ID = gs.CARD_ID
    INNER JOIN dbo.BI_PLAYERS pl ON pl.PLAYER_ID = crd.PLAYER_ID
    WHERE gs.INITIAL_TIME >= '2024-01-01'
    ORDER BY CHECKSUM(NEWID());
END
"""

In [18]:
# Load the two persisted pipelines used below: the first one predicts a cluster
# for each session, the second one is scored against `time_on_device_label`.
# NOTE(review): `bet_total_classifier` is loaded from the *time_on_device*
# pipeline file, so the variable name looks stale — confirm before renaming.
# joblib.load unpickles arbitrary Python objects: only load files from a
# trusted source.
cluster_classifier = joblib.load('../../models/knn_pipeline_foliattiGeneral_v0.pkl')
bet_total_classifier = joblib.load('../../models/time_on_device_pipeline_foliattiGeneral_v0.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [19]:
# Read SERVER / SQL_USERNAME / SQL_PASSWORD from the environment (load_dotenv
# first pulls them in from a local .env file when one exists).
load_dotenv()

server = os.getenv('SERVER')
username = os.getenv('SQL_USERNAME')
password = os.getenv('SQL_PASSWORD')

# Fail early with a clear message instead of building a URL that contains the
# literal text "None" and erroring later inside the driver.
missing_settings = [
    env_name
    for env_name, env_value in (
        ('SERVER', server),
        ('SQL_USERNAME', username),
        ('SQL_PASSWORD', password),
    )
    if env_value is None
]
if missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(missing_settings)
    )

## Neza ##
database = 'ewise_BI_All'
# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise be parsed as URL delimiters.
connection_string = (
    f'mssql+pyodbc://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server'
)
engine = create_engine(connection_string)
# NOTE(review): `session` is not used by this cell; it is kept in case later
# cells rely on it.
Session = sessionmaker(bind=engine)
session = Session()
# Run the sampling query and load its result set into a DataFrame.
df = pd.read_sql_query(query, engine)

In [20]:
# Parse the birth dates, then derive each player's age in completed years.
df['DOB'] = pd.to_datetime(df['DOB'])
hoy = datetime.now()
birth_month = df['DOB'].dt.month
birth_day = df['DOB'].dt.day
# True where this year's birthday has not happened yet, so one year is taken off.
birthday_pending = (birth_month > hoy.month) | (
    (birth_month == hoy.month) & (birth_day > hoy.day)
)
df['Edad'] = (hoy.year - df['DOB'].dt.year) - birthday_pending

# Left-closed age brackets: [18, 25), [25, 35), ..., [65, 100).
# 100 is an arbitrary upper edge for the '65+' bracket; ages below 18 or of
# 100 and above fall outside every bracket and become NaN.
labels = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
bins = [18, 25, 35, 45, 55, 65, 100]

# Create the 'Rango_Edad' column.
df['Rango_Edad'] = pd.cut(df['Edad'], labels=labels, bins=bins, right=False)

# le_edad = LabelEncoder()
# df['Rango_Edad_le'] = le_edad.fit_transform(df['Rango_Edad'])

In [21]:
# Rebuild the age-range -> integer-code lookup from the reference dataset
# (when a range appears more than once, its last code wins).
age_ranges = df_data_general['Rango_Edad']
age_codes = df_data_general['Rango_Edad_le']
mapping = {age_range: age_code for age_range, age_code in zip(age_ranges, age_codes)}

print(mapping)

# Encode the sampled sessions with that same lookup.
df['Rango_Edad_le'] = df['Rango_Edad'].map(mapping)

{'18-24': 0, '45-54': 3, '55-64': 4, '65+': 5, '25-34': 1, '35-44': 2, nan: 6}


In [27]:
def define_time_on_device(time):
    """Bucket a session duration into a time-on-device class.

    Parameters
    ----------
    time : int or float
        Session duration in seconds.

    Returns
    -------
    int
        0 for under 10 minutes, 1 for [10, 30) minutes, 2 for [30, 60)
        minutes and 3 for 60 minutes or more.
    """
    minutes = time / 60
    # Lower bounds are inclusive: with strict comparisons on both sides, a
    # duration of exactly 10 or 30 minutes matched no branch and fell through
    # to class 3.
    if minutes < 10:
        return 0
    if minutes < 30:
        return 1
    if minutes < 60:
        return 2
    # NaN also lands here, because every comparison above is False for it.
    return 3
    
# Derive the target label by bucketing each session's duration (in seconds).
session_seconds = df['TIME_ON_DEVICE_SEC']
df['time_on_device_label'] = session_seconds.map(define_time_on_device)

In [28]:
# Drop every row that has a missing value in any column. This includes rows
# whose age fell outside the Rango_Edad brackets and therefore got NaN.
df = df.dropna()

In [29]:
# Columns passed to the cluster pipeline, in this order.
cluster_features = [
    'AVG_BET',
    'INITIAL_AMOUNT',
    'GAMES_PLAYED_TOTAL',
    'GAMES_WON_TOTAL',
    'Rango_Edad_le',
]
X = df[cluster_features]
y_hat_test = cluster_classifier.predict(X)

In [30]:
# Store the predicted cluster; it is one of the input columns of the next pipeline.
df['Cluster'] = y_hat_test

In [31]:
# Columns passed to the second pipeline, in this order.
time_on_device_features = ['INITIAL_AMOUNT', 'Rango_Edad_le', 'Cluster', 'AVG_BET']
X = df[time_on_device_features]
# Ground-truth labels derived from TIME_ON_DEVICE_SEC.
y = df['time_on_device_label']
y_hat_bet_total = bet_total_classifier.predict(X)

In [32]:
# Per-class precision / recall / F1 of the predicted labels against the labels
# derived from TIME_ON_DEVICE_SEC.
print(classification_report(y, y_hat_bet_total))

              precision    recall  f1-score   support

           0       0.68      0.85      0.76      6484
           1       0.41      0.23      0.30      3198
           2       0.52      0.45      0.48      1144
           3       0.58      0.43      0.49       577

    accuracy                           0.62     11403
   macro avg       0.55      0.49      0.51     11403
weighted avg       0.58      0.62      0.59     11403

