In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib

from sklearn.metrics import roc_curve, RocCurveDisplay, roc_auc_score, \
                            auc, confusion_matrix, accuracy_score, \
                            classification_report

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
df_data_general = pd.read_csv('../../data/data_general.csv')

In [3]:
q25, q50, q75 = df_data_general['TIME_ON_DEVICE_SEC'].quantile([0.25, 0.5, 0.75])

bins = [0, q25, q50, q75, float('inf')]
labels = [0, 1, 2, 3]

df_data_general['time_on_device_label'] = pd.cut(
    df_data_general['TIME_ON_DEVICE_SEC'],
    bins=bins,
    labels=labels,
    include_lowest=True
).astype(int)

In [4]:
df_data_general[['INITIAL_AMOUNT', 'Rango_Edad_le','Cluster', 'TIME_ON_DEVICE_SEC', 'time_on_device_label', 'GAMES_PLAYED_TOTAL']].corr()

Unnamed: 0,INITIAL_AMOUNT,Rango_Edad_le,Cluster,TIME_ON_DEVICE_SEC,time_on_device_label,GAMES_PLAYED_TOTAL
INITIAL_AMOUNT,1.0,-0.0,0.02,0.07,0.12,0.09
Rango_Edad_le,-0.0,1.0,-0.79,0.07,0.08,0.06
Cluster,0.02,-0.79,1.0,0.14,0.02,0.17
TIME_ON_DEVICE_SEC,0.07,0.07,0.14,1.0,0.6,0.89
time_on_device_label,0.12,0.08,0.02,0.6,1.0,0.56
GAMES_PLAYED_TOTAL,0.09,0.06,0.17,0.89,0.56,1.0


In [16]:
X = df_data_general[['INITIAL_AMOUNT','Cluster', 'AVG_BET', 'GAMES_PLAYED_TOTAL']]
y = df_data_general['time_on_device_label']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [18]:
std_scaler = StandardScaler()

knn = Pipeline([
        ('scaler', std_scaler),
        ('clf', KNeighborsClassifier(n_neighbors=4))
    ])
knn.fit(X_train, y_train)

In [19]:
y_hat = knn.predict(X_test)

In [20]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.75      0.86      0.81     16382
           1       0.59      0.62      0.61     16189
           2       0.62      0.60      0.61     16171
           3       0.88      0.75      0.81     16120

    accuracy                           0.71     64862
   macro avg       0.71      0.71      0.71     64862
weighted avg       0.71      0.71      0.71     64862



In [21]:
from statsmodels.stats.contingency_tables import mcnemar
con_matrix = confusion_matrix(y_test, y_hat)
result = mcnemar(con_matrix, exact=True)
print('p-value:', result.pvalue)

p-value: 1.182081463122352e-115


In [22]:

unique, counts = np.unique(y_test, return_counts=True)
baseline = counts.max() / counts.sum()
print("Baseline (clase mayoritaria):", baseline)

Baseline (clase mayoritaria): 0.25256698837531993


In [23]:
from statsmodels.stats.proportion import proportions_ztest

# tus datos
n_total = len(y_test)
n_correct = (y_test == y_hat).sum()
baseline = 0.6155992723011933

stat, pval = proportions_ztest(n_correct, n_total, baseline)
print("p-value:", pval)

p-value: 0.0


In [26]:
joblib.dump(knn, '../../models/time_on_device_pipeline_foliattiGeneral_v1.pkl')

['../../models/time_on_device_pipeline_foliattiGeneral_v1.pkl']