In [20]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.metrics import roc_curve, RocCurveDisplay, roc_auc_score, \
                            auc, confusion_matrix, accuracy_score, \
                            classification_report

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
pd.set_option('display.float_format', '{:.2f}'.format)

In [21]:
df_data_general = pd.read_csv('../../data/data_general.csv')

In [22]:
def define_time_on_device(time):
    time /= 60
    if time < 10:
        return 0
    if time > 10 and time < 30:
        return 1
    if time >30 and time < 60:
        return 2
    else:
        return 3
    

In [23]:
df_data_general['time_on_device_label'] =df_data_general['TIME_ON_DEVICE_SEC'].apply(define_time_on_device)

In [24]:
df_data_general[['INITIAL_AMOUNT', 'Rango_Edad_le','Cluster', 'TIME_ON_DEVICE_SEC', 'time_on_device_label', 'GAMES_PLAYED_TOTAL']].corr()

Unnamed: 0,INITIAL_AMOUNT,Rango_Edad_le,Cluster,TIME_ON_DEVICE_SEC,time_on_device_label,GAMES_PLAYED_TOTAL
INITIAL_AMOUNT,1.0,-0.0,0.02,0.07,0.08,0.09
Rango_Edad_le,-0.0,1.0,-0.79,0.07,0.08,0.06
Cluster,0.02,-0.79,1.0,0.14,0.08,0.17
TIME_ON_DEVICE_SEC,0.07,0.07,0.14,1.0,0.82,0.89
time_on_device_label,0.08,0.08,0.08,0.82,1.0,0.75
GAMES_PLAYED_TOTAL,0.09,0.06,0.17,0.89,0.75,1.0


In [25]:
X = df_data_general[['INITIAL_AMOUNT', 'Rango_Edad_le','Cluster', 'AVG_BET']]
y = df_data_general['time_on_device_label']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [27]:
std_scaler = StandardScaler()

knn = Pipeline([
        ('scaler', std_scaler),
        ('clf', KNeighborsClassifier(n_neighbors=4))
    ])
knn.fit(X_train, y_train)

In [28]:
y_hat = knn.predict(X_test)

In [29]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.70      0.90      0.79     40013
           1       0.39      0.19      0.25     16614
           2       0.50      0.39      0.44      5393
           3       0.60      0.38      0.46      2842

    accuracy                           0.65     64862
   macro avg       0.55      0.46      0.49     64862
weighted avg       0.60      0.65      0.61     64862



In [33]:
from statsmodels.stats.contingency_tables import mcnemar
con_matrix = confusion_matrix(y_test, y_hat)
result = mcnemar(con_matrix, exact=True)
print('p-value:', result.pvalue)

p-value: 0.0


In [36]:

unique, counts = np.unique(y_test, return_counts=True)
baseline = counts.max() / counts.sum()
print("Baseline (clase mayoritaria):", baseline)

Baseline (clase mayoritaria): 0.6168943294995529


In [38]:
from statsmodels.stats.proportion import proportions_ztest

# tus datos
n_total = len(y_test)
n_correct = (y_test == y_hat).sum()
baseline = 0.6168943294995529  

stat, pval = proportions_ztest(n_correct, n_total, baseline)
print("p-value:", pval)

p-value: 1.6136578213851851e-69


In [11]:
gaussian = Pipeline([
        ('scaler', std_scaler),
        ('clf', GaussianNB())
    ])
gaussian.fit(X_train, y_train)

In [12]:
y_hat = gaussian.predict(X_test)
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.62      0.99      0.76     39985
           1       0.23      0.01      0.01     16541
           2       0.40      0.00      0.00      5419
           3       0.00      0.00      0.00      2917

    accuracy                           0.61     64862
   macro avg       0.31      0.25      0.19     64862
weighted avg       0.47      0.61      0.47     64862



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
random = Pipeline([
        ('scaler', std_scaler),
        ('clf', RandomForestClassifier())
    ])
random.fit(X_train, y_train)

In [14]:
y_hat = random.predict(X_test)
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.72      0.79      0.75     39985
           1       0.36      0.30      0.33     16541
           2       0.42      0.36      0.39      5419
           3       0.54      0.45      0.49      2917

    accuracy                           0.61     64862
   macro avg       0.51      0.48      0.49     64862
weighted avg       0.59      0.61      0.60     64862



In [16]:
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

X = df_data_general[['INITIAL_AMOUNT', 'Rango_Edad_le','Cluster', 'AVG_BET']]
y = df_data_general['time_on_device_label']

y_encoded = to_categorical(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, train_size=0.7)

model = Sequential([
    Input(shape=(4,)),                 # capa de entrada explícita
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(4, activation='softmax')
])

2025-08-23 08:54:19.824243: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# 8. Entrenar el modelo


In [18]:
history = model.fit(X_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.2,
                    verbose=1)

2025-08-23 08:54:20.023830: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
loss, accuracy = model.evaluate(X_test, y_test)

