# Regresión Logística

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from joblib import dump

# Load the training and test datasets
df_train = pd.read_csv('./dataset/train.csv')
df_test = pd.read_csv('./dataset/test.csv')

# Summarize the data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df_train.info()
print(df_train.head())

# Limpieza de datos

In [None]:
def clean_data(df):
    """Fill missing values and drop the columns not used for modelling.

    Missing ``Year`` values are replaced with the column median and missing
    ``Publisher`` values with the literal ``'Unknown'``. The ``Summary``,
    ``Game Title`` and ``Platform`` columns are removed when present.

    Args:
        df: DataFrame containing at least ``Year`` and ``Publisher`` columns.

    Returns:
        A new, cleaned DataFrame; the input frame is left untouched.

    Raises:
        KeyError: if ``Year`` or ``Publisher`` is missing from ``df``.
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    df = df.copy()

    # Fill missing values
    df['Year'] = df['Year'].fillna(df['Year'].median())
    df['Publisher'] = df['Publisher'].fillna('Unknown')

    # Drop irrelevant columns. Each column is handled independently
    # (errors='ignore'), so a frame lacking any one of them neither raises
    # nor keeps the others.
    df = df.drop(columns=['Summary', 'Game Title', 'Platform'], errors='ignore')

    return df

# Clean the training set; df_train is rebound to the frame clean_data returns.
df_train = clean_data(df_train)

# Preprocesar los datos

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_data(df):
    """Standardize the numeric columns and integer-encode the categorical ones.

    The passed DataFrame is modified in place and the same object is returned.

    NOTE(review): the scaler and encoder are fitted on whatever frame is
    passed in and are not returned, so the identical transformation cannot be
    re-applied to another dataset from here.

    Args:
        df: DataFrame holding the sales/review numeric columns plus
            ``Publisher`` and ``Genre``.

    Returns:
        The same DataFrame with the listed columns replaced by their
        transformed values.
    """
    numeric_features = [
        'North America', 'Europe', 'Japan', 'Rest of World',
        'Global', 'Number of Reviews', 'Wishlist',
    ]
    categorical_features = ['Publisher', 'Genre']

    # Scale all numeric columns in one pass.
    scaled_values = StandardScaler().fit_transform(df[numeric_features])
    df[numeric_features] = scaled_values

    # One encoder instance is reused; fit_transform refits it for each column.
    encoder = LabelEncoder()
    for feature in categorical_features:
        df[feature] = encoder.fit_transform(df[feature])

    return df

df_train = preprocess_data(df_train)

# Build the binary target: 1 when Rating is at least 4, otherwise 0
df_train['High_Rating'] = df_train['Rating'].ge(4).astype(int)

# Split into target (y) and predictors (X); both rating columns are excluded
# from the predictors.
y = df_train['High_Rating']
X = df_train.drop(columns=['Rating', 'High_Rating'])

# Separar los datos en entrenamiento y prueba

In [None]:
# Hold out 20% of the rows for evaluation (test_size=0.2); the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo de regresión logística

In [None]:
# Build the classifier and fit it on the training split (fit() returns the
# estimator itself, so construction and fitting can be chained).
logistic_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Show the learned parameters
print("Coeficientes del modelo:", logistic_model.coef_)
print("Intercepto:", logistic_model.intercept_)

# Evaluar el modelo

In [None]:
# Predict labels for the held-out split
y_pred = logistic_model.predict(X_test)

# Confusion matrix: compute it, then render it as a plot
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()
plt.show()

# Per-class precision / recall / F1 report
print(classification_report(y_test, y_pred))

# Guardar el modelo