# Cargar y explorar los datos:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score

# If necessary, install required packages (use pip in the terminal)
# pip install matplotlib scikit-learn pandas
# pip install pandas-profiling

# 2. Load the training and test datasets from the local ./dataset folder
df_train = pd.read_csv('./dataset/train.csv')  # Change this to the correct local path
df_test = pd.read_csv('./dataset/test.csv')


In [None]:
# Summary of the training dataframe, followed by a preview of its first rows
df_train.info()
df_train.head()

In [None]:
# Count the missing (null) values in each column of the training data
missing_values = df_train.isnull().sum()
print(missing_values)


# Rellenar valores faltantes:
En Year, rellenamos los valores faltantes con la mediana de la columna.
En Publisher, rellenamos los valores faltantes con la categoría "Unknown".
Para Summary, eliminamos la columna, ya que no se utiliza en el modelo; si se quisiera incluirla, habría que aplicar técnicas de procesamiento de texto.

In [None]:
# Rellenar valores faltantes
def clean_data(df):
    """Return a cleaned copy of ``df`` and print the remaining missing values.

    Cleaning steps:
    - missing ``Year`` values are replaced by the column median;
    - missing ``Publisher`` values are replaced by the string ``'Unknown'``;
    - the ``Summary`` column is dropped when present.

    Each step is skipped when its column is absent, so the function can be
    applied to frames that only carry a subset of the columns.

    Parameters:
    - df (pd.DataFrame): Raw data. It is not modified.

    Returns:
    - pd.DataFrame: A new, cleaned DataFrame.
    """
    # Work on a copy: assigning the filled columns directly would write them
    # back into the caller's DataFrame as a hidden side effect.
    df = df.copy()

    if 'Year' in df.columns:
        df['Year'] = df['Year'].fillna(df['Year'].median())

    if 'Publisher' in df.columns:
        df['Publisher'] = df['Publisher'].fillna('Unknown')

    # errors='ignore' makes the drop a no-op when there is no 'Summary' column.
    df = df.drop(columns=['Summary'], errors='ignore')

    # Report what is still missing so the caller can spot unhandled columns.
    missing_values = df.isnull().sum()
    print("Missing values after cleaning:")
    print(missing_values)

    return df


# Convertir variables categóricas:

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df, is_train=True):
    """Clean ``df`` and turn it into numeric model features.

    Steps:
    1. Run ``clean_data`` on a copy of ``df`` so the caller's frame is untouched.
    2. Convert the sales/review/wishlist columns that are present to integers,
       expanding a ``K`` suffix to thousands (``'1.2K'`` -> ``1200``).
    3. Label-encode the categorical columns that are present.
    4. Drop the ``id`` and ``Rating`` columns from the feature matrix.

    NOTE: each categorical column is encoded with a LabelEncoder fitted on the
    frame passed to this call only, so the same category may receive a
    different integer code when train and test data are preprocessed in
    separate calls.

    Parameters:
    - df (pd.DataFrame): Raw data.
    - is_train (bool): When True, also return the ``Rating`` target.

    Returns:
    - ``(X, y)`` when ``is_train`` is True, where ``y`` is the ``Rating``
      column or None if that column is absent; otherwise just ``X``.
    """
    # Copy first so neither the cleaning nor the column rewrites below can
    # leak into the caller's DataFrame.
    df = clean_data(df.copy())

    def convert_to_numeric(value):
        """Convert values such as '1.2K' or '350' to an int."""
        value_str = str(value)
        if 'K' in value_str:
            # round() instead of int(): binary floating point can put the
            # product just below the intended integer (1.005 * 1000 is
            # 1004.9999999999999), and int() would truncate it to 1004.
            return int(round(float(value_str.replace('K', '')) * 1000))
        return int(float(value_str))

    # Numeric-looking columns that may be stored as strings with a 'K' suffix
    columns_to_convert = [
        'Europe', 'Japan', 'Rest of World', 'North America',
        'Global', 'Number of Reviews', 'Wishlist',
    ]
    for column in columns_to_convert:
        if column in df.columns:
            df[column] = df[column].apply(convert_to_numeric)

    # Encode categorical columns as integer codes (one fresh encoder each)
    categorical_columns = ['Game Title', 'Publisher', 'Platform', 'Genre']
    for column in categorical_columns:
        if column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column])

    # The feature matrix is the same for train and test: everything except
    # the identifier and the target.
    X = df.drop(columns=['id', 'Rating'], errors='ignore')
    if is_train:
        y = df['Rating'] if 'Rating' in df.columns else None
        return X, y
    return X

# Ejemplo de uso:
# X_train, y_train = preprocess_data(df_train, is_train=True)
# X_test = preprocess_data(df_test, is_train=False)


# Separar los datos y buscar la mejor profundidad del árbol (validación cruzada)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
import numpy as np

def find_best_depth(X, Y, test_size=0.33, max_depth=20, cv=5, random_state=42):
    """
    Finds the best max_depth for a DecisionTreeRegressor using cross-validation.

    The data is first split with train_test_split; only the training part is
    used for the cross-validation, the held-out part is left untouched.

    Parameters:
    - X (pd.DataFrame): Features.
    - Y (pd.Series): Target variable.
    - test_size (float): Proportion of data to hold out (not used for the search).
    - max_depth (int): Maximum depth to evaluate; depths 1..max_depth are tried.
    - cv (int): Number of cross-validation folds.
    - random_state (int): Random seed for reproducibility.

    Returns:
    - best_depth (int): The depth that gives the lowest average MSE.
    - best_score (float): The corresponding mean squared error for the best depth.

    Raises:
    - ValueError: If max_depth is smaller than 1 (there would be nothing to evaluate).
    """
    if max_depth < 1:
        raise ValueError(f"max_depth must be at least 1, got {max_depth}")

    # Only the training portion takes part in the depth search.
    X_train, _, Y_train, _ = train_test_split(X, Y, test_size=test_size, random_state=random_state)

    # Define range of depths to evaluate
    depth_range = range(1, max_depth + 1)
    mean_scores = []

    # Perform cross-validation for each depth
    for depth in depth_range:
        model = DecisionTreeRegressor(max_depth=depth, criterion='squared_error', random_state=random_state)
        scores = cross_val_score(model, X_train, Y_train, cv=cv, scoring='neg_mean_squared_error')
        # The scorer returns the negated MSE per fold; flip the sign to get
        # the mean squared error averaged over the folds.
        mean_scores.append(-np.mean(scores))

    # Determine the best depth based on the lowest mean MSE
    best_index = int(np.argmin(mean_scores))
    best_depth = depth_range[best_index]
    best_score = mean_scores[best_index]

    print(f"Best depth: {best_depth}")
    print(f"Average Mean Squared Error with cross-validation: {best_score:.4f}")

    return best_depth, best_score

# Example usage:
# best_depth, best_score = find_best_depth(X, Y)





# Hacer arbol y evaluarlo:

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split


def train_and_evaluate_decision_tree(X, Y, best_depth, test_size=0.33, random_state=42):
    """
    Trains a DecisionTreeRegressor, reports its hold-out MSE and plots the tree.

    Parameters:
    - X (pd.DataFrame): Features; its column names label the plotted nodes.
    - Y (pd.Series): Target variable.
    - best_depth (int): max_depth used for the tree.
    - test_size (float): Proportion of data held out for evaluation.
    - random_state (int): Random seed for the split and the tree.

    Returns:
    - model (DecisionTreeRegressor): The fitted tree.
    - mse (float): Mean squared error on the held-out split.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)

    model = DecisionTreeRegressor(max_depth=best_depth, criterion='squared_error', random_state=random_state)
    model.fit(X_train, Y_train)

    # Evaluate on the rows that were not used for fitting
    mse = mean_squared_error(Y_test, model.predict(X_test))
    print(f"Mean Squared Error (MSE) on test set: {mse:.4f}")

    # Plot the decision tree
    plt.figure(figsize=(15, 10))
    # Pass a plain list of names: some scikit-learn releases (e.g. 1.3.0)
    # validate feature_names as a list and reject a pandas Index.
    tree.plot_tree(model, filled=True, feature_names=list(X.columns), fontsize=10)
    plt.title(f"Decision Tree (Depth = {best_depth})")
    plt.show()

    return model, mse





Correr todas las funciones

In [None]:
def run_decision_tree_workflow(df_train, df_test):
    """
    Runs the training workflow: preprocess, pick the tree depth, train and evaluate.

    Parameters:
    - df_train (pd.DataFrame): Raw training data, including the target column.
    - df_test (pd.DataFrame): Raw test data. Kept for interface compatibility;
      it is not needed for training, so it is no longer preprocessed here
      (the result of that preprocessing was never used).

    Returns:
    - dict with keys "best_depth", "best_score" (cross-validated MSE),
      "model" (the fitted tree) and "mse" (hold-out MSE).
    """
    # Preprocess the training data into features and target
    X_train, Y_train = preprocess_data(df_train, is_train=True)

    # Find the best tree depth by cross-validation
    best_depth, best_score = find_best_depth(X_train, Y_train)

    # Train and evaluate the model with that depth
    best_model, mse = train_and_evaluate_decision_tree(X_train, Y_train, best_depth)

    return {
        "best_depth": best_depth,
        "best_score": best_score,
        "model": best_model,
        "mse": mse,
    }

Evaluar test

In [None]:
def run_model_on_test(best_model, df_test):
    """Preprocess the test data like the training data and return the model's predictions."""
    features = preprocess_data(df_test, is_train=False)
    return best_model.predict(features)

# Run the full training workflow, then predict on the test set
workflow_results = run_decision_tree_workflow(df_train, df_test)
best_model = workflow_results["model"]  # Pull the fitted model out of the workflow results
df_results = run_model_on_test(best_model, df_test)

# Pair each test-set id with its prediction
submission = pd.DataFrame({
    'id': df_test['id'],  # Use df_test to get the identifiers
    'Prediction': df_results
})

# Save the DataFrame as a CSV file
submission.to_csv('submission#2.csv', index=False)

In [None]:
# Crear un DataFrame para las predicciones

#evaluar nuestro modelo con test


In [None]:
from joblib import dump, load
# Persist the trained model to 'tree.joblib'
dump(best_model, 'tree.joblib')


Importamos joblib y guardamos el modelo entrenado en un archivo

In [None]:
from joblib import dump, load
# Save the model to a file
dump(best_model, 'tree.joblib')

# To reuse the saved model later:
# model = load('tree.joblib')
# model.predict(X_test)