# Regresión (árbol de decisión)

In [80]:
# Cargar y explorar los datos:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score

# If necessary, install required packages (use pip in the terminal)
# pip install matplotlib scikit-learn pandas
# pip install pandas-profiling

# 2. Load the training and test sets (adjust the paths to your local dataset folder)
TRAIN_CSV_PATH = './dataset/train.csv'
TEST_CSV_PATH = './dataset/test.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_test = pd.read_csv(TEST_CSV_PATH)


## Rellenar valores faltantes:

In [81]:
# Rellenar valores faltantes
def clean_data(df):
    """Fill missing values, drop the free-text column and report what is left.

    The input frame is copied first, so the caller's DataFrame is never
    modified; always use the returned frame.

    Args:
        df: DataFrame that may contain 'Year', 'Publisher' and 'Summary'
            columns. Columns that are absent are simply skipped.

    Returns:
        A new DataFrame with 'Year' gaps filled with the column median,
        'Publisher' gaps filled with 'Unknown', and 'Summary' removed.
    """
    # Copy up front: column assignment below would otherwise write into the
    # caller's frame whenever no column ends up being dropped.
    df = df.copy()

    # Fill missing 'Year' values with the median of the column
    if 'Year' in df.columns:
        df['Year'] = df['Year'].fillna(df['Year'].median())

    # Fill missing 'Publisher' values with a sentinel category
    if 'Publisher' in df.columns:
        df['Publisher'] = df['Publisher'].fillna('Unknown')

    # Drop the 'Summary' column if it exists
    if 'Summary' in df.columns:
        df = df.drop(columns=['Summary'])

    # Report the per-column count of values that are still missing
    missing_values = df.isnull().sum()
    print("Missing values after cleaning:")
    print(missing_values)

    return df

# Preprocesamiento de Datos

In [82]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df, is_train=True):
    """Clean a raw frame and convert its columns into numeric model inputs.

    Steps, in order: run ``clean_data``, turn abbreviated counts such as
    ``'1.5K'`` into integers, and label-encode the categorical text columns.
    Columns that are not present in ``df`` are skipped.

    Args:
        df: DataFrame to preprocess. It is copied, not modified.
        is_train: Currently unused; kept so existing callers keep working.

    Returns:
        A new DataFrame with the conversions applied.
    """
    # Copy so that neither clean_data nor the assignments below can write
    # into the caller's frame.
    df = clean_data(df.copy())

    def convert_to_numeric(value):
        """Convert values such as '1.5K', ' 2k ' or 300 to an integer."""
        value_str = str(value).strip().upper()
        if 'K' in value_str:
            # 'K' is a thousands suffix: '1.5K' -> 1500
            return int(float(value_str.replace('K', '')) * 1000)
        return int(float(value_str))

    # Apply conversion to the numeric columns that are present
    columns_to_convert = [
        'Europe', 'Japan', 'Rest of World', 'North America',
        'Global', 'Number of Reviews', 'Wishlist'
    ]
    for column in columns_to_convert:
        if column in df.columns:
            df[column] = df[column].apply(convert_to_numeric)

    # Label-encode the categorical columns that are present.
    # NOTE(review): every call fits fresh encoders on the frame it receives,
    # so the integer codes are only comparable within one call. Frames that
    # are preprocessed separately (e.g. train vs. test) may map the same
    # label to different codes - confirm this is acceptable for the model.
    categorical_columns = ['Game Title', 'Publisher', 'Platform', 'Genre']
    for column in categorical_columns:
        if column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column])

    # Show a preview of the processed frame
    print(df.head())
    return df

# Particionar los datos

In [83]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size=(1.0 / 3), random_state=117, return_split=False):
    """Preprocess ``df`` and separate the features from the 'Rating' target.

    The original implementation computed a train/test split and then threw
    it away; the split is now only computed when it is actually requested.

    Args:
        df: DataFrame containing a 'Rating' column; it is passed through
            ``preprocess_data`` first.
        test_size: Fraction of rows held out when ``return_split`` is True.
        random_state: Seed for the shuffle when ``return_split`` is True.
        return_split: When False (default) return ``(X, y)`` as before.
            When True return ``X_train, X_test, y_train, y_test``.

    Returns:
        ``(X, y)`` by default, or the four-way split from
        ``train_test_split`` when ``return_split`` is True.
    """
    df = preprocess_data(df)

    # Every column except the target is used as a feature
    X = df.drop(columns=['Rating'])
    y = df['Rating']

    if not return_split:
        return X, y

    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state)

# Entrenar

In [84]:

# Implemente un arbol de regresion y entrenelo
from sklearn.tree import DecisionTreeRegressor

def train(X_train, X_test, y_train, y_test, random_state=117):
    """Fit a decision-tree regressor and print its error on the test split.

    Args:
        X_train: Training features.
        X_test: Held-out features used for the predictions.
        y_train: Training targets.
        y_test: Held-out targets used to score the predictions.
        random_state: Seed for the tree. Previously read from an undefined
            global, which raised NameError on a fresh kernel.

    Returns:
        The predictions for ``X_test``.
    """
    from sklearn.metrics import mean_squared_error, r2_score

    # Build and fit the regression tree on the training data
    arbol = DecisionTreeRegressor(random_state=random_state)
    arbol.fit(X_train, y_train)

    # Predict on the held-out data
    y_pred = arbol.predict(X_test)

    # Score the predictions
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Error cuadrático medio: {mse}")
    print(f"R^2: {r2}")
    return y_pred



# Evaluación

In [85]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def eval(X_train, X_test, y_train, y_test, model=None, random_state=117):
    """Print MAE, MSE and RMSE of a regressor on the test split.

    Note: this function's name shadows Python's built-in ``eval``; the name
    is kept so existing calls keep working.

    Args:
        X_train: Training features, used to fit a tree when ``model`` is None.
        X_test: Held-out features to predict on.
        y_train: Training targets, used to fit a tree when ``model`` is None.
        y_test: Held-out targets to score against.
        model: Optional already-fitted estimator exposing ``predict``. The
            original code read an undefined global ``arbol`` here, which
            raised NameError.
        random_state: Seed for the tree fitted when ``model`` is None.
    """
    if model is None:
        # No fitted model supplied: fit a decision tree on the training split
        model = DecisionTreeRegressor(random_state=random_state)
        model.fit(X_train, y_train)

    # Predict on the held-out data
    y_pred = model.predict(X_test)

    # Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_test, y_pred)

    # Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, y_pred)

    # Root Mean Squared Error (RMSE), derived from the MSE because the
    # `squared=False` keyword is deprecated in recent scikit-learn releases.
    rmse = mse ** 0.5

    # Print the metrics
    print(f"Error Absoluto Medio (MAE): {mae}")
    print(f"Error Cuadrático Medio (MSE): {mse}")
    print(f"Raíz del Error Cuadrático Medio (RMSE): {rmse}")

# K-Fold Cross Validation

In [86]:
# Elija un sesgo inductivo y realice la tecnica de model selection Cross validation
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

def kfold(X, y, folds=5, random_state=117):
    """Cross-validate a default decision-tree regressor and summarise its MSE.

    Args:
        X: Feature matrix.
        y: Target values.
        folds: Number of cross-validation folds (default 5, as before).
        random_state: Seed for the tree. Previously read from an undefined
            global, which raised NameError on a fresh kernel.

    Returns:
        ``(mean_mse, std_mse)`` across the folds.
    """
    # Model under evaluation
    arbol = DecisionTreeRegressor(random_state=random_state)

    # scikit-learn reports the negated MSE so that higher is always better
    scores = cross_val_score(arbol, X, y, cv=folds, scoring='neg_mean_squared_error')

    # Flip the sign back to a positive MSE before summarising
    mse_scores = -scores
    mean_mse = mse_scores.mean()
    std_mse = mse_scores.std()

    print(f"Mean MSE: {mean_mse}")
    print(f"Standard Deviation of MSE: {std_mse}")
    return mean_mse, std_mse

# Creo el árbol de decisión para regresión

In [87]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Definir un rango de hiperparámetros a probar (ejemplo de sesgos inductivos)
def DecTrees(X, y, random_state=117, cv=5):
    """Grid-search tree depth and leaf size with cross-validation.

    Every (max_depth, min_samples_leaf) pair is scored by its mean
    cross-validated MSE; the first pair reaching the lowest score wins.

    Args:
        X: Feature matrix.
        y: Target values.
        random_state: Seed for every tree. Previously read from an undefined
            global, which raised NameError on a fresh kernel.
        cv: Number of cross-validation folds (default 5, as before).

    Returns:
        ``(best_params, best_score)`` where ``best_params`` is a dict with
        'max_depth' and 'min_samples_leaf' and ``best_score`` is the
        corresponding mean MSE.
    """
    max_depths = [3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]  # Tree depths to try
    min_samples_leaf = [1, 5, 10]  # Minimum number of samples per leaf

    best_score = float('inf')
    best_params = {}

    # Cross-validate every hyperparameter combination
    for max_depth in max_depths:
        for min_leaf in min_samples_leaf:
            arbol = DecisionTreeRegressor(
                max_depth=max_depth,
                min_samples_leaf=min_leaf,
                random_state=random_state)

            scores = cross_val_score(arbol, X, y, cv=cv, scoring='neg_mean_squared_error')
            mean_mse = -scores.mean()  # Scores are negated MSE; flip the sign

            print(f"Max depth: {max_depth}, Min samples leaf: {min_leaf}, Mean MSE: {mean_mse:.4f}")

            # Strict '<' keeps the earliest combination on ties
            if mean_mse < best_score:
                best_score = mean_mse
                best_params = {'max_depth': max_depth, 'min_samples_leaf': min_leaf}

    # Report the winning combination
    print(f"Mejores hiperparámetros: {best_params}, con un MSE promedio de {best_score:.4f}")
    return best_params, best_score


# Train & evaluate the best tree

In [88]:
def train_and_evaluate_decision_tree(X, Y, best_depth, test_size=0.33, random_state=42):
    """Fit a depth-limited regression tree, score it and plot its structure.

    Args:
        X: Feature DataFrame; its column names label the plotted nodes.
        Y: Target values.
        best_depth: Value passed as ``max_depth`` to the tree.
        test_size: Fraction of rows held out for scoring.
        random_state: Seed used for both the split and the tree.

    Returns:
        ``(model, mse)``: the fitted tree and its mean squared error on the
        held-out rows.
    """
    # Imported locally because no earlier cell brings these names into scope
    # (the original referenced an unimported `tree` module).
    from sklearn.metrics import mean_squared_error
    from sklearn.tree import plot_tree

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    model = DecisionTreeRegressor(
        max_depth=best_depth, criterion='squared_error', random_state=random_state)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # The original returned `mse` without ever computing it
    mse = mean_squared_error(Y_test, Y_pred)

    # Plot the decision tree; feature names are passed as a plain list
    # because some scikit-learn releases reject a pandas Index here.
    plt.figure(figsize=(15, 10))
    plot_tree(model, filled=True, feature_names=list(X.columns), fontsize=10)
    plt.show()

    return model, mse

# Call Functions to make best model

In [93]:
def run_model_on_test(best_model, df_test):
    """Preprocess the test frame and predict with an already-fitted model.

    Args:
        best_model: Fitted estimator exposing ``predict``.
        df_test: Raw test DataFrame. A copy is preprocessed, so the caller's
            frame keeps its original columns and values.

    Returns:
        The predictions returned by ``best_model.predict``.
    """
    # Apply the same preprocessing steps that were used for training
    X_test = preprocess_data(df_test.copy())

    # Estimators fitted on a DataFrame record their training columns in
    # `feature_names_in_`. When all of them are available, select exactly
    # those columns in the same order so extra test-only columns do not
    # break `predict`. Otherwise fall back to the frame as-is.
    trained_features = getattr(best_model, 'feature_names_in_', None)
    if trained_features is not None and set(trained_features) <= set(X_test.columns):
        X_test = X_test[list(trained_features)]

    # Predict with the trained model
    predictions = best_model.predict(X_test)

    return predictions

# Ejecuto el modelo

In [None]:
# Seed shared by the models in this notebook. It must exist at module level:
# this cell (and the helper functions that read `random_state` as a global)
# raised NameError without it.
random_state = 117

# split_data() already runs preprocess_data(), so the raw training frame is
# passed straight in instead of being preprocessed twice.
X, y = split_data(df_train)

# Pick the hyperparameters with the lowest cross-validated MSE
best_params, best_score = DecTrees(X, y)

# Refit on all of the training data with the selected hyperparameters
model = DecisionTreeRegressor(
    max_depth=best_params['max_depth'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=random_state)
model.fit(X, y)

# Predict on the test set
predictions = run_model_on_test(model, df_test)

submission = pd.DataFrame({
    'id': df_test['id'],  # Identifiers come from the test frame
    'Prediction': predictions
})

# Save the submission as a CSV file
submission.to_csv('RegResults.csv', index=False)
