In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the training and test datasets from CSV files
df_train = pd.read_csv('./dataset/train.csv')  # Change to the correct path
df_test = pd.read_csv('./dataset/test.csv')    # Change to the correct path

# Data cleaning
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Cleaning steps:
      * missing ``Year`` values are replaced with the median of the column;
      * missing ``Publisher`` values are replaced with the string ``'Unknown'``;
      * the ``Summary`` column is removed when it is present.

    Args:
        df: DataFrame containing at least the ``Year`` and ``Publisher``
            columns.

    Returns:
        A new DataFrame with the steps above applied.

    Raises:
        KeyError: if ``Year`` or ``Publisher`` is not a column of ``df``.
    """
    # Copy first: assigning columns directly on ``df`` would silently modify
    # the frame owned by the caller.
    cleaned = df.copy()
    cleaned['Year'] = cleaned['Year'].fillna(cleaned['Year'].median())
    cleaned['Publisher'] = cleaned['Publisher'].fillna('Unknown')
    # errors='ignore' makes the drop a no-op when 'Summary' is absent.
    return cleaned.drop(columns=['Summary'], errors='ignore')

# Preprocesamiento
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df):
    """Clean ``df`` and turn its textual columns into numeric ones.

    Steps, in order:
      1. ``clean_data`` is applied.
      2. Each count-like column that is present ('Europe', 'Japan',
         'Rest of World', 'North America', 'Global', 'Number of Reviews',
         'Wishlist') is parsed to ``int``; a ``'K'`` in the value marks
         thousands, e.g. ``'1.5K'`` becomes ``1500``.
      3. Each categorical column that is present ('Game Title', 'Publisher',
         'Platform', 'Genre') is replaced by integer codes from a
         ``LabelEncoder``.

    The caller's DataFrame is never modified; a new frame is returned.

    NOTE: the encoders are fitted on the frame passed in, so the integer code
    assigned to a given category is only meaningful within that frame. Two
    frames preprocessed separately may assign different codes to the same
    category.

    Args:
        df: raw DataFrame; must contain the columns ``clean_data`` requires.

    Returns:
        A new, preprocessed DataFrame.

    Raises:
        ValueError: if a value in a count-like column cannot be parsed as a
            number (this includes missing values).
    """
    # Copy before cleaning so that the column assignments below can never
    # write into the frame owned by the caller.
    df = clean_data(df.copy())

    def convert_to_numeric(value):
        """Parse ``value`` to int, treating a 'K' marker as a factor of 1000."""
        value_str = str(value)
        if 'K' in value_str:
            return int(float(value_str.replace('K', '')) * 1000)
        return int(float(value_str))

    columns_to_convert = [
        'Europe', 'Japan', 'Rest of World', 'North America',
        'Global', 'Number of Reviews', 'Wishlist',
    ]
    for column in columns_to_convert:
        if column in df.columns:
            df[column] = df[column].apply(convert_to_numeric)

    categorical_columns = ['Game Title', 'Publisher', 'Platform', 'Genre']
    for column in categorical_columns:
        if column in df.columns:
            # A fresh encoder per column keeps each column's fitted state
            # independent instead of re-fitting one shared instance.
            df[column] = LabelEncoder().fit_transform(df[column])

    return df

# Split into training and validation sets
def split_data(df, target='Rating', test_size=0.33, random_state=42):
    """Preprocess ``df`` and split it into train and held-out subsets.

    Args:
        df: raw DataFrame that includes the ``target`` column.
        target: name of the column to predict. Defaults to ``'Rating'``.
        test_size: value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to ``0.33``.
        random_state: seed forwarded to ``train_test_split``. Defaults to
            ``42``.

    Returns:
        The result of ``train_test_split(X, y, ...)``, i.e.
        ``X_train, X_test, y_train, y_test``, where ``X`` is every
        preprocessed column except ``target`` and ``y`` is ``target``.

    Raises:
        KeyError: if ``target`` is not a column of the preprocessed frame.
    """
    df = preprocess_data(df)
    X = df.drop(columns=[target])
    y = df[target]
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )


# Start a Weights & Biases run named "LinearRegression" in the
# "OBL-MACHINE-LEARNING-2024" project. This executes at import/cell-run time;
# the wandb.log call in train_and_evaluate is made after this run is started.
import wandb
wandb.init(
    project="OBL-MACHINE-LEARNING-2024",
    name="LinearRegression"
)

# Model training and evaluation
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a linear regression on the training split and score it on the other.

    The MSE, RMSE, MAE and R² of the predictions for ``X_test`` are printed
    and then sent to wandb through a single ``wandb.log`` call.

    Args:
        X_train: features used to fit the model.
        X_test: features the fitted model predicts on.
        y_train: targets used to fit the model.
        y_test: targets the predictions are compared against.

    Returns:
        Tuple ``(model, y_pred)``: the fitted ``LinearRegression`` and its
        predictions for ``X_test``.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    held_out_pred = regressor.predict(X_test)

    # Collect the metrics once; the same dict feeds the report and wandb.
    squared_error = mean_squared_error(y_test, held_out_pred)
    metrics = {
        "mse": squared_error,
        "rmse": np.sqrt(squared_error),
        "mae": mean_absolute_error(y_test, held_out_pred),
        "r2": r2_score(y_test, held_out_pred),
    }

    report_lines = (
        "Evaluación del modelo:",
        f"MSE (Error Cuadrático Medio): {metrics['mse']:.4f}",
        f"RMSE (Raíz del Error Cuadrático Medio): {metrics['rmse']:.4f}",
        f"MAE (Error Absoluto Medio): {metrics['mae']:.4f}",
        f"R² (Coeficiente de Determinación): {metrics['r2']:.4f}",
    )
    for line in report_lines:
        print(line)

    wandb.log(metrics)

    return regressor, held_out_pred

# Cross-validation
def cross_validation(X, y):
    """Run 5-fold cross-validation of a linear regression and report its MSE.

    ``cross_val_score`` is called with ``scoring='neg_mean_squared_error'``,
    so the fold scores are negated to obtain the MSE of each fold.

    Args:
        X: feature matrix.
        y: target values.

    Returns:
        Tuple ``(mean_mse, std_mse)``: mean and standard deviation of the
        per-fold MSE values. Both are also printed.
    """
    negated_fold_scores = cross_val_score(
        LinearRegression(), X, y, cv=5, scoring='neg_mean_squared_error'
    )
    fold_mse = np.negative(negated_fold_scores)
    mean_mse, std_mse = fold_mse.mean(), fold_mse.std()

    for line in (
        "Validación cruzada:",
        f"Media del MSE: {mean_mse:.4f}",
        f"Desviación estándar del MSE: {std_mse:.4f}",
    ):
        print(line)

    return mean_mse, std_mse

# Prediction on the test set
def predict_on_test(model, df_test):
    """Preprocess ``df_test`` and return the model's predictions for it.

    Args:
        model: fitted estimator exposing ``predict``.
        df_test: DataFrame to predict on; it is passed through
            ``preprocess_data`` before prediction.

    Returns:
        Whatever ``model.predict`` returns for the preprocessed frame.
    """
    return model.predict(preprocess_data(df_test))

# Run the full pipeline: preprocess, split, train/evaluate, cross-validate,
# predict on the test file and write the submission CSV.
# NOTE: split_data calls preprocess_data itself, so df_train is preprocessed
# a second time by the next two lines.
df_train = preprocess_data(df_train)
X_train, X_test, y_train, y_test = split_data(df_train)
model, y_pred = train_and_evaluate(X_train, X_test, y_train, y_test)
# Cross-validate on the recombined train + held-out rows.
cross_validation(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))

# NOTE: predict_on_test also calls preprocess_data, so df_test is
# preprocessed twice as well.
# NOTE(review): preprocess_data fits its label encoders on whichever frame it
# receives, so the categorical codes in df_test are not guaranteed to match
# those the model was trained on — confirm this is acceptable.
df_test = preprocess_data(df_test)
predictions = predict_on_test(model, df_test)

# Save the results
submission = pd.DataFrame({
    'id': df_test['id'],  # Change to match the structure of the file
    'Prediction': predictions
})
submission.to_csv('LinearRegressionResults.csv', index=False)


wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: sv267108 (sv267108-universidad-ort-uruguay). Use `wandb login --relogin` to force relogin


Evaluación del modelo:
MSE (Error Cuadrático Medio): 0.1558
RMSE (Raíz del Error Cuadrático Medio): 0.3947
MAE (Error Absoluto Medio): 0.2676
R² (Coeficiente de Determinación): 0.3250
Validación cruzada:
Media del MSE: 0.1657
Desviación estándar del MSE: 0.0165
