performance metrics of code with snippets without splitting

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Function to calculate MAPE (Mean Absolute Percentage Error)
def safe_mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    nonzero_indices = y_true != 0  # Exclude zero values from calculation
    return np.mean(np.abs((y_true[nonzero_indices] - y_pred[nonzero_indices]) / y_true[nonzero_indices])) * 100

# Load the dataset
data = pd.read_excel("no_splitting.xlsx")

# Features and target
X = data.drop(columns=["Final_Marks"])
y = data["Final_Marks"]

# Perform PCA to reduce features to size 20
pca = PCA(n_components=20, random_state=42)
X_pca = pca.fit_transform(X)

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, stratify=y, random_state=42
)

# Models to evaluate
models = {
    "CatBoost": CatBoostRegressor(
        verbose=0
    ),
    "XGBoost": XGBRegressor(
    ),
    "RandomForest": RandomForestRegressor(
    ),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "LinearRegression": LinearRegression()
}

# Metrics dictionary
metrics = {}

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Metrics for train and test
    metrics[name] = {
        "Train MAE": mean_absolute_error(y_train, y_train_pred),
        "Train MSE": mean_squared_error(y_train, y_train_pred),
        "Train RMSE": mean_squared_error(y_train, y_train_pred, squared=False),
        "Train MAPE": safe_mape(y_train, y_train_pred),
        "Train R2": r2_score(y_train, y_train_pred),
        "Test MAE": mean_absolute_error(y_test, y_test_pred),
        "Test MSE": mean_squared_error(y_test, y_test_pred),
        "Test RMSE": mean_squared_error(y_test, y_test_pred, squared=False),
        "Test MAPE": safe_mape(y_test, y_test_pred),
        "Test R2": r2_score(y_test, y_test_pred)
    }

# Display the results
results = pd.DataFrame(metrics).T
print(results)




                  Train MAE  Train MSE  Train RMSE  Train MAPE  Train R2  \
CatBoost           0.564901   0.525085    0.724628   12.940167  0.910745   
XGBoost            0.162230   0.099569    0.315546    3.599425  0.983075   
RandomForest       0.556822   0.537777    0.733333   15.559471  0.908588   
DecisionTree       0.035195   0.068359    0.261455    0.647092  0.988380   
LinearRegression   1.841958   5.293779    2.300821   57.566885  0.100158   

                  Test MAE  Test MSE  Test RMSE  Test MAPE   Test R2  
CatBoost          1.372641  3.107095   1.762695  37.555771  0.468329  
XGBoost           1.427061  3.656514   1.912201  39.208314  0.374316  
RandomForest      1.448420  3.380490   1.838611  44.083124  0.421547  
DecisionTree      1.738192  5.403284   2.324496  43.652240  0.075417  
LinearRegression  1.953587  6.078405   2.465442  61.207059 -0.040106  


