# 1. Model evaluation 

In [1]:
# import necessary libraries 
from sklearn.metrics import mean_squared_error,  mean_absolute_error

In [None]:
# Regression metrics for the test-set predictions (`y_test` and `y_pred` are
# defined in earlier cells). Values are rounded with the built-in `round`,
# which works whether a metric comes back as a NumPy scalar or a plain float.
from sklearn.metrics import mean_absolute_percentage_error, r2_score

# 1/ Mean Absolute Error (MAE):
# Average absolute difference between true and predicted values.
mae = round(mean_absolute_error(y_test, y_pred), 2)
print(f'Mean Absolute Error (MAE): {mae}')

# 2/ Mean Squared Error (MSE):
# Keep the unrounded value so the RMSE below is not computed from a rounded MSE.
mse_raw = mean_squared_error(y_test, y_pred)
mse = round(mse_raw, 2)
print(f'Mean Squared Error (MSE): {mse}')

# 3/ Root Mean Squared Error (RMSE):
# Taken as the square root of the MSE instead of passing `squared=False`,
# which newer scikit-learn releases no longer accept.
# NOTE: equivalent for a single target; with several targets the averaging differs.
rmse = round(mse_raw ** 0.5, 2)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# 4/ R-squared (Coefficient of Determination):
# Measures how well the variance in the target variable is explained by the model.
# The best possible value is 1; 0 means the model is no better than always
# predicting the mean, and the value is negative when the model is worse than that.
r2 = round(r2_score(y_test, y_pred), 2)
print(f'R-squared: {r2}')


# 5/ Mean Absolute Percentage Error (MAPE):
# Measures the average relative error between true and predicted values.
# scikit-learn returns it as a fraction, hence the `* 100` for a percentage.
# NOTE(review): a huge value (e.g. ~1e+17) usually means `y_test` contains zeros or
# values very close to zero, which makes the relative error explode - confirm.
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'Mean Absolute Percentage Error (MAPE): {mape * 100}%')
print('mape before percentage:', mape)  # raw fraction, printed to inspect the value before scaling

# 2. Error analysis

In [4]:
# import necessary libraries
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [None]:
# 1/ Feature Importance:
# Shows which features the model relies on most, using the model's built-in
# `get_feature_importance` method (`model` is fitted in an earlier cell).
importance_table = model.get_feature_importance(prettified=True)
importance_table

In [None]:
# 2/ Residual Analysis:
# Scatter the residuals (errors) against the predictions; visible structure can
# indicate non-linearity, heteroscedasticity, or outliers.

# Positive residual = the model under-predicted, negative = it over-predicted.
residuals = y_test - y_pred

plt.scatter(x=y_pred, y=residuals)
# Dashed zero line: a perfect prediction would sit exactly on it.
plt.axhline(0, linestyle='--', color='r')

# Optional zoom, kept disabled:
#plt.xlim(left=-200,right= 2000)
#plt.ylim(bottom=-500, top= 500)

plt.title('Residual Plot')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.show()

In [None]:
# 3/ Error Distribution Analysis:
# A histogram of the residuals shows whether the model consistently
# under-predicts or over-predicts (`residuals` comes from the residual-analysis cell).
plt.hist(x=residuals, bins=30)
plt.title('Error Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()

In [None]:
# 4/ Identify Outliers and High-Error Instances:
# Check for instances with unusually high errors, which may be outliers or data
# quality issues. Understanding these can help in refining the model.

# Rank by absolute residual so that large over-predictions (very negative
# residuals) are found as well as large under-predictions.
high_error_indices = residuals.abs().nlargest(10).index
print(high_error_indices)

# Look the rows up in the full DataFrame.
# NOTE(review): assumes `residuals` still carries the index labels of `df` - confirm.
high_error_rows = df.loc[high_error_indices]
print(high_error_rows)

In [None]:
# 5/ Cross-Validation:
# Score the model on 10 different train/validation splits to check that its
# performance is not an artefact of one particular split.

# The scorer returns negated RMSE values (one per fold).
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=10,
)

# Flip the sign to get ordinary, positive RMSE values.
rmse_scores = -scores

print(f'Cross-Validation RMSE Scores: {rmse_scores}')

# One point per fold, numbered from 1.
fold_numbers = range(1, len(rmse_scores) + 1)

plt.figure(figsize=(10, 6))
plt.plot(fold_numbers, rmse_scores, color='b', linestyle='-', marker='o')
plt.xticks(fold_numbers)
plt.xlabel('Fold Number')
plt.ylabel('RMSE')
plt.title('Cross-Validation RMSE Scores')
plt.grid(True)
plt.show()


In [None]:
'''
**! I am not sure whether the following error analysis also works with XGBoost !**
Partial Dependence Plots have some limitations, including: 

1. They only show the relationship between a single feature and the model's predictions, which may not capture complex interactions between features.

 2. They require manual sorting or selection of interesting plots, which can be time-consuming and subjective.
 '''
# 6/ Partial dependence plots (PDP)
# PDPs help visualize the relationship between the target and specific features, helping you understand the model’s behavior.
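# NOTE(review): CatBoost appears to expose PDPs as a model method
# (model.plot_partial_dependence(data, features)) rather than as a top-level
# import, so the import below may fail if uncommented - confirm against the CatBoost docs.
#from catboost import plot_partial_dependence
#column_indices = {col: idx for idx, col in enumerate(df_model.columns)}
#print(column_indices)

#plot_partial_dependence(model, X, features=[0, 1])  # Replace with relevant feature indices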