<a href="https://colab.research.google.com/github/rfdornelles/mds_ML_project/blob/main/baseline_xgboost_karon2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/rfdornelles/mds_ML_project/main/data/karon2.csv")

In [2]:
df.head()

Unnamed: 0,year,country,temp,population,qnt_death_heat_cold_exposure
0,1990,Albania,6.16894,3286542,5
1,1991,Albania,6.219891,3266790,5
2,1992,Albania,6.28493,3247039,5
3,1993,Albania,6.324316,3227287,5
4,1994,Albania,6.357706,3207536,5


In [3]:
X = df.drop(["qnt_death_heat_cold_exposure", "country"], axis = 1).to_numpy()
y = df["qnt_death_heat_cold_exposure"].to_numpy()

In [4]:
#https://www.kaggle.com/code/stuarthallows/using-xgboost-with-scikit-learn/notebook 

In [10]:
import numpy as np

from scipy.stats import uniform, randint

from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import r2_score

import xgboost as xgb

In [11]:
def display_scores(scores):
    print("Scores: {0}\nMean: {1:.3f}\nStd: {2:.3f}".format(scores, np.mean(scores), np.std(scores)))

In [12]:
def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [14]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

xgb_model.fit(X, y)

y_pred = xgb_model.predict(X)

mse=mean_squared_error(y, y_pred)

print(np.sqrt(mse))
print("R2:", r2_score(y, y_pred))

71.07708683265109
R2: 0.9844827964118767


In [15]:
xgb_model

XGBRegressor(objective='reg:squarederror', random_state=42)

In [16]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)

scores = []

In [18]:
for train_index, test_index in kfold.split(X):   
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    xgb_model = xgb.XGBRegressor(objective="reg:squarederror")
    xgb_model.fit(X_train, y_train)
    
    y_pred = xgb_model.predict(X_test)
    
    scores.append(mean_squared_error(y_test, y_pred))
    
display_scores(np.sqrt(scores))
print("R2:", r2_score(y_test, y_pred))

Scores: [101.89116211 109.56929809 136.12066374 169.74672316  83.29322645
 101.89116211 109.56929809 136.12066374 169.74672316  83.29322645]
Mean: 120.124
Std: 30.062
R2: 0.9756626555428621


In [20]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

scores = cross_val_score(xgb_model, X, y, scoring="neg_mean_squared_error", cv=5)

display_scores(np.sqrt(-scores))

Scores: [ 250.63154304 1023.71392877  159.88790814   96.1534827  1217.97538258]
Mean: 549.672
Std: 472.944


In [24]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

search.fit(X, y)

report_best_scores(search.cv_results_, 1)


Fitting 3 folds for each of 200 candidates, totalling 600 fits
Model with rank: 1
Mean validation score: -0.923 (std: 1.425)
Parameters: {'colsample_bytree': 0.9040922615763338, 'gamma': 0.2252496259847715, 'learning_rate': 0.033979488347959955, 'max_depth': 2, 'n_estimators': 113, 'subsample': 0.9233589392465844}



In [26]:
mse = mean_squared_error(y_test, search.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 28699.3713


In [25]:
#print("coef:", pipe['model'].coef_)
#print("intercept:", pipe['model'].intercept_)
print("R2:", r2_score(y_test, search.predict(X_test)))

R2: 0.8993239423488908


In [30]:
best_params = {'colsample_bytree': 0.9040922615763338, 'gamma': 0.2252496259847715, 'learning_rate': 0.033979488347959955, 'max_depth': 2, 'n_estimators': 113, 'subsample': 0.9233589392465844}

xgb_model = xgb.XGBRegressor(objective="reg:squarederror", 
                             random_state=42, 
                             params = best_params)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

mse=mean_squared_error(y_test, y_pred)

print(np.sqrt(mse))
print("R2:", r2_score(y_test, y_pred))



83.29322645118359
R2: 0.9756626555428621
