In [20]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn as skl
from sklearn.datasets import load_diabetes

from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

import scipy.stats as stats

In [2]:
# Load the diabetes data as pandas objects; scaled=False is requested so the
# features are not the pre-scaled variant.
X, y = load_diabetes(
    scaled=False,
    as_frame=True,
    return_X_y=True,
)
X.head(n=5)

In [5]:
# Keep only age, sex, bmi, bp and s3 by discarding the other serum columns.
# errors='ignore' makes the cell idempotent: because X is reassigned from
# itself, a second run would otherwise raise KeyError on the already-removed
# columns.
X = X.drop(columns=['s1', 's2', 's4', 's5', 's6'], errors='ignore')
X.head()

Unnamed: 0,age,sex,bmi,bp,s3
0,59.0,2.0,32.1,101.0,38.0
1,48.0,1.0,21.6,87.0,70.0
2,72.0,2.0,30.5,93.0,41.0
3,24.0,1.0,25.3,84.0,40.0
4,50.0,1.0,23.0,101.0,52.0


In [6]:
# Preview the first five target values.
y.head(n=5)

0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: target, dtype: float64

In [7]:
# Put the features and the target side by side in one frame for exploration.
df_full = pd.concat((X, y), axis='columns')
df_full

Unnamed: 0,age,sex,bmi,bp,s3,target
0,59.0,2.0,32.1,101.00,38.0,151.0
1,48.0,1.0,21.6,87.00,70.0,75.0
2,72.0,2.0,30.5,93.00,41.0,141.0
3,24.0,1.0,25.3,84.00,40.0,206.0
4,50.0,1.0,23.0,101.00,52.0,135.0
...,...,...,...,...,...,...
437,60.0,2.0,28.2,112.00,42.0,178.0
438,47.0,2.0,24.9,75.00,42.0,104.0
439,60.0,2.0,24.9,99.67,43.0,132.0
440,36.0,1.0,30.0,95.00,42.0,220.0


In [8]:
# Pairwise correlation matrix of all columns (features and target),
# rendered as a heatmap.
cm = df_full.corr()
sns.heatmap(cm, cmap='Spectral')

In [11]:
# Pearson correlation (and its p-value) between BMI and the target.
stats.pearsonr(df_full.loc[:, 'bmi'], df_full.loc[:, 'target'])

PearsonRResult(statistic=0.5864501344746887, pvalue=3.4660064451654114e-42)

In [13]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)
# Report the shapes of both partitions, one partition per line.
print(f"{X_train.shape} {y_train.shape}\n{X_test.shape} {y_test.shape}")

(353, 5) (353,)
(89, 5) (89,)


The XGBoost part below is based on a Kaggle notebook on XGBoost parameter tuning (author: prasun.mishra): https://www.kaggle.com/code/prasunmishra/parameter-tuning-for-xgboost-sklearn/notebook


In [32]:
from xgboost.sklearn import XGBRegressor
import xgboost as xgb

In [40]:
def modelfit(model, dtrain, predictors, target, useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
             verbose=True):
    """
    Optionally tune the number of boosting rounds by cross-validation, then
    fit ``model`` on the full training frame and print its training MSE.

    When ``useTrainCV`` is true, ``xgb.cv`` is run with the model's booster
    parameters (MAE metric, early stopping) and ``n_estimators`` on the model
    is replaced by the number of rows in the returned CV result.

    Parameters
    ----------
    model : estimator with ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit`` and ``predict`` (e.g. ``XGBRegressor``). Modified in place.
    dtrain : DataFrame with the training data; must contain both the
        predictor columns and the target column.
    predictors : list of column names in ``dtrain`` to use as features.
    target : name of the target column in ``dtrain``.
    useTrainCV : run the cross-validated search for ``n_estimators`` first.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``.
    verbose : when true (default) print the progress markers, the predictor
        list and the CV log every 10 rounds; the model report is always printed.

    Returns
    -------
    None. Results are reported through ``print`` and the fitted ``model``.
    """

    def trace(point):
        # Progress marker kept for parity with the original output.
        if verbose:
            print('*** Here in modelfit ******* Point %s' % point)

    trace('1')
    if useTrainCV:
        trace('2')
        xgb_param = model.get_xgb_params()
        trace('3')
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        trace('4')
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=model.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='mae',
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=10 if verbose else False,
        )
        # Use the number of rounds reported by the CV run for the final fit.
        model.set_params(n_estimators=cvresult.shape[0])
        trace('5')

    # Fit the algorithm on the data
    trace('5.5')
    if verbose:
        print("Here predictors are:", predictors)
    # The metric is configured on the estimator instead of being passed to
    # fit(): the fit-time `eval_metric` keyword was deprecated in XGBoost 1.6
    # and is rejected by newer releases.
    model.set_params(eval_metric='mae')
    model.fit(dtrain[predictors], dtrain[target])

    trace('6')
    # Predict training set:
    dtrain_predictions = model.predict(dtrain[predictors])
    trace('7')
    trace('8')

    # Print model report:
    print("\nModel Report")
    print("\nMSE Score (Train): %f" % mean_squared_error(dtrain[target], dtrain_predictions))

In [52]:
# Baseline regressor. n_estimators is only an upper bound here: modelfit
# replaces it with the round count found by cross-validation before fitting.
# Every column of X_train is used as a predictor.
xgb1 = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)
modelfit(
    xgb1,
    pd.concat([X_train, y_train], axis=1),
    list(X_train.columns),
    'target',
)
# MSE of the fitted model on the held-out rows.
print('Test score:', mean_squared_error(y_test, xgb1.predict(X_test)))

*** Here in modelfit ******* Point 1
*** Here in modelfit ******* Point 2
*** Here in modelfit ******* Point 3
*** Here in modelfit ******* Point 4
[0]	train-mae:138.55365+2.02979	test-mae:138.66132+9.12244
[10]	train-mae:56.20094+1.24672	test-mae:64.51807+6.74434
[20]	train-mae:34.76605+1.27816	test-mae:52.14200+5.09968
[30]	train-mae:28.00447+1.06544	test-mae:50.31711+4.73571
[40]	train-mae:24.45733+1.07606	test-mae:50.37269+4.57789
[50]	train-mae:21.50383+1.01165	test-mae:50.55913+4.48194
[60]	train-mae:18.82254+0.86506	test-mae:51.11329+4.65215
[70]	train-mae:16.69459+0.76870	test-mae:51.29466+4.71410
[78]	train-mae:15.02788+0.77486	test-mae:51.12651+4.66794
*** Here in modelfit ******* Point 5
*** Here in modelfit ******* Point 5.5
Here predictors are: ['age', 'sex', 'bmi', 'bp', 's3']
*** Here in modelfit ******* Point 6
*** Here in modelfit ******* Point 7
*** Here in modelfit ******* Point 8

Model Report

MSE Score (Train): 1482.590909
Test score: 3470.1008685572697




In [53]:
# First tuning step: grid over tree depth and minimum child weight.
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 6, 2),
)

print('****** This is point 3')
gsearch1 = GridSearchCV(
    estimator=XGBRegressor(
        objective='reg:squarederror',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=4,
)

print('****** This is point 4')
gsearch1.fit(X_train, y_train)
print('****** This is point 5')
# best_score_ is a negated MSE because of the 'neg_mean_squared_error' scoring.
print(gsearch1.best_params_, gsearch1.best_score_)
print('****** This is point 6')
# Plain MSE of the selected estimator on the training and held-out rows.
print(
    'Best estimator train score:',
    mean_squared_error(y_train, gsearch1.best_estimator_.predict(X_train)),
)
print(
    'Best estimator test score:',
    mean_squared_error(y_test, gsearch1.best_estimator_.predict(X_test)),
)

****** This is point 3
****** This is point 4
****** This is point 5
{'max_depth': 3, 'min_child_weight': 3} -4096.205627735911
****** This is point 6
Best estimator train score: 1140.3866660678384
Best estimator test score: 3629.454735740943


In [54]:
# Persist both partitions (features plus target) as CSV files.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok keeps the cell re-runnable.
Path('datasets').mkdir(parents=True, exist_ok=True)
pd.concat([X_train, y_train], axis=1).to_csv('datasets/train.csv', index=False)
pd.concat([X_test, y_test], axis=1).to_csv('datasets/test.csv', index=False)

In [55]:
import pickle

# open(..., 'wb') fails if the target folder is missing, so create it first;
# exist_ok keeps the cell re-runnable.
Path('models').mkdir(parents=True, exist_ok=True)

# Serialize xgb1, the regressor fitted through modelfit.
# NOTE(review): only load this pickle from a trusted location, since
# unpickling executes arbitrary code.
with open('models/xgb_v1.pkl', 'wb') as f:
    pickle.dump(xgb1, f)
# Report success only once the file has been written and closed.
print('Model save completed')

Model save completed
