<a href="https://colab.research.google.com/github/skyway-ml-projects/linear-regression-tool-kit/blob/main/xgboost_regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost Regressor

## Part 1 - Data Preprocessing

### Importing the dataset

In [None]:
import pandas as pd
# Load the insurance data from a CSV expected in the current working directory.
dataset = pd.read_csv('insurance.csv')

In [None]:
# Preview the first rows to confirm the file parsed into the expected columns.
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Checking missing data

In [None]:
# Column dtypes and non-null counts; a count below the row total would indicate missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Handling categorical variables

Sex column

In [None]:
# List the distinct labels present in the column before encoding it.
dataset['sex'].unique()

array(['female', 'male'], dtype=object)

In [None]:
# Binary-encode sex: 'female' -> 0, any other label -> 1.
# NOTE: the column is overwritten, so re-running this cell on already-encoded
# data maps every row to 1; restart the kernel and run all to reproduce.
dataset['sex'] = dataset['sex'].ne('female').astype('int64')

In [None]:
# Confirm the sex column now holds 0/1 instead of text labels.
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


Smoker column

In [None]:
# List the distinct labels present in the column before encoding it.
dataset['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [None]:
# Binary-encode smoker: 'no' -> 0, any other label -> 1.
# NOTE: the column is overwritten, so re-running this cell on already-encoded
# data maps every row to 1; restart the kernel and run all to reproduce.
dataset['smoker'] = dataset['smoker'].ne('no').astype('int64')

In [None]:
# Confirm the smoker column now holds 0/1 instead of text labels.
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


Region column

In [None]:
# List the distinct regions; more than two levels, so this column is one-hot encoded rather than binary-encoded.
dataset['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [None]:
# One-hot encode region. drop_first=True removes the first level
# ('northeast'), which becomes the all-zeros baseline and avoids a redundant,
# perfectly collinear column.
# dtype=int keeps the indicators as 0/1 integers: pandas >= 2.0 returns bool
# dummies by default, and bool columns mixed with numeric ones make
# `DataFrame.values` fall back to an object array instead of a float matrix.
region_dummies = pd.get_dummies(dataset['region'], drop_first=True, dtype=int)

In [None]:
# Inspect the indicator columns; a row of all zeros corresponds to the dropped baseline region.
region_dummies

Unnamed: 0,northwest,southeast,southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
1333,1,0,0
1334,0,0,0
1335,0,1,0
1336,0,0,1


In [None]:
# Place the region indicator columns in front of the existing columns,
# aligning rows on the shared index.
# NOTE: `dataset` is reassigned from itself, so re-running this cell would
# prepend the indicator columns a second time.
dataset = pd.concat(objs=[region_dummies, dataset], axis='columns')

In [None]:
# Check that the indicator columns were prepended; the raw region column is still present at this point.
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,region,charges
0,0,0,1,19,0,27.9,0,1,southwest,16884.924
1,0,1,0,18,1,33.77,1,0,southeast,1725.5523
2,0,1,0,28,1,33.0,3,0,southeast,4449.462
3,1,0,0,33,1,22.705,0,0,northwest,21984.47061
4,1,0,0,32,1,28.88,0,0,northwest,3866.8552


In [None]:
# The indicator columns now carry the region information, so remove the raw
# text column. Reassigning the result (rather than inplace=True) follows
# pandas guidance and keeps the transformation explicit.
dataset = dataset.drop(columns=['region'])

In [None]:
# Final check of the fully numeric frame; the target (charges) is the last column.
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,charges
0,0,0,1,19,0,27.9,0,1,16884.924
1,0,1,0,18,1,33.77,1,0,1725.5523
2,0,1,0,28,1,33.0,3,0,4449.462
3,1,0,0,33,1,22.705,0,0,21984.47061
4,1,0,0,32,1,28.88,0,0,3866.8552


### Creating the Training Set and the Test Set

Getting the inputs and output

In [None]:
# Feature matrix: every column except the last one (the target), as a NumPy array.
X = dataset.iloc[:, :-1].to_numpy()

In [None]:
# Target vector: the last column of the frame, as a NumPy array.
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Display the feature matrix (NumPy abbreviates large arrays with ellipses).
X

array([[ 0.  ,  0.  ,  1.  , ..., 27.9 ,  0.  ,  1.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.77,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.  ,  3.  ,  0.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 36.85,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , ..., 25.8 ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 29.07,  0.  ,  1.  ]])

In [None]:
# Display the target vector (NumPy abbreviates large arrays with ellipses).
y

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603])

Getting the Training Set and the Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
test_fraction = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=0,
)

## Part 2 - Building and training the model

### Building the model

In [None]:
import xgboost

# Gradient-boosted trees: 100 boosting rounds of depth-2 trees with a 0.1
# learning rate. The values are spelled out so they do not depend on
# library defaults.
model = xgboost.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=2,
)

### Training the model

In [None]:
# Fit the regressor on the training split only; the test split stays unseen until evaluation.
model.fit(X_train, y_train)



XGBRegressor(max_depth=2)

### Inference

In [None]:
# Predict charges for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# Display the predictions for the test rows.
# NOTE(review): this prints the whole array; consider a slice such as y_pred[:10] to keep the output short.
y_pred

array([12123.452 , 10226.204 , 44675.7   , 14727.231 , 10519.309 ,
        4285.9536,  2478.0562, 12992.854 ,  8282.596 ,  7207.9585,
        5826.118 , 11646.996 ,  9091.76  ,  5295.4985, 20105.365 ,
       11860.509 , 14064.408 ,  5881.954 ,  7876.936 , 35374.64  ,
       25521.076 , 14164.401 , 12653.041 , 25463.967 ,  3407.1606,
        6992.024 ,  3809.7744,  8335.29  ,  4505.3887, 10872.691 ,
        8331.48  , 46417.715 , 14565.901 , 12044.629 , 17231.049 ,
        5142.3315, 11219.184 , 38019.695 , 38714.816 ,  2860.5044,
        4384.7163,  4683.673 , 20990.736 , 45996.84  , 37044.598 ,
        5658.7124, 11860.509 ,  7194.0396,  5257.931 , 12498.075 ,
        4983.595 ,  4994.2207, 25737.668 , 44622.766 , 12174.301 ,
        5802.556 ,  4811.888 , 10470.597 ,  9997.311 , 15456.588 ,
        2619.3455, 45699.465 , 16564.605 , 11532.103 , 13197.42  ,
       10447.624 , 35056.277 , 39353.336 ,  4253.192 ,  9740.398 ,
       14642.002 , 12545.009 , 18751.637 , 14928.171 , 13945.8

In [None]:
# Display the true charges for the test rows, for comparison with the predictions.
# NOTE(review): this prints the whole array; consider a slice such as y_test[:10] to keep the output short.
y_test

array([ 9724.53    ,  8547.6913  , 45702.02235 , 12950.0712  ,
        9644.2525  ,  4500.33925 ,  2198.18985 , 11436.73815 ,
        7537.1639  ,  5425.02335 ,  6753.038   , 10493.9458  ,
        7337.748   ,  4185.0979  , 18310.742   , 10702.6424  ,
       12523.6048  ,  3490.5491  ,  6457.8434  , 33475.81715 ,
       23967.38305 , 12643.3778  , 23045.56616 , 23065.4207  ,
        1674.6323  ,  4667.60765 ,  3732.6251  ,  7682.67    ,
        3756.6216  ,  8413.46305 ,  8059.6791  , 48970.2476  ,
       12979.358   , 20630.28351 , 14571.8908  ,  4137.5227  ,
        8347.1643  , 51194.55914 , 40003.33225 ,  1880.487   ,
        5458.04645 ,  2867.1196  , 20149.3229  , 47496.49445 ,
       36149.4835  , 26018.95052 , 19749.38338 ,  6940.90985 ,
        4718.20355 , 22192.43711 ,  2899.48935 , 18838.70366 ,
       23568.272   , 46255.1125  , 24227.33724 ,  3268.84665 ,
        2322.6218  ,  8827.2099  , 14478.33015 , 13112.6048  ,
        1253.936   , 46718.16325 , 13919.8229  ,  9630.

## Part 3 - Evaluating the model

### R-Squared

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination on the held-out test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Test-set R-squared.
r2

0.9000952166319387

### Adjusted R-Squared

In [None]:
# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - k - 1),
# where n is the number of test rows and k the number of features.
n, k = X_test.shape
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

In [None]:
# Test-set adjusted R-squared.
adj_r2

0.8970093545974039

### k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated R-squared over the full dataset (X, y), giving a
# less split-dependent estimate than the single train/test evaluation.
r2s = cross_val_score(model, X, y, scoring='r2', cv=10)

mean_r2 = r2s.mean()
std_r2 = r2s.std()
print("Average R-Squared: {:.3f}".format(mean_r2))
print("Standard Deviation: {:.3f}".format(std_r2))

Average R-Squared: 0.859
Standard Deviation: 0.044
