# XGBoost Regressor

## Part 1 - Data Preprocessing

### Importing the dataset

In [183]:
import pandas as pd
import numpy as np

In [184]:
# Load the insurance data; the path is relative, so 'insurance.csv' must sit in
# the notebook's working directory.
df = pd.read_csv('insurance.csv')

In [185]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Checking missing data

In [186]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [187]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

### Checking for duplicate rows

In [188]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): the count is only displayed here; the duplicate row is not
# removed before modelling — confirm that this is intended.
df.duplicated().sum()

np.int64(1)

### Handling categorical variables

Sex column

In [189]:
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [190]:
# Encode sex as a binary indicator (male -> 1, anything else -> 0) using a
# vectorized comparison.
# Gotcha: this overwrites the column, so re-running the cell after encoding
# would turn every value into 0.
df['sex'] = (df['sex'] == 'male').astype(int)

In [191]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


Smoker column

In [192]:
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [193]:
# Encode smoker as a binary indicator (yes -> 1, anything else -> 0) using a
# vectorized comparison, matching how the sex column is encoded.
# Gotcha: this overwrites the column, so re-running the cell after encoding
# would turn every value into 0.
df['smoker'] = (df['smoker'] == 'yes').astype(int)

In [194]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


Region column

In [195]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [196]:
# One-hot encode region as integer 0/1 columns. drop_first leaves out the first
# category (northeast), which is then represented by all three columns being 0.
region_dummies = pd.get_dummies(df['region'], drop_first=True, dtype=int)

In [197]:
region_dummies

Unnamed: 0,northwest,southeast,southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
1333,1,0,0
1334,0,0,0
1335,0,1,0
1336,0,0,1


In [198]:
# Place the region indicator columns in front of the existing columns; rows are
# matched on the index the two frames share.
df = pd.concat([region_dummies, df], axis='columns')

In [199]:
df

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,region,charges
0,0,0,1,19,0,27.900,0,1,southwest,16884.92400
1,0,1,0,18,1,33.770,1,0,southeast,1725.55230
2,0,1,0,28,1,33.000,3,0,southeast,4449.46200
3,1,0,0,33,1,22.705,0,0,northwest,21984.47061
4,1,0,0,32,1,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...,...,...,...
1333,1,0,0,50,1,30.970,3,0,northwest,10600.54830
1334,0,0,0,18,0,31.920,0,0,northeast,2205.98080
1335,0,1,0,18,0,36.850,0,0,southeast,1629.83350
1336,0,0,1,21,0,25.800,0,0,southwest,2007.94500


In [200]:
# The raw text column is no longer needed once its indicator columns exist.
# Reassign rather than mutate in place so the step reads as "new frame from old".
df = df.drop(columns=['region'])

In [201]:
df.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,charges
0,0,0,1,19,0,27.9,0,1,16884.924
1,0,1,0,18,1,33.77,1,0,1725.5523
2,0,1,0,28,1,33.0,3,0,4449.462
3,1,0,0,33,1,22.705,0,0,21984.47061
4,1,0,0,32,1,28.88,0,0,3866.8552


### Getting the inputs and output

In [202]:
# Feature matrix: every column except the target. Selecting by name keeps this
# correct even if the column order changes.
X = df.drop(columns=['charges']).to_numpy()

In [203]:
# Target vector: the insurance charges, selected by name rather than by position.
y = df['charges'].to_numpy()

In [204]:
X

array([[ 0.  ,  0.  ,  1.  , ..., 27.9 ,  0.  ,  1.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.77,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.  ,  3.  ,  0.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 36.85,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , ..., 25.8 ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 29.07,  0.  ,  1.  ]], shape=(1338, 8))

In [205]:
y

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603], shape=(1338,))

### Creating the Training Set and the Test Set

In [206]:
from  sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Part 2 - Building and training the model

### Building the model

In [207]:
import xgboost

# Gradient-boosted regressor: 100 trees of depth 2 with a 0.15 learning rate.
model = xgboost.XGBRegressor(
    n_estimators=100,
    learning_rate=0.15,
    max_depth=2,
)

### Training the model

In [208]:
model.fit(X_train,y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


### Inference

Making predictions for the data points in the test set

In [209]:
y_pred = model.predict(X_test)

In [210]:
y_pred

array([11827.391 ,  9767.402 , 46960.78  , 14248.177 , 11200.489 ,
        4102.413 ,  2818.4287, 13546.435 ,  9248.225 ,  7251.022 ,
        6622.1587, 12078.967 ,  9164.045 ,  5648.3354, 20058.355 ,
       11658.084 , 14058.915 ,  5748.732 ,  8089.728 , 35833.758 ,
       25949.348 , 14364.222 , 12389.957 , 24671.47  ,  3158.3093,
        7943.4946,  4159.0063,  8172.167 ,  4748.273 , 11338.687 ,
        8393.65  , 48216.1   , 15141.718 , 11687.725 , 17492.408 ,
        5222.3843, 13106.277 , 38706.25  , 38665.582 ,  2562.1018,
        4345.38  ,  4290.494 , 20590.133 , 46755.883 , 37197.793 ,
        5569.596 , 11658.084 ,  7420.917 ,  4681.164 , 12381.438 ,
        5287.9126,  4811.111 , 25672.484 , 45879.957 , 11024.6455,
        6118.865 ,  4529.0728, 10508.492 , 10435.158 , 15734.237 ,
        2307.6348, 45700.805 , 17292.734 , 11132.223 , 14814.238 ,
        9784.796 , 35126.074 , 39280.266 ,  3674.0923, 10596.132 ,
       14746.579 , 12049.567 , 19241.797 , 15135.357 , 14301.8

In [211]:
y_test

array([ 9724.53    ,  8547.6913  , 45702.02235 , 12950.0712  ,
        9644.2525  ,  4500.33925 ,  2198.18985 , 11436.73815 ,
        7537.1639  ,  5425.02335 ,  6753.038   , 10493.9458  ,
        7337.748   ,  4185.0979  , 18310.742   , 10702.6424  ,
       12523.6048  ,  3490.5491  ,  6457.8434  , 33475.81715 ,
       23967.38305 , 12643.3778  , 23045.56616 , 23065.4207  ,
        1674.6323  ,  4667.60765 ,  3732.6251  ,  7682.67    ,
        3756.6216  ,  8413.46305 ,  8059.6791  , 48970.2476  ,
       12979.358   , 20630.28351 , 14571.8908  ,  4137.5227  ,
        8347.1643  , 51194.55914 , 40003.33225 ,  1880.487   ,
        5458.04645 ,  2867.1196  , 20149.3229  , 47496.49445 ,
       36149.4835  , 26018.95052 , 19749.38338 ,  6940.90985 ,
        4718.20355 , 22192.43711 ,  2899.48935 , 18838.70366 ,
       23568.272   , 46255.1125  , 24227.33724 ,  3268.84665 ,
        2322.6218  ,  8827.2099  , 14478.33015 , 13112.6048  ,
        1253.936   , 46718.16325 , 13919.8229  ,  9630.

## Part 3 - Evaluating the model

### R-Squared

In [212]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test,y_pred)

In [213]:
r2

0.8996195660637195

### Adjusted R-Squared

In [214]:
# n = number of test rows, k = number of feature columns.
n, k = X_test.shape
# Adjusted R² scales the unexplained share (1 - r2) by (n - 1) / (n - k - 1),
# penalising the score for the number of predictors.
unexplained = 1 - r2
r2_adj = 1 - unexplained * (n - 1) / (n - k - 1)

In [215]:
r2_adj

0.8971562425315408

### k-Fold Cross Validation

In [216]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated R² on the full dataset (X, y), reported as the mean
# and standard deviation across folds.
r2s = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=10)
print(f"Average R-Squared:{r2s.mean():.3f}")
print(f"Standard Deviation :{r2s.std():.3f}")

Average R-Squared:0.860
Standard Deviation :0.044


## Part 4 - Training and comparing two models

In [217]:
from sklearn.linear_model import LinearRegression

# Candidate regressors keyed by the label used in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "XGBoost Regressor": xgboost.XGBRegressor(
        n_estimators=100,
        learning_rate=0.15,
        max_depth=2,
    ),
}
# Train every candidate on the training split. The value stored per label is
# whatever fit() returns (expected to be the fitted estimator itself).
fitted = {
    label: estimator.fit(X_train, y_train)
    for label, estimator in models.items()
}

### Evaluate

In [218]:
def _adjusted_r2(r2, n_samples, n_features):
    """Return R² adjusted for the number of predictors.

    Computes ``1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)``.
    """
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)


def evalute(clf, Xtr, ytr, Xte, yte):
    """Score a fitted model on the training and test splits.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict(X)``.
    Xtr, Xte : 2-D array-like
        Training / test features; ``shape[0]`` is read as the sample count and
        ``shape[1]`` as the feature count.
    ytr, yte : array-like
        True targets for the corresponding feature sets.

    Returns
    -------
    dict
        Keys ``"R2_Train"``, ``"R2_Test"``, ``"R2_ADJ_Train"`` and
        ``"R2_ADJ_Test"`` mapped to their scores.
    """
    # Score each split once and reuse the value for the adjusted variant.
    r2_train = r2_score(ytr, clf.predict(Xtr))
    r2_test = r2_score(yte, clf.predict(Xte))
    return {
        "R2_Train": r2_train,
        "R2_Test": r2_test,
        "R2_ADJ_Train": _adjusted_r2(r2_train, Xtr.shape[0], Xtr.shape[1]),
        "R2_ADJ_Test": _adjusted_r2(r2_test, Xte.shape[0], Xte.shape[1]),
    }

# Collect one metrics dict per fitted model, keyed by the model's label.
results = {}
for name, clf in fitted.items():
    results[name] = evalute(clf, X_train, y_train, X_test, y_test)
    

In [219]:
pd.DataFrame(results).T.sort_values('R2_Test', ascending=False)

Unnamed: 0,R2_Train,R2_Test,R2_ADJ_Train,R2_ADJ_Test
XGBoost Regressor,0.872269,0.89962,0.871241,0.897156
Linear Regression,0.734059,0.795879,0.731919,0.79087


com123