# CatBoost Regressor

## Part 1 - Data Preprocessing

### Importing the dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Checking missing data

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

### Check Duplicated

In [6]:
df.duplicated().sum()

np.int64(1)

### Handling categorical variables

Sex column

In [7]:
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [8]:
df['sex'] = df['sex'].apply(lambda x : 1 if x== 'male' else 0).astype(int)

In [9]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


smoker

In [10]:
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [11]:
df['smoker'] = df['smoker'].apply(lambda x : 1 if x == 'yes' else  0)

In [12]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


Region column

In [13]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [14]:
region_dummies = pd.get_dummies(df['region'],drop_first=True).astype(int)

In [15]:
region_dummies

Unnamed: 0,northwest,southeast,southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
1333,1,0,0
1334,0,0,0
1335,0,1,0
1336,0,0,1


In [16]:
df = pd.concat([region_dummies,df],axis=1)

In [17]:
df

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,region,charges
0,0,0,1,19,0,27.900,0,1,southwest,16884.92400
1,0,1,0,18,1,33.770,1,0,southeast,1725.55230
2,0,1,0,28,1,33.000,3,0,southeast,4449.46200
3,1,0,0,33,1,22.705,0,0,northwest,21984.47061
4,1,0,0,32,1,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...,...,...,...
1333,1,0,0,50,1,30.970,3,0,northwest,10600.54830
1334,0,0,0,18,0,31.920,0,0,northeast,2205.98080
1335,0,1,0,18,0,36.850,0,0,southeast,1629.83350
1336,0,0,1,21,0,25.800,0,0,southwest,2007.94500


In [18]:
df.drop(['region'],axis=1,inplace=True)

In [19]:
df.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,charges
0,0,0,1,19,0,27.9,0,1,16884.924
1,0,1,0,18,1,33.77,1,0,1725.5523
2,0,1,0,28,1,33.0,3,0,4449.462
3,1,0,0,33,1,22.705,0,0,21984.47061
4,1,0,0,32,1,28.88,0,0,3866.8552


### Getting the inputs and output

In [20]:
X = df.iloc[:,:-1].values

In [21]:
y = df.iloc[:,-1].values

In [22]:
X

array([[ 0.  ,  0.  ,  1.  , ..., 27.9 ,  0.  ,  1.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.77,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.  ,  3.  ,  0.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 36.85,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , ..., 25.8 ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 29.07,  0.  ,  1.  ]], shape=(1338, 8))

In [23]:
y

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603], shape=(1338,))

### Creating the Training Set and the Test Set

In [24]:
from  sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

## Part 2 - Building and training the model

### Building the model

In [25]:
import  catboost as cb

model = cb.CatBoostRegressor()

### Training the model

In [26]:
model.fit(X_train,y_train)

Learning rate set to 0.040962
0:	learn: 11600.7116846	total: 154ms	remaining: 2m 34s
1:	learn: 11284.3727924	total: 156ms	remaining: 1m 17s
2:	learn: 10977.6498005	total: 158ms	remaining: 52.3s
3:	learn: 10664.3933334	total: 159ms	remaining: 39.6s
4:	learn: 10374.7449544	total: 161ms	remaining: 32s
5:	learn: 10084.3801900	total: 162ms	remaining: 26.8s
6:	learn: 9819.7651876	total: 163ms	remaining: 23.2s
7:	learn: 9587.4399325	total: 164ms	remaining: 20.3s
8:	learn: 9341.7134856	total: 165ms	remaining: 18.2s
9:	learn: 9110.8425161	total: 166ms	remaining: 16.5s
10:	learn: 8898.0095013	total: 167ms	remaining: 15.1s
11:	learn: 8669.7784881	total: 169ms	remaining: 13.9s
12:	learn: 8472.6834748	total: 170ms	remaining: 12.9s
13:	learn: 8280.3444223	total: 171ms	remaining: 12s
14:	learn: 8096.4278591	total: 172ms	remaining: 11.3s
15:	learn: 7906.5227780	total: 174ms	remaining: 10.7s
16:	learn: 7729.4806422	total: 175ms	remaining: 10.1s
17:	learn: 7567.0402832	total: 176ms	remaining: 9.6s
18:	l

<catboost.core.CatBoostRegressor at 0x1c351a0cfe0>

### Inference

Making the predictions of the data points in the test set

In [27]:
y_pred = model.predict(X_test)

In [28]:
y_pred

array([10240.80889473,  9004.09429757, 45138.2117815 , 14177.34972917,
       10830.26437678,  4959.40426628,  2724.27281244, 14171.00947077,
        8404.38322448,  7125.59201434,  5854.16770717, 13328.93138592,
        8997.01845087, 10300.64790052, 24144.51536148, 13871.55150281,
       13345.63584485,  6140.44277611,  7604.10940469, 33617.04426667,
       27248.0998454 , 14221.64852146, 11390.28984986, 25589.287193  ,
        4733.77382742,  7970.96406669,  4327.13560663,  7788.67824761,
        4124.41767703, 12063.28513489,  7281.95086969, 47625.82000509,
       15995.02332439, 11227.85806633, 18541.6603734 ,  4281.68062509,
       10782.90398192, 37969.70593894, 40861.73384832,  3584.4777628 ,
        5118.81659862,  2385.08097993, 21035.55979518, 50516.49590629,
       36418.16588937,  6994.93751856, 13366.62910877,  8164.69960342,
        5469.30883037, 13547.38721247,  3080.92379096, 14760.49213928,
       29557.15212874, 47199.43156866, 11382.94741354,  8495.73358049,
      

In [29]:
y_test

array([ 9724.53    ,  8547.6913  , 45702.02235 , 12950.0712  ,
        9644.2525  ,  4500.33925 ,  2198.18985 , 11436.73815 ,
        7537.1639  ,  5425.02335 ,  6753.038   , 10493.9458  ,
        7337.748   ,  4185.0979  , 18310.742   , 10702.6424  ,
       12523.6048  ,  3490.5491  ,  6457.8434  , 33475.81715 ,
       23967.38305 , 12643.3778  , 23045.56616 , 23065.4207  ,
        1674.6323  ,  4667.60765 ,  3732.6251  ,  7682.67    ,
        3756.6216  ,  8413.46305 ,  8059.6791  , 48970.2476  ,
       12979.358   , 20630.28351 , 14571.8908  ,  4137.5227  ,
        8347.1643  , 51194.55914 , 40003.33225 ,  1880.487   ,
        5458.04645 ,  2867.1196  , 20149.3229  , 47496.49445 ,
       36149.4835  , 26018.95052 , 19749.38338 ,  6940.90985 ,
        4718.20355 , 22192.43711 ,  2899.48935 , 18838.70366 ,
       23568.272   , 46255.1125  , 24227.33724 ,  3268.84665 ,
        2322.6218  ,  8827.2099  , 14478.33015 , 13112.6048  ,
        1253.936   , 46718.16325 , 13919.8229  ,  9630.

## Part 3: Evaluating the model

### R-Squared

In [30]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test,y_pred)

In [31]:
r2

0.8920645469095287

### Adjusted R-Squared

In [32]:
k = X_test.shape[1]
n = X_test.shape[0]
r2_adj = 1 - (1-r2) * (n-1)/(n-k-1)

In [33]:
r2_adj

0.8894158241343024

### k-Fold Cross Validation

In [34]:
from sklearn.model_selection import cross_val_score
r2s = cross_val_score(estimator=model,X=X,y=y,scoring='r2',cv=10)
print("Average R-Squared:{:.3f}".format(r2s.mean()))
print("Standard Deviation :{:.3f}".format(r2s.std()))

Learning rate set to 0.042162
0:	learn: 11630.6022257	total: 1.49ms	remaining: 1.49s
1:	learn: 11307.9883539	total: 2.47ms	remaining: 1.23s
2:	learn: 10981.7076431	total: 3.69ms	remaining: 1.22s
3:	learn: 10644.7115836	total: 4.82ms	remaining: 1.2s
4:	learn: 10334.1527947	total: 6.33ms	remaining: 1.26s
5:	learn: 10024.6090559	total: 7.69ms	remaining: 1.27s
6:	learn: 9741.8098619	total: 8.94ms	remaining: 1.27s
7:	learn: 9505.5418110	total: 10.2ms	remaining: 1.26s
8:	learn: 9262.2621298	total: 11.2ms	remaining: 1.24s
9:	learn: 9018.0646173	total: 12.4ms	remaining: 1.23s
10:	learn: 8779.6127618	total: 13.6ms	remaining: 1.22s
11:	learn: 8539.7230778	total: 14.8ms	remaining: 1.22s
12:	learn: 8328.4979148	total: 16.1ms	remaining: 1.22s
13:	learn: 8121.0309866	total: 17.2ms	remaining: 1.21s
14:	learn: 7922.5804569	total: 17.9ms	remaining: 1.18s
15:	learn: 7735.1125042	total: 19ms	remaining: 1.17s
16:	learn: 7552.6000591	total: 20.4ms	remaining: 1.18s
17:	learn: 7392.4440878	total: 21.6ms	rema

### Grid Search

In [35]:
from sklearn.model_selection import GridSearchCV
    # boosting_type: str = "gbdt",
    # num_leaves: int = 31,
    # max_depth: int = -1,
    # learning_rate: float = 0.1,
    # n_estimators: int = 100,
    # subsample_for_bin: int = 200000,
    # objective: _LGBM_ScikitCustomObjectiveFunction | str | None = None,
    # class_weight: Dict | str | None = None,
    # min_split_gain: float = 0,
    # min_child_weight: float = 0.001,
    # min_child_samples: int = 20,
    # subsample: float = 1,
parameters  = [{'num_leaves':[29,30,31,32,33],'learning_rate':[0.08,0.09,0.1,0.11,0.12],'n_estimators':[80,90,100,110,120]}]
grid_search = GridSearchCV(estimator=model,param_grid=parameters,scoring='r2',cv=10)
grid_search.fit(X,y)
best_r2 = grid_search.best_score_
best_parameters = grid_search.best_params_


0:	learn: 11293.9008520	total: 2.27ms	remaining: 180ms
1:	learn: 10677.0788324	total: 4.8ms	remaining: 187ms
2:	learn: 10108.2721230	total: 7.46ms	remaining: 192ms
3:	learn: 9542.9929982	total: 10.2ms	remaining: 194ms
4:	learn: 9045.7907730	total: 12ms	remaining: 180ms
5:	learn: 8592.1830517	total: 14.8ms	remaining: 183ms
6:	learn: 8179.1677370	total: 17.9ms	remaining: 186ms
7:	learn: 7845.2149698	total: 21.1ms	remaining: 190ms
8:	learn: 7517.7432647	total: 24.4ms	remaining: 192ms
9:	learn: 7203.2439277	total: 27.4ms	remaining: 192ms
10:	learn: 6913.2934589	total: 30.3ms	remaining: 190ms
11:	learn: 6633.6231985	total: 32.9ms	remaining: 186ms
12:	learn: 6397.8126285	total: 35.6ms	remaining: 183ms
13:	learn: 6175.6834056	total: 37.4ms	remaining: 176ms
14:	learn: 5984.1100479	total: 38.6ms	remaining: 167ms
15:	learn: 5813.2964896	total: 40.4ms	remaining: 161ms
16:	learn: 5650.1671919	total: 42.1ms	remaining: 156ms
17:	learn: 5506.8593168	total: 43.9ms	remaining: 151ms
18:	learn: 5367.1392

1000 fits failed out of a total of 1250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1000 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ganesh10\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ganesh10\AppData\Roaming\Python\Python312\site-packages\catboost\core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  

0:	learn: 11370.2146692	total: 1.75ms	remaining: 138ms
1:	learn: 10732.0256103	total: 3.52ms	remaining: 137ms
2:	learn: 10146.1625418	total: 5.36ms	remaining: 138ms
3:	learn: 9567.7220964	total: 7.57ms	remaining: 144ms
4:	learn: 9076.4622822	total: 9.53ms	remaining: 143ms
5:	learn: 8592.3710798	total: 11.2ms	remaining: 138ms
6:	learn: 8168.3869702	total: 13ms	remaining: 136ms
7:	learn: 7818.3297444	total: 14.2ms	remaining: 128ms
8:	learn: 7461.2518256	total: 15.6ms	remaining: 123ms
9:	learn: 7133.8873472	total: 17ms	remaining: 119ms
10:	learn: 6842.6651252	total: 18.3ms	remaining: 115ms
11:	learn: 6565.6275421	total: 19.7ms	remaining: 112ms
12:	learn: 6326.6615018	total: 21.3ms	remaining: 110ms
13:	learn: 6104.6167766	total: 22.9ms	remaining: 108ms
14:	learn: 5921.6594334	total: 24.5ms	remaining: 106ms
15:	learn: 5750.4450116	total: 25.9ms	remaining: 104ms
16:	learn: 5599.4759093	total: 27.5ms	remaining: 102ms
17:	learn: 5456.1751625	total: 29.1ms	remaining: 100ms
18:	learn: 5345.54000

In [36]:
print("Best R-Squared: {:.2f} %".format(best_r2 * 100))
print("Best Parameters:",best_parameters)

Best R-Squared: 85.70 %
Best Parameters: {'learning_rate': 0.08, 'n_estimators': 80, 'num_leaves': 31}
