In [174]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
 
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.metrics import r2_score

In [175]:
df = pd.read_csv("data/insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [176]:
df["loc_y"] =  df["region"].apply(lambda x: x[:5])
df["loc_x"] =  df["region"].apply(lambda x: x[5:])
df.drop(columns="region",inplace=True)

In [177]:
def chop_outliers(data):
    q1,q3 = np.percentile(data,[25,75])
    iqr = q3 - q1
    lower_fence = q1 -  (1.5 * iqr)
    higher_fence = q3 + (1.5 * iqr)
    return np.clip(data,lower_fence,higher_fence)


In [178]:
#for col in df.select_dtypes(exclude="object"):
#    df[col] = chop_outliers(df[col])


In [179]:
x = df.drop(columns="expenses")
y = df["expenses"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2,random_state=42)
cats = x_train.select_dtypes("object").columns
nums = x_train.select_dtypes(exclude="object").columns
x_train

Unnamed: 0,age,sex,bmi,children,smoker,loc_y,loc_x
560,46,female,20.0,2,no,north,west
1285,47,female,24.3,0,no,north,east
1142,52,female,24.9,0,no,south,east
969,39,female,34.3,5,no,south,east
486,54,female,21.5,3,no,north,west
...,...,...,...,...,...,...,...
1095,18,female,31.4,4,no,north,east
1130,39,female,23.9,5,no,south,east
1294,58,male,25.2,0,no,north,east
860,37,female,47.6,2,yes,south,west


In [180]:
cat_pipe = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("encoder",OneHotEncoder(handle_unknown="ignore",sparse_output=False)),
        ("scaler",StandardScaler())
    ]
)

nums_pipe =  Pipeline(
    steps=(
        [
            ("imputer",SimpleImputer(strategy="mean")),
            ("scaler",StandardScaler())
        ]
    )
)
preprocessor = ColumnTransformer(
   [ ("num",nums_pipe,nums),
    ("cat",cat_pipe,cats)]
)

In [181]:
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)



In [182]:
models = {

                "linear-regression":LinearRegression(),
                "ridge-regression":Ridge(),
                "lasso-regression":Lasso(),
                "Knn-regression":KNeighborsRegressor(),
                "Decision-tree-regressor":DecisionTreeRegressor(random_state=42),
                "Random-forest-regressor":RandomForestRegressor(random_state=42),
                "AdaBoostRegressor":AdaBoostRegressor(random_state=42),
                "GradientBoostingRegressor":GradientBoostingRegressor(random_state=42)

            }
def check_models(models):
    for name,c_model in models.items():
        model = c_model
        model.fit(x_train,y_train)
        print(F"model : {name}")
        print(f"Traing score :{model.score(x_train,y_train)}")
        print(f"Testing score :{model.score(x_test,y_test)}")
        print("....................")
check_models(models)

model : linear-regression
Traing score :0.741730377656061
Testing score :0.7833716676182348
....................
model : ridge-regression
Traing score :0.7417301298751189
Testing score :0.7833464886351187
....................
model : lasso-regression
Traing score :0.741730326632524
Testing score :0.7833544529942493
....................
model : Knn-regression
Traing score :0.8605167058229277
Testing score :0.7994894270006581
....................
model : Decision-tree-regressor
Traing score :0.9983078115527046
Testing score :0.7371613778220228
....................
model : Random-forest-regressor
Traing score :0.9747689709826177
Testing score :0.8595135140369119
....................
model : AdaBoostRegressor
Traing score :0.8143552509215202
Testing score :0.8164154963307407
....................
model : GradientBoostingRegressor
Traing score :0.8973451532390757
Testing score :0.8790886533461493
....................


In [183]:
params={    
                          "linear-regression":{},
                          "ridge-regression":{
                            'alpha':[.1,.01,0.5,.001],
                            'solver':['auto','svd','cholesky','lsqr','sag','saga'],
                            'max_iter':[100,200,300,400,500]
                          },
                          "lasso-regression":{
                            'alpha':[.1,.01,0.5,.001],
                            
                            'max_iter':[100,200,300,400,500]
                          },
                          "Knn-regression":{
                            'n_neighbors':[3,5,7,9,11],
                            'weights':['uniform','distance'],
                            'algorithm':['auto','ball_tree','kd_tree','brute'],
                            'leaf_size':[10,20,30,40,50]
                          },

                        "Decision-tree-regressor": {
                            'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                            'splitter':['best','random'],
                             'max_features':['sqrt','log2'],
                        },
                        "Random-forest-regressor":{
                             'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],

                            'max_features':['sqrt','log2',None],
                            'n_estimators': [8,16,32,64,128,256]
                        },

                        "AdaBoostRegressor":{
                            'learning_rate':[.1,.01,0.5,.001],
                             'loss':['linear','square','exponential'],
                            'n_estimators': [8,16,32,64,128,256]
                        },

                        "GradientBoostingRegressor":{
                             'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
                            'learning_rate':[.1,.01,.05,.001],
                            'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
                             'criterion':['squared_error', 'friedman_mse'],
                             'max_features':['auto','sqrt','log2'],
                            'n_estimators': [8,16,32,64,128,256]
                        }
                      
                    }
def evaluate_models(x_train,y_train,x_test,y_test,models,params):

    
    train_report ,test_report= {},{}
    for name,algo in models.items():
        model = algo
        param = params[name]
        grid_search = GridSearchCV(model,param,cv=3,n_jobs=-1,verbose=2)
        grid_search.fit(x_train,y_train)
        model.set_params(**grid_search.best_params_)
        
        model.fit(x_train,y_train)
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)
        train_model_score = r2_score(y_train,y_train_pred)
        test_model_score = r2_score(y_test,y_test_pred)
        
        train_report[name] = train_model_score
        test_report[name] = test_model_score
    return [train_report,test_report]

train_report,test_report = evaluate_models(x_train,y_train,x_test,y_test,models,params)
    
     

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
Fitting 3 folds for each of 120 candidates, totalling 360 fits
[CV] END ...............alpha=0.1, max_iter=100, solver=auto; total time=   0.0s
[CV] END ...............alpha=0.1, max_iter=100, solver=auto; total time=   0.0s
[CV] END ................alpha=0.1, max_iter=100, solver=svd; total time=   0.0s
[CV] END ...............alpha=0.1, max_iter=100, solver=auto; total time=   0.0s
[CV] END ................alpha=0.1, max_iter=100, solver=svd; total time=   0.0s
[CV] END ................alpha=0.1, max_iter=100, solver=svd; total time=   0.0s
[CV] END ...........alpha=0.1, max_iter=100, solver=cholesky; total time=   0.0s
[CV] END ...........alpha=0.1, max_iter=100, solver=cholesky; total 

3456 fits failed out of a total of 10368.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1594 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ravi/insurence_prediction/in/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ravi/insurence_prediction/in/lib/python3.11/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/home/ravi/insurence_prediction/in/lib/python3.11/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/home/ravi/insurence_prediction/in/lib/python3.11/site-packages/sklearn/utils/_

In [184]:
result = pd.DataFrame([train_report,test_report]).T.reset_index()
result.columns = ["Regressor","train_score","test_score"]
result

Unnamed: 0,Regressor,train_score,test_score
0,linear-regression,0.74173,0.783372
1,ridge-regression,0.74173,0.78336
2,lasso-regression,0.74173,0.783363
3,Knn-regression,0.998308,0.813938
4,Decision-tree-regressor,0.998308,0.655265
5,Random-forest-regressor,0.976495,0.860032
6,AdaBoostRegressor,0.858369,0.860196
7,GradientBoostingRegressor,0.867536,0.879397
