In [1]:
import pandas as pd

In [2]:
# Load the pre-processed healthcare data from the working directory.
dataset = pd.read_csv("healthcaredata_preprocessed.csv")

In [3]:
# Drop the leftover index column written by an earlier `to_csv`.
# Reassigning (instead of inplace=True) with errors="ignore" keeps this cell
# re-runnable: a second execution no longer raises KeyError.
dataset = dataset.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# One-hot encode the categorical columns, dropping the first level of each
# so the dummy columns are not perfectly collinear.
dataset = pd.get_dummies(data=dataset, drop_first=True)

In [20]:
# Display the encoded frame to confirm the dummy columns were created.
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,False,True,False,False,True
1,18,33.770,1,1725.55230,True,False,False,True,False
2,28,33.000,3,4449.46200,True,False,False,True,False
3,33,22.705,0,21984.47061,True,False,True,False,False
4,32,28.880,0,3866.85520,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...
1332,50,30.970,3,10600.54830,True,False,True,False,False
1333,18,31.920,0,2205.98080,False,False,False,False,False
1334,18,36.850,0,1629.83350,False,False,False,True,False
1335,21,25.800,0,2007.94500,False,False,False,False,True


In [5]:
# Count missing values per column.
dataset.isna().sum()

age                 0
bmi                 0
children            0
charges             0
sex_male            0
smoker_yes          0
region_northwest    0
region_southeast    0
region_southwest    0
dtype: int64

### Input output split

In [6]:
# Feature matrix: every column except the target.
independent = dataset.drop(columns='charges')

In [7]:
# Target vector: the 'charges' column as a 1-D Series.
dependent = dataset['charges']

### Importing required libraries

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
# import xgboost as xgb
# import lightgbm as lgb

### Feature selection

In [9]:
from sklearn.feature_selection import SelectKBest, f_regression

# Score every candidate column against the target with a univariate F-test
# and keep the five highest-scoring ones.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split, so held-out rows influence which features are kept.
# Consider fitting it on the training split only.
selector = SelectKBest(f_regression, k=5)
selector.fit(independent, dependent)
independent_selected = selector.transform(independent)

# Report which columns survived the selection.
selected_columns = independent.columns[selector.get_support()]
print("Selected features:", list(selected_columns))

Selected features: ['age', 'bmi', 'children', 'smoker_yes', 'region_southeast']


In [10]:
# Exploratory introspection of the fitted selector's attributes.
# NOTE(review): looks like a debugging leftover; safe to remove once no longer needed.
dir(selector)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_tags__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_params',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_support_mask',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sklearn_auto_wrap_output_keys',
 '_transform',
 '_validate_data',
 '_validate_params',
 

In [11]:
import pandas as pd

# Pair each candidate column with its F-score and rank from highest to lowest.
feature_scores = (
    pd.DataFrame({'Feature': independent.columns, 'Score': selector.scores_})
    .sort_values('Score', ascending=False)
)

feature_scores

Unnamed: 0,Feature,Score
4,smoker_yes,2175.736863
0,age,130.402971
1,bmi,54.702715
6,region_southeast,7.266644
2,children,6.090326
3,sex_male,4.513038
7,region_southwest,2.546984
5,region_northwest,2.001859


_From the scores above, 'smoker_yes' is by far the most informative feature for predicting charges, which also matches what one would expect in practice._

In [14]:
def split_data(independent, dependent, test_size=0.3, random_state=32):
    """Split features and target into train and test partitions.

    Parameters
    ----------
    independent : feature matrix.
    dependent : target values aligned with ``independent``.
    test_size : fraction of rows held out for testing (default 0.3, the
        value previously hard-coded here).
    random_state : seed for the shuffle (default 32, the value previously
        hard-coded here).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        independent, dependent, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test
    
def linear_regression(X_train,X_test,y_train,y_test=None):
    """Fit a ``LinearRegression`` model and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(regressor, X_test, y_test)
    return regressor,r2scores,meansqr,meanabs
    
def Support_Vectormachine(X_train,X_test,y_train,y_test=None):
    """Fit a sigmoid-kernel ``SVR`` and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    # NOTE(review): no feature-scaling step is visible in this notebook, and
    # the recorded run gave this model a negative R2. Scaling the inputs
    # before SVR is worth trying.
    svm_model = SVR(kernel='sigmoid', C=1000, gamma='auto')
    svm_model.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(svm_model, X_test, y_test)
    return svm_model,r2scores,meansqr,meanabs

def random_forest(X_train,X_test,y_train,y_test=None):
    """Fit a ``RandomForestRegressor`` and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    # Seeded so the forest (and its scores) are reproducible across runs,
    # consistent with the seeded AdaBoost / LightGBM models in this notebook.
    rf_model = RandomForestRegressor(random_state=0)
    rf_model.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(rf_model, X_test, y_test)
    return rf_model,r2scores,meansqr,meanabs

def decision_trees(X_train,X_test,y_train,y_test=None):
    """Fit a ``DecisionTreeRegressor`` and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    # Seeded so tie-breaking between equally good splits is reproducible.
    dt_model = DecisionTreeRegressor(random_state=0)
    dt_model.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(dt_model, X_test, y_test)
    return dt_model,r2scores,meansqr,meanabs
    
def ridge(X_train,X_test,y_train,y_test=None):
    """Fit a default ``Ridge`` regression and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    ridge_model = Ridge()
    ridge_model.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(ridge_model, X_test, y_test)
    return ridge_model,r2scores,meansqr,meanabs
    
def lasso(X_train,X_test,y_train,y_test=None):
    """Fit a default ``Lasso`` regression and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    lasso_model = Lasso()
    lasso_model.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(lasso_model, X_test, y_test)
    return lasso_model,r2scores,meansqr,meanabs

def Xg_boost(X_train,X_test,y_train,y_test=None):
    """Fit a default ``xgboost.XGBRegressor`` and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    # Imported lazily so the rest of the notebook runs without xgboost installed.
    import xgboost as xgb
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    xg_model = xgb.XGBRegressor()
    xg_model.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(xg_model, X_test, y_test)
    return xg_model,r2scores,meansqr,meanabs

def gradient_boost(X_train,X_test,y_train,y_test=None):
    """Fit a ``GradientBoostingRegressor`` and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    # Seeded for reproducible results, consistent with the seeded
    # AdaBoost / LightGBM models in this notebook.
    gb_model = GradientBoostingRegressor(random_state=0)
    gb_model.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(gb_model, X_test, y_test)
    return gb_model,r2scores,meansqr,meanabs
    
def ada_boost(X_train,X_test,y_train,y_test=None):
    """Fit a 100-estimator ``AdaBoostRegressor`` and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    ada_model = AdaBoostRegressor(random_state=0, n_estimators=100)
    ada_model.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(ada_model, X_test, y_test)
    return ada_model,r2scores,meansqr,meanabs
    
def lightgbm_boost(X_train,X_test,y_train,y_test=None):
    """Fit a ``lightgbm.LGBMRegressor`` and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features used for prediction when scoring.
    y_test : optional targets matching ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(fitted model, R2, mean squared error, mean absolute error)``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    # Imported lazily so the rest of the notebook runs without lightgbm installed.
    import lightgbm as lgb
    if y_test is None:
        # Explicit fallback to the notebook-level split rather than an
        # undeclared dependency on a global.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly")
    lightgbm_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)
    lightgbm_model.fit(X_train, y_train)
    r2scores,meansqr,meanabs = r2_scorecalc(lightgbm_model, X_test, y_test)
    return lightgbm_model,r2scores,meansqr,meanabs

def r2_scorecalc(regressor, X_test, y_test):
    """Predict on ``X_test`` and compare the predictions against ``y_test``.

    Returns a ``(R2, mean squared error, mean absolute error)`` tuple.
    """
    predictions = regressor.predict(X_test)
    return (
        r2_score(y_test, predictions),
        mean_squared_error(y_test, predictions),
        mean_absolute_error(y_test, predictions),
    )
    
# Metric accumulators, filled in the same order the models are trained.
r2_scores = []
meansq_score = []
mean_absscore = []

# Split the selected features once so every model is trained and scored on
# the same partition.
X_train, X_test, y_train, y_test = split_data(independent_selected, dependent)

# Order matters: the score lists are paired positionally with model labels
# when the report table is built.
trainers = (
    linear_regression,
    Support_Vectormachine,
    random_forest,
    decision_trees,
    ridge,
    lasso,
    Xg_boost,
    gradient_boost,
    ada_boost,
    lightgbm_boost,
)

# One loop replaces ten copy-pasted train/append stanzas.
fitted_models = []
for trainer in trainers:
    fitted, r2_value, mse_value, mae_value = trainer(X_train, X_test, y_train)
    fitted_models.append(fitted)
    r2_scores.append(r2_value)
    meansq_score.append(mse_value)
    mean_absscore.append(mae_value)

# Keep the individual model names that later cells (e.g. the save step) use.
(
    model_lr,
    model_svm,
    model_rf,
    model_dt,
    model_rg,
    model_ls,
    model_xg,
    model_gb,
    model_ada,
    model_lg,
) = fitted_models


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 311
[LightGBM] [Info] Number of data points in the train set: 935, number of used features: 5
[LightGBM] [Info] Start training from score 13063.437459




In [15]:
# Collect the three metric lists into one table, one row per model in
# training order.
score_report = pd.DataFrame(
    data={
        'R2Score': r2_scores,
        'MeanSquaredError': meansq_score,
        'MeanAbsoluteError': mean_absscore,
    },
    index=[
        'LinearRegression', 'SVM', 'RF', 'DT', 'Ridge',
        'Lasso', 'Xg_boost', 'gradient_boost', 'ada_boost', 'LightGBM',
    ],
)
score_report

Unnamed: 0,R2Score,MeanSquaredError,MeanAbsoluteError
LinearRegression,0.74267,38547720.0,4349.725769
SVM,-0.143712,171326800.0,8666.761701
RF,0.824973,26218830.0,2752.57551
DT,0.704078,44328770.0,3134.657868
Ridge,0.742989,38499970.0,4359.697975
Lasso,0.742674,38547090.0,4350.072922
Xg_boost,0.795433,30643960.0,3142.762597
gradient_boost,0.844953,23225830.0,2585.780547
ada_boost,0.789921,31469610.0,4495.484477
LightGBM,0.848951,22627000.0,2505.188252


In [13]:
# %pip install lightgbm xgboost --quiet

### Saving the models

In [17]:
# Pickle the selected fitted models to the working directory, one file each.
import pickle

saved_models = {
    'linreg_model.sav': model_lr,
    'rf_model.sav': model_rf,
    'gb_model.sav': model_gb,
    'lightgbm_model.sav': model_lg,
}
for target_path, fitted_model in saved_models.items():
    with open(target_path, 'wb') as f:
        pickle.dump(fitted_model, f)

In [21]:
# Peek at the held-out feature matrix returned by split_data.
X_test

array([[18.   , 23.21 ,  0.   ,  0.   ,  1.   ],
       [35.   , 17.86 ,  1.   ,  0.   ,  0.   ],
       [30.   , 22.99 ,  2.   ,  1.   ,  0.   ],
       ...,
       [56.   , 39.6  ,  0.   ,  0.   ,  0.   ],
       [26.   , 29.355,  2.   ,  0.   ,  0.   ],
       [23.   , 28.49 ,  1.   ,  1.   ,  1.   ]])