In [None]:
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Show every parameter (not only the changed ones) when an estimator is displayed.
set_config(print_changed_only=False)

In [None]:
# Load the insurance dataset and preview its first rows.
df = pd.read_csv("../input/insurance/insurance.csv")
df.head()

In [None]:
# The categorical columns (sex, smoker and region) have to be encoded as numbers before modelling; the following cells take care of that.

In [None]:
# Work on a copy so that the raw frame `df` is left untouched.
ada = df.copy()

In [None]:
# Build dummy (indicator) columns for sex and smoker, then preview them.
dmy = pd.get_dummies(ada[["sex", "smoker"]])
dmy.head()

In [None]:
# Replace the region labels with integer codes.
lbe = LabelEncoder()
ada["region"] = lbe.fit_transform(ada["region"])

In [None]:
# Preview the working frame after the region column has been encoded.
ada.head()

In [None]:
# Inspect the encoded region column.
ada["region"]

In [None]:
# Target variable: the charges column.
y = ada["charges"]

In [None]:
# Feature frame: drop the target and the text-valued sex/smoker columns
# (their dummy versions are appended in a later cell).
X = ada.drop(columns=["sex", "smoker", "charges"])

In [None]:
# One-hot encode region from the untouched raw frame. The integer codes stored
# in ada["region"] impose an arbitrary order and distance on a nominal variable,
# which distorts linear and distance-based models. drop_first mirrors how
# sex/smoker keep a single dummy each.
region_dummies = pd.get_dummies(df["region"], prefix="region", drop_first=True)

# Rebuild X from `ada` rather than from X itself so that re-running this cell
# does not append the dummy columns a second time.
X = pd.concat(
    [
        ada.drop(columns=["sex", "smoker", "charges", "region"]),
        dmy[["sex_male", "smoker_yes"]],
        region_dummies,
    ],
    axis=1,
)

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Linear regression
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
model = lm.fit(Xtrain, ytrain)
ytahmin = model.predict(Xtest)

# Root mean squared error on the test split
np.sqrt(mean_squared_error(ytest, ytahmin))

In [None]:
#KNN Modeli

In [None]:
# KNN is distance based, so features measured on larger scales would otherwise
# dominate the neighbour search. Standardise inside a pipeline so the scaler is
# fitted on the training split only and re-applied automatically at predict time.
knn_model = make_pipeline(StandardScaler(), KNeighborsRegressor())
knn_model.fit(Xtrain, ytrain)
ytahmin = knn_model.predict(Xtest)

# Root mean squared error on the test split
np.sqrt(mean_squared_error(ytest, ytahmin))

In [None]:
#pd.DataFrame({"Gerçek Test":ytest,"Tahmin Edilen":knn_model.predict(Xtest)})

In [None]:
# Model tuning for KNN: display the fitted model to inspect its current hyperparameters.
knn_model

In [None]:
# Search n_neighbors = 1..24 with 10-fold cross-validation. The scaler lives
# inside the pipeline so that it is re-fitted on each training fold and never
# sees the validation fold.
knn_parametre = {"kneighborsregressor__n_neighbors": np.arange(1, 25, 1)}
knn = make_pipeline(StandardScaler(), KNeighborsRegressor())
knn_cv = GridSearchCV(knn, knn_parametre, cv=10)

In [None]:
# Run the grid search on the training split.
knn_cv.fit(Xtrain,ytrain)

In [None]:
# Parameter setting selected by the grid search.
knn_cv.best_params_

In [None]:
# Take the winner of the grid search instead of a hand-copied n_neighbors value,
# so this cell cannot drift out of sync with knn_cv. GridSearchCV refits the best
# estimator on the full training split by default (refit=True), so no extra fit
# is required here.
knn_tuned = knn_cv.best_estimator_
tunedtahmin = knn_tuned.predict(Xtest)

# Root mean squared error of the tuned model on the test split
np.sqrt(mean_squared_error(ytest, tunedtahmin))

In [None]:
#KNN modeli veri setimiz için istediğimiz kadar etkili olamadı

In [None]:
# CART model
from sklearn.tree import DecisionTreeRegressor

# Fix the seed: the tree permutes the features at every split, so without a
# random_state the fitted tree (and its score) can change between runs.
cart_model = DecisionTreeRegressor(random_state=42)
cart_model.fit(Xtrain, ytrain)
ytahmin = cart_model.predict(Xtest)

# Root mean squared error on the test split
np.sqrt(mean_squared_error(ytest, ytahmin))

In [None]:
# Display the CART model to inspect its hyperparameter values.
cart_model

In [None]:
# Model tuning: 10-fold grid search over min_samples_split and max_leaf_nodes.
cart_parametre = {
    "min_samples_split": range(2, 200),
    "max_leaf_nodes": range(2, 10),
}
# Seed the base estimator so that the search result is reproducible.
cart = DecisionTreeRegressor(random_state=42)
cart_cv = GridSearchCV(cart, cart_parametre, cv=10, n_jobs=-1)
cart_cv.fit(Xtrain, ytrain)

In [None]:
# Parameter setting selected by the CART grid search.
cart_cv.best_params_

In [None]:
# Build the tuned tree from the parameters the search actually selected rather
# than from hand-copied numbers, and seed it so the score is reproducible.
cart_tuned = DecisionTreeRegressor(random_state=42, **cart_cv.best_params_)
cart_tuned.fit(Xtrain, ytrain)
tuned_tahmin = cart_tuned.predict(Xtest)

# Root mean squared error of the tuned model on the test split
np.sqrt(mean_squared_error(ytest, tuned_tahmin))

In [None]:
# Tuning the hyperparameters brought the test-set RMSE (root mean squared error) down to roughly 5k.

In [None]:
#Random Forest Regresyonu

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest. The seed fixes the bootstrap samples and feature
# draws so the reported score is reproducible.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(Xtrain, ytrain)
ytahmin = rf_model.predict(Xtest)

# Root mean squared error on the test split
np.sqrt(mean_squared_error(ytest, ytahmin))

In [None]:
# Grid for the random forest: tree depth 1..9 and three forest sizes.
rf_parametre = {
    "max_depth": list(range(1, 10)),
    "n_estimators": [200, 500, 700],
}
# Seed the base estimator so that candidates are compared on equal footing
# and the search result is reproducible.
random_forest = RandomForestRegressor(random_state=42)
rf_cv = GridSearchCV(random_forest, rf_parametre, cv=10, n_jobs=-1)

In [None]:
# Run the random-forest grid search, then show the selected parameters.
rf_cv.fit(Xtrain, ytrain)
rf_cv.best_params_

In [None]:
# Build the tuned forest from the parameters the search actually selected rather
# than from hand-copied numbers, and seed it so the score is reproducible.
rf_tuned = RandomForestRegressor(random_state=42, **rf_cv.best_params_)
rf_tuned.fit(Xtrain, ytrain)
tuned_tahmin = rf_tuned.predict(Xtest)

# Root mean squared error of the tuned model on the test split
np.sqrt(mean_squared_error(ytest, tuned_tahmin))

In [None]:
# GBM - Gradient Boosting Machine
from sklearn.ensemble import GradientBoostingRegressor

# Baseline model; the seed makes the fitted ensemble reproducible.
gbm_model = GradientBoostingRegressor(random_state=42)
gbm_model.fit(Xtrain, ytrain)
ytahmin = gbm_model.predict(Xtest)

# Root mean squared error on the test split
np.sqrt(mean_squared_error(ytest, ytahmin))

In [None]:
# Model tuning: 10-fold grid search over the main GBM hyperparameters.
gbm_parametre = {
    "learning_rate": [0.001, 0.01, 0.1, 0.2],
    "max_depth": [3, 5, 8, 50, 100],
    "n_estimators": [150, 200, 500, 700, 1200],
    # subsample is a fraction, so spell the full-sample case as a float.
    "subsample": [1.0, 0.5, 0.75],
}
# With subsample < 1 each boosting stage draws a random subset of rows; seed the
# base estimator so the candidates are comparable and the result is reproducible.
gbm = GradientBoostingRegressor(random_state=42)
gbm_cv = GridSearchCV(gbm, gbm_parametre, cv=10, n_jobs=-1)
gbm_cv.fit(Xtrain, ytrain)
gbm_cv.best_params_

In [None]:
# Build the tuned model from the parameters the search actually selected rather
# than from hand-copied numbers, and seed it so the score is reproducible.
gbm_tuned = GradientBoostingRegressor(random_state=42, **gbm_cv.best_params_)
gbm_tuned.fit(Xtrain, ytrain)
tuned_tahmin = gbm_tuned.predict(Xtest)

# Root mean squared error of the tuned model on the test split
np.sqrt(mean_squared_error(ytest, tuned_tahmin))

In [None]:
# The hyperparameters picked during tuning can change from run to run. In the run above, for example, GridSearchCV picked the largest subsample value I offered.
# When the optimum sits on the edge of the grid like this, gbm_parametre can be adjusted and the search repeated.

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
# Wrap the train and test splits (features plus labels) in XGBoost DMatrix objects.
# NOTE(review): dm_train/dm_test are not referenced again in the visible part of
# the notebook (the models below are fitted on Xtrain/ytrain directly) — confirm
# whether these are still needed.
dm_train=xgb.DMatrix(data=Xtrain,label=ytrain)
dm_test=xgb.DMatrix(data=Xtest,label=ytest)

In [None]:
from xgboost import XGBRegressor

In [None]:
# Baseline XGBoost regressor with its default settings.
xgb_model = XGBRegressor()
xgb_model.fit(Xtrain, ytrain)

In [None]:
# Predict on the test split and report the root mean squared error.
ytahmin = xgb_model.predict(Xtest)
np.sqrt(mean_squared_error(ytest, ytahmin))

In [None]:
# Hyperparameter grid for the XGBoost search.
xgbparametre = {
    "colsample_bytree": [0.4, 0.5, 0.6, 0.9],
    "n_estimators": [100, 200, 500, 1000],
    "max_depth": [2, 3, 4, 5, 6],
    "learning_rate": [0.1, 0.01, 0.5],
}

# 10-fold grid search on all cores with progress output; the base regressor is
# created inline so that xgbcvmodel only ever names the search object.
xgbcvmodel = GridSearchCV(
    XGBRegressor(),
    xgbparametre,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
xgbcvmodel.fit(Xtrain, ytrain)

In [None]:
# Parameter setting selected by the XGBoost grid search.
xgbcvmodel.best_params_

In [None]:
# Build the tuned model from the parameters the search actually selected rather
# than from hand-copied numbers, so this cell stays in sync with xgbcvmodel.
tuned_xgb = XGBRegressor(**xgbcvmodel.best_params_)
tuned_xgb.fit(Xtrain, ytrain)
tuned_tahmin = tuned_xgb.predict(Xtest)

# Root mean squared error of the tuned model on the test split
np.sqrt(mean_squared_error(ytest, tuned_tahmin))

In [None]:
# According to the author, GBM gave the best results. Show its predictions
# side by side with the true test values.
pd.DataFrame(
    {
        "Y-Test Değerleri": ytest,
        "Model Tahmnileri": gbm_tuned.predict(Xtest),
    }
)