In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from warnings import filterwarnings
filterwarnings('ignore')



import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import model_selection
from sklearn.neighbors import LocalOutlierFactor, KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, StandardScaler
from sklearn.cross_decomposition import PLSRegression, PLSSVD
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Index <br>

 1) [Verinin Yüklenmesi ve Veriye Ön Bakış](#1) <br>
 2) [Değişkenler Arasındaki İlişki](#2) <br>
 3) [Boş Değerlerin Bulunması ve Doldurulması, Aykırı Değerlerin Baskılanması](#3) <br>
 4) [Analiz](#4) <br>
 &emsp; A) [Doğrusal Modeller](#4.0)<br>
 &emsp; 4.1) [Basit Doğrusal Regresyon Modeli](#4.1) <br>
 &emsp; 4.2) [Çoklu Doğrusal Regresyon Modeli](#4.2) <br>
 &emsp; 4.3) [PCR Modeli](#4.3) <br>
 &emsp; 4.4) [PLS Modeli](#4.4) <br>
 &emsp; 4.5) [Ridge Regresyon Modeli](#4.5) <br>
 &emsp; 4.6) [Lasso Modeli](#4.6) <br>
 &emsp; 4.7) [ElasticNet Modeli](#4.7) <br><br>
 &emsp; B)   [Doğrusal Olmayan Modeller](#5.0)<br>
 &emsp; 5.1) [KNN Modeli](#5.1)<br>
 &emsp; 5.2) [SVR (Destek Vektör Regresyonu) Modeli](#5.2)<br>
 &emsp; 5.3) [Doğrusal Olmayan SVR Modeli](#5.3)<br> 
 &emsp; 5.4) [Çok Katmanlı Algılayıcı Modeli](#5.4)<br> 
 &emsp; 5.5) [CART Modeli](#5.5)<br>
 &emsp; 5.6) [Bagged Trees Regresyon Modeli](#5.6)<br>
 &emsp; 5.7) [Random Forests Modeli](#5.7)<br>
 &emsp; 5.8) [Gradient Boosting Machines Modeli](#5.8)<br>
 &emsp; 5.9) [XGBoost Modeli](#5.9)<br>
 &emsp; 5.10) [Light GBM Modeli](#5.10)<br>
 &emsp; 5.11) [CatBoost Modeli](#5.11)<br> 

<a id='1'></a>
## 1. Verinin Yüklenmesi

In [None]:
# Read the Hitters CSV and take an independent copy: `maindata` is kept as
# loaded, while `maindatac` is the frame the cleaning cells below modify.
maindata = pd.read_csv("../input/hitters/hitters.csv")
maindatac = maindata.copy()

## Veriye Ön Bakış

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
maindata.info()

In [None]:
# Peek at the first rows of the raw frame.
maindata.head()

<a id ='2'></a>
## 2. Değişkenler Arasındaki İlişki

In [None]:
# Summary statistics, transposed so that each variable is one row.
maindata.describe().T

In [None]:
# Pairwise correlations between the numeric columns.
# The string columns (League, Division, NewLeague) are dropped explicitly:
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
maindata.select_dtypes(include="number").corr()

In [None]:
# Heat map of the correlations between the numeric columns.
# Non-numeric columns are excluded first, because DataFrame.corr() raises on
# string columns in pandas >= 2.0.
fig, ax = plt.subplots(figsize = (20,20))
sns.heatmap(maindata.select_dtypes(include="number").corr(), annot = True,  fmt = ".1f", ax = ax);

#### > **YORUM:**  *Veri seti içerisindeki değişkenlerin birbirleriyle olan ilişkilerine baktığımızda "Hits", "Runs", "Walks", "RBI" gibi değişkenlerin ve "CHits", "CHmRun", "CRuns", "CRBI", "CWalks" gibi değişkenlerin kendi aralarındaki korelasyonun yüksek olduğunu görüyoruz.*

<a id='3'></a>
## 3.Boş Değerlerin Bulunması ve Doldurulması

In [None]:
# Number of missing values per column in the raw frame.
maindata.isnull().sum()

In [None]:
# Mean salary per league; these group means are what the imputation below uses.
maindata.groupby("League")["Salary"].mean()

In [None]:
# Fill missing salaries with the mean salary of the player's league.
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column is chained assignment, which is deprecated and leaves the frame
# unchanged under pandas Copy-on-Write.
maindatac["Salary"] = maindatac["Salary"].fillna(maindatac.groupby("League")["Salary"].transform("mean"))

In [None]:
# Missing values per column in the working copy, to check the imputation.
maindatac.isnull().sum()

#### > **YORUM:**  *"Salary" değişkeninde yer alan 59 boş değeri, "Salary" değişkeninin "League" değişkenine göre gruplayıp ortalamasını alarak doldurduk.*

## Aykırı Değerlerin Baskılanması

In [None]:
# Box plot of Salary split by League, drawn from the raw frame `maindata`
# (not the imputed copy).
maindata.boxplot(column="Salary", by="League");

In [None]:
# Salary column of the working copy; the outlier cells below operate on it.
outlier = maindatac["Salary"]

# First and third quartile and the inter-quartile range of Salary.
Q1 = outlier.quantile(0.25)
Q3 = outlier.quantile(0.75)
IQR = Q3 - Q1

In [None]:
# Box plot of Salary as held in `outlier` at this point.
sns.boxplot(x = outlier)

In [None]:
# Outlier fences: 1.5 * IQR below the first and above the third quartile.
low_limit = Q1 - 1.5*IQR
high_limit = Q3 + 1.5*IQR

In [None]:
# Boolean mask: True where Salary lies outside the two fences.
(outlier < low_limit) | (outlier > high_limit)

In [None]:
# Keep the outlier mask so the flagged rows can be inspected after capping.
outlier_tf = (outlier < low_limit) | (outlier > high_limit)
outlier_tf

In [None]:
# Cap the outliers: values below low_limit are raised to low_limit and values
# above high_limit are lowered to high_limit. The original code set every
# flagged value (including low ones) to high_limit, and wrote through a Series
# taken from the frame, which does not update the frame under Copy-on-Write.
maindatac["Salary"] = maindatac["Salary"].clip(lower=low_limit, upper=high_limit)
outlier = maindatac["Salary"]
# Show the rows that were flagged, now holding the capped values.
outlier[outlier_tf]

In [None]:
# Box plot of Salary again, to compare with the plot before capping.
sns.boxplot(x = outlier)

#### > **YORUM:**  *Aykırı değerleri bulduk ve baskılama yöntemi uygulayarak bunları alt limite veyahut üst limite eşitledik*

<a id='4'></a>
## 4. Analiz

In [None]:
# Pairwise scatter plots with a fitted regression line for every pair of
# numeric variables in the working copy.
sns.pairplot(maindatac, kind = "reg")

In [None]:
# Build the design matrix: numeric predictors plus one indicator column for
# each of the three categorical variables, then hold out 25% as a test set.
categorical_cols = ["League", "Division", "NewLeague"]
dms = pd.get_dummies(maindatac[categorical_cols])
y = maindatac["Salary"]
X_ = maindatac.drop(["Salary"] + categorical_cols, axis=1).astype("float64")
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Report the shape of each partition.
for part_name, part in (("X_train", X_train), ("X_test", X_test),
                        ("y_train", y_train), ("y_test", y_test)):
    print (part_name, part.shape)

#### > **YORUM:**  *X ve y değerleriyle beraber Eğitim ve Test setlerini oluşturduk*<br> *X: Bağımsız Değer, -tüm değişkenler-* <br> *y: Bağımlı Değer -"Salary" değişkeni*

<a id='4.0'></a>
### **A. Doğrusal Modeller** 

<a id="4.1"></a>
### 4.1) Basit Doğrusal Regresyon

In [None]:
# Single predictor (CHits) with an explicit constant column added for
# statsmodels' OLS.
X_basit = maindatac["CHits"]
X_basit = sm.add_constant(X_basit)
X_basit

> **Not:** `sm` (statsmodels.api) modele otomatik olarak sabit terim eklemez. Bu yüzden `X = sm.add_constant(X)` ile sabit (1) sütununu dışarıdan ekliyoruz. `smf` (statsmodels.formula.api) ise sabiti otomatik eklediğinden bu işleme gerek kalmaz.

In [None]:
# Response variable for the simple regression; show its first five values.
y_basit = maindatac["Salary"]
y_basit[0:5]

#### Model Oluşturma

In [None]:
# Ordinary least squares of Salary on the constant + CHits design matrix.
lm = sm.OLS(y_basit,X_basit)
model = lm.fit()
model.summary()

In [None]:
# Model parameters (fitted coefficients)
model.params

In [None]:
# Confidence intervals of the model coefficients
model.conf_int()

In [None]:
# Headline statistics of the fitted OLS model.
# The t-value is read as a scalar with .iloc[0] (first coefficient) instead of
# formatting a one-element Series, which relies on a deprecated implicit
# Series -> float conversion.
print ( "f_pvalue: ","%.4f" % model.f_pvalue)
print ( "f_value: ", "%.2f" % model.fvalue)
print ("t_value: ", "%.2f" % model.tvalues.iloc[0])
print ( "adj. r2: ", "%.2f" % model.rsquared_adj)

In [None]:
# First five observed salaries next to the model's fitted values.
observed_head = y_basit[0:5]
fitted_head = model.fittedvalues[0:5]
g_t = pd.DataFrame({"gercek_y": observed_head, "tahmini_y": fitted_head})
g_t

In [None]:
# Print the fitted equation. Coefficients are read by position with .iloc:
# plain integer indexing on the label-indexed params Series is deprecated.
print ( "Salary = " + str("%.2f" % model.params.iloc[0]) + " +  CHits*" + str("%.2f" % model.params.iloc[1]) )

> **Basit Doğrusal Regresyonun formülü :**  **Salary  = 320.17 +  CHits*0.27**

In [None]:
# Relationship between "Salary" and "CHits", with the fitted regression line.
# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors positionally.
ax = sns.regplot(x = maindatac["CHits"], y = maindatac["Salary"], scatter_kws={"color": "r","s":9})
ax.set_title( "Model Denklemi: Salary = 320.17 +  CHits*0.27")
ax.set_ylabel("Salary")
ax.set_xlabel("CHits")

# Salaries cannot be negative, so start the y axis at zero.
plt.ylim(bottom = 0)
plt.show()

#### Tahmin <br>
**Model :** *Salary = 320.17 + CHits\*0.27* <br>
**Soru :** *Eğer bir oyuncunun CHits değeri 2000 ise maaşı ne olur?*

In [None]:
# Same simple regression with scikit-learn.
# NOTE: X_basit is re-assigned here to the bare CHits column (no constant column).
X_basit = maindatac[["CHits"]]


reg = LinearRegression()
model = reg.fit(X_basit,y_basit)
# Predicted salary for CHits = 2000.
model.predict([[2000]])


#### Hata Kareleri

In [None]:
# Error metrics computed with statsmodels.api.
# X_basit was re-assigned to the bare CHits column by the scikit-learn cell, so
# the constant column is added again here. sm.OLS does not add an intercept on
# its own and would otherwise fit a regression through the origin.
lm = sm.OLS(y_basit, sm.add_constant(X_basit))
model = lm.fit()


mse = mean_squared_error(y_basit, model.fittedvalues)
rmse = np.sqrt(mse)

print("Hata Karelerinin Ortalaması: ", mse)
print("Hata Karelerinin Ortalamasının Karekökü: ", rmse)

In [None]:
# Error metrics computed with statsmodels.formula.api

# Formula interface: Salary regressed on CHits, using the working copy.
lm = smf.ols("Salary ~ CHits", maindatac)
model = lm.fit()


# Mean squared error of the fitted values and its square root.
mse = mean_squared_error(y_basit, model.fittedvalues)
rmse = np.sqrt(mse)

print("Hata Karelerinin Ortalaması: ", mse)
print("Hata Karelerinin Ortalamasının Karekökü: ", rmse)


In [None]:
# First ten observed salaries next to the scikit-learn model's predictions.
k_t = pd.DataFrame( {"gercek_y": y_basit[0:10],
                        
                    "tahmini_y": reg.predict(X_basit)[0:10]})
k_t

In [None]:
# Residual: observed minus predicted.
k_t["hata"] = k_t["gercek_y"] - k_t["tahmini_y"]
k_t

In [None]:
# Squared residual.
k_t["hata_kare"] = k_t["hata"]**2
k_t

In [None]:
# Sum, mean and root-mean of the squared errors over the rows held in k_t
# (the first ten observations only).
# The stray trailing comma after the second print (which built a throwaway
# tuple) has been removed.
print ( "Hata Karelerinin Toplamı: ", np.sum(k_t["hata_kare"]) )
print ( "Hata Karelerinin Ortalaması: ", np.mean(k_t["hata_kare"]))
print ( "Hata Karelerinin Ortalamasının Karekökü: ", np.sqrt(np.mean(k_t["hata_kare"])))


<a id = '4.2'></a>
### 4.2) Çoklu Doğrusal Regresyon

#### Modelin Oluşturulması

In [None]:
# Multiple linear regression on all training predictors; print the intercept
# and the coefficient of each column.
reg = LinearRegression()
model = reg.fit(X_train, y_train)
print ("Sabit Katsayı: ", model.intercept_)
print("Değişkenlerin Katsayıları: ", model.coef_)

#### Tahmin <br>
AtBat        :10 <br>
Hits         :362 <br>
HmRun        :361 <br>
Runs         :362 <br>
RBI          :360 <br>
Walks        :301 <br>
Years        :2001 <br>
CAtBat       :21 <br>
CHits        :355 <br>
CHmRun       :354 <br>
CRuns        :354 <br>
CRBI         :349 <br>
CWalks       :299 <br>
League       :0 <br>
Division     :1 <br>
PutOuts      :12 <br>
Assists      :156 <br>
Errors       :11 <br>
NewLeague    :0 <br>



In [None]:
# New observation to score, using the values listed in the markdown above.
# The values are keyed by column name and then re-ordered to match the design
# matrix X, where the three dummy columns come last. The original positional
# list followed the raw column order and used different League / NewLeague
# values from the ones listed, so the numbers were fed to the wrong coefficients.
yeni_degerler = {"AtBat": 10, "Hits": 362, "HmRun": 361, "Runs": 362, "RBI": 360,
                 "Walks": 301, "Years": 2001, "CAtBat": 21, "CHits": 355,
                 "CHmRun": 354, "CRuns": 354, "CRBI": 349, "CWalks": 299,
                 "PutOuts": 12, "Assists": 156, "Errors": 11,
                 "League_N": 0, "Division_W": 1, "NewLeague_N": 0}
yeni_katsayilar = pd.DataFrame([yeni_degerler])[X.columns]
model.predict(yeni_katsayilar)

#### Model Doğrulama

In [None]:
# RMSE of the multiple regression on the training and the test partition.
y_pred, y_pred_t = model.predict(X_train), model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)


In [None]:
# Score of the fitted model on the training data, as returned by its score() method.
model.score(X_train, y_train)

In [None]:
# R^2 computed from the training predictions.
r2_score(y_train, y_pred)

#### Grafik

In [None]:
# Variable-importance chart from a random forest of depth-1 trees that each
# consider one feature per split.
# random_state is fixed so the chart is the same on every run.
tuned = RandomForestRegressor(max_depth = 1,
                             max_features = 1,
                             n_estimators = 400,
                             random_state = 42)
tuned.fit(X_train, y_train)
Importance = pd.DataFrame({"Importance": tuned.feature_importances_ * 100},
                        index = X_train.columns )
Importance.sort_values(by = "Importance", 
                       axis = 0, 
                       ascending = True).plot(kind ="barh", color = "r")

plt.xlabel("Değişken Önem Düzeyleri");

<a id='4.3'></a>
### 4.3) PCR Model

#### Modelin Oluşturulması

In [None]:
# Principal component regression: standardise the training predictors,
# project them onto their principal components, then fit a linear regression
# on the component scores.
pca = PCA()
lm = LinearRegression()
X_reduced_train = pca.fit_transform(scale(X_train))
pcr_model = lm.fit(X_reduced_train, y_train)
print ( "Sabit Katsayı: ", pcr_model.intercept_)
print ("Katsayılar", pcr_model.coef_)

#### Tahmin

In [None]:
# New observation with every predictor set to 10.
# It must go through the same pipeline as the training data (standardise with
# the training statistics, project with the fitted PCA) and be scored with
# pcr_model; the original cell passed the raw values to the multiple-regression
# `model` instead.
yeni_katsayilar = pd.DataFrame([[10] * X_train.shape[1]], columns=X_train.columns)
yeni_scaled = StandardScaler().fit(X_train).transform(yeni_katsayilar)
pcr_model.predict(pca.transform(yeni_scaled))

#### Model Doğrulama

In [None]:
# Project the test set with the transformations learned on the training set.
# Calling fit_transform on the test data (as before) re-estimated both the
# scaling and the principal components from X_test, so the test scores were not
# in the coordinate system pcr_model was trained on.
train_scaler = StandardScaler().fit(X_train)
X_reduced_test = pca.transform(train_scaler.transform(X_test))

y_pred = pcr_model.predict(X_reduced_train)
y_pred_t = pcr_model.predict(X_reduced_test)

print("Eğitim Seti Hata Katsayısı", np.sqrt(mean_squared_error(y_train, y_pred)))
print("Test Seti Hata Katsayısı", np.sqrt(mean_squared_error(y_test, y_pred_t)))

In [None]:
# Training score of the PCR model, evaluated on the component scores it was
# fitted on (the original line scored the multiple-regression `model` instead).
pcr_model.score(X_reduced_train, y_train)

In [None]:
# R^2 computed from the current training predictions in y_pred.
r2_score(y_train, y_pred)

#### Grafik

In [None]:
# Shuffled 10-fold splitter with a fixed seed, and an empty list that the
# next cell fills with one RMSE per number of components.
cv_10 = model_selection.KFold( n_splits=10,
                               shuffle=True,
                               random_state=1 )
RMSE = []

In [None]:
# Cross-validated RMSE of the regression on the first i principal components,
# for i = 1 .. number of components.
# RMSE is re-initialised here so that re-running this cell does not append to
# the results of a previous run.
RMSE = []
# Series.ravel() is deprecated in recent pandas; convert to a 1-D array explicitly.
y_train_values = np.asarray(y_train).ravel()

for i in np.arange(1, X_reduced_train.shape[1]+1):
    score = np.sqrt( -1 * model_selection.cross_val_score( lm,
                                                           X_reduced_train[:,:i],
                                                           y_train_values,
                                                           cv= cv_10,
                                                           scoring="neg_mean_squared_error" ).mean())
    RMSE.append(score)

In [None]:
# RMSE against the number of components. The x values start at 1 so that the
# axis shows the actual component count (plotting the list alone starts at 0).
plt.plot(np.arange(1, len(RMSE) + 1), RMSE, "-v")
plt.xlabel("Bileşen Sayısı")
plt.ylabel("RMSE")
plt.title("Maaş Tahmin Modeli İçin PCR Model Doğrulama");

<a id ="4.4"></a>
### 4.4) PLS Model

#### Modelin Oluşturulması

In [None]:
# Partial least squares regression with the estimator's default settings.
pls_model = PLSRegression().fit(X_train, y_train)

print("Değişken Katsayılar: ", pls_model.coef_)

#### Tahmin

#### Model Doğrulama

In [None]:
# RMSE of the PLS model on the training and the test partition.
y_pred, y_pred_t = pls_model.predict(X_train), pls_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print ("Eğitim Seti Hata Katsayısı: ", train_rmse)
print ("Test Seti Hata Katsayısı: ", test_rmse)

In [None]:
# R^2 value on the training predictions
r2_score(y_train, y_pred)

In [None]:
# Training score of the PLS model (the original line scored `model`, which is
# the multiple linear regression, not the PLS fit).
pls_model.score(X_train, y_train)

#### Grafik

In [None]:
# Cross-validated RMSE of PLS for every number of components from 1 up to the
# number of predictors.
cv_10 = model_selection.KFold( n_splits=10,
                               shuffle=True,
                               random_state=1 )
RMSE = []

for i in np.arange(1, X_train.shape[1]+1):
    pls = PLSRegression(n_components=i)
    score = np.sqrt( -1 * cross_val_score( pls,
                                            X_train,
                                            y_train,
                                            cv= cv_10,
                                            scoring="neg_mean_squared_error" ).mean())
    RMSE.append(score)

# The x values start at 1 so the axis shows the actual component count
# (plotting the list alone would start at 0).
plt.plot(np.arange(1, len(RMSE) + 1), RMSE, "-v")
plt.xlabel("Bileşen Sayısı")
plt.ylabel("RMSE")
plt.title("Maaş Tahmin Modeli İçin PLS Model Doğrulama");

<a id="4.5"></a>
### 4.5) Ridge Regresyon

#### Modelin Oluşturulması

In [None]:
# Ridge regression with a fixed penalty alpha = 0.1.
ridge_model = Ridge(alpha = 0.1).fit(X_train, y_train)
print("Sabit Katsayı :", ridge_model.intercept_)
print("Değişken Katsayıları :", ridge_model.coef_)

#### Tahmin

#### Model Doğrulama

In [None]:
# RMSE of the ridge model on the training and the test partition.
y_pred, y_pred_t = ridge_model.predict(X_train), ridge_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train,y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test,y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

In [None]:
# Training score of the ridge model, as returned by its score() method.
ridge_model.score(X_train, y_train)

In [None]:
# R^2 computed from the current training predictions in y_pred.
r2_score(y_train, y_pred)

#### Grafik

In [None]:
# Coefficient path: refit ridge for 100 penalties spaced logarithmically
# between 0.5e10 and 0.5e-2 and record the coefficients of each fit.
ridge = Ridge()
lambdalar = 10**np.linspace(10,-2,100)*0.5
katsayilar = []

for i in lambdalar:
    ridge.set_params(alpha=i)
    ridge.fit(X_train, y_train)
    katsayilar.append(ridge.coef_)

# Plot against the alpha values that were actually used for fitting; the
# previous `lambdalar*2` shifted every curve by a factor of two relative to the
# "alpha" axis label.
ax = plt.gca()
ax.plot(lambdalar, katsayilar)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights');

<a id='4.6'></a>
### 4.6) Lasso Model

#### Modelin Oluşturulması

In [None]:
# Lasso regression with the estimator's default settings.
lasso_model = Lasso().fit(X_train, y_train)

print("Sabit Katsayı: ", lasso_model.intercept_)
print("Değişken Katsayılar: ", lasso_model.coef_)

#### Tahmin

#### Model Doğrulama

In [None]:
# RMSE of the lasso model on the training and the test partition.
# The training label had a typo ("Eğtim"); it now reads "Eğitim" like the
# same report in every other section.
y_pred = lasso_model.predict(X_train)
y_pred_t = lasso_model.predict(X_test)

print("Eğitim Seti Hata Katsayısı: ", np.sqrt(mean_squared_error(y_train, y_pred)))
print("Test Seti Hata Katsayısı: ", np.sqrt(mean_squared_error(y_test, y_pred_t)))

#### Grafik

In [None]:
# Coefficient path: refit the lasso for 100 penalties spaced logarithmically
# between 0.5e10 and 0.5e-2 and record the coefficients of each fit.
lasso = Lasso()
lambdalar = 10**np.linspace(10,-2,100)*0.5
katsayilar = []

for i in lambdalar:
    lasso.set_params(alpha=i)
    lasso.fit(X_train, y_train)
    katsayilar.append(lasso.coef_)

# Plot against the alpha values that were actually used for fitting; the
# previous `lambdalar*2` shifted every curve by a factor of two relative to the
# "alpha" axis label.
ax = plt.gca()
ax.plot(lambdalar, katsayilar)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights');

<a id='4.7'></a>
### 4.7) ElasticNet 

#### Modelin Oluşturulması

In [None]:
# Elastic net regression with the estimator's default settings.
elas_model = ElasticNet().fit(X_train, y_train)

print("Sabit Katsayılar: ", elas_model.intercept_)
print("Değişken Katsayılar: ", elas_model.coef_)

#### Tahmin

#### Model Doğrulama

In [None]:
# RMSE of the elastic net on the training and the test partition.
y_pred, y_pred_t = elas_model.predict(X_train), elas_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

In [None]:
# Training score of the elastic net, as returned by its score() method.
elas_model.score(X_train, y_train)

In [None]:
# R^2 computed from the current training predictions in y_pred.
r2_score(y_train, y_pred)

#### Grafik

In [None]:
# Coefficient path: refit the elastic net for 100 penalties spaced
# logarithmically between 0.5e10 and 0.5e-2 and record the coefficients.
elastic = ElasticNet()
lambdalar = 10**np.linspace(10,-2,100)*0.5
katsayilar = []

for i in lambdalar:
    elastic.set_params(alpha=i)
    elastic.fit(X_train, y_train)
    katsayilar.append(elastic.coef_)

# Plot against the alpha values that were actually used for fitting; the
# previous `lambdalar*2` shifted every curve by a factor of two relative to the
# "alpha" axis label.
ax = plt.gca()
ax.plot(lambdalar, katsayilar)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights');

> YORUM

<a id='5.0'></a>
### **B) Doğrusal Olmayan Modeller**

<a id='5.1'></a>
### 5.1) KNN Modeli

#### Modelin Oluşturulması

In [None]:
# k-nearest-neighbours regressor with default settings; print the k it uses.
knn_model = KNeighborsRegressor().fit(X_train, y_train)

print("En Yakın Komşu Sayısı: ", knn_model.n_neighbors)

#### Tahmin

In [None]:
# RMSE of the default KNN model on the training and the test partition.
y_pred, y_pred_t = knn_model.predict(X_train), knn_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

#### Model Doğrulama

In [None]:
# 10-fold grid search over the number of neighbours, k = 1 .. 49.
knn = KNeighborsRegressor()
knn_params = {"n_neighbors": np.arange(1,50,1)}
knn_cv_model = GridSearchCV(knn, knn_params, cv = 10)
knn_cv_model.fit(X_train, y_train)

In [None]:
# Refit KNN with the k selected by the grid search (the only parameter in
# best_params_) and report its test RMSE.
knn_tuned = KNeighborsRegressor(**knn_cv_model.best_params_)
knn_tuned.fit(X_train, y_train)

knn_test_mse = mean_squared_error(y_test, knn_tuned.predict(X_test))
np.sqrt(knn_test_mse)


<a id='5.2'></a>
### 5.2) SVR (Destek Vektör Regresyonu)

#### Modelin Oluşturulması

In [None]:
# Single-predictor design matrices (Hits only) for the linear SVR.
X_train_svr = pd.DataFrame(X_train["Hits"])
X_test_svr = pd.DataFrame(X_test["Hits"])

In [None]:
# Linear support vector regression on Hits.
# `kernel` is passed by keyword: SVR's constructor parameters are keyword-only
# in current scikit-learn, so SVR("linear") raises a TypeError.
svr_model = SVR(kernel="linear").fit(X_train_svr, y_train)

print("Sabit Katsayı: ", svr_model.intercept_)
print("Değişken Katsayılar: ", svr_model.coef_)


#### Tahmin

In [None]:
# RMSE of the linear SVR on the training and the test partition (Hits only).
y_pred, y_pred_t = svr_model.predict(X_train_svr), svr_model.predict(X_test_svr)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

#### Model Doğrulama

In [None]:
# 10-fold grid search over the penalty C = 0.1, 0.2, ..., 0.9, using all cores.
svr_params = {"C": np.arange(0.1,1,0.1)}
svr_cv_model = GridSearchCV(svr_model, svr_params, cv = 10, n_jobs=-1)

svr_cv_model.fit(X_train_svr, y_train)

In [None]:
# Refit the linear SVR with the C selected by the grid search.
# C must be the scalar stored under best_params_["C"]; the original wrapped the
# whole best_params_ dict in a pandas Series, which is not a valid value for C.
# `kernel` is passed by keyword because SVR's parameters are keyword-only.
svr_tuned = SVR(kernel="linear",
                C = svr_cv_model.best_params_["C"])
svr_tuned.fit(X_train_svr, y_train)

np.sqrt(mean_squared_error(y_test, svr_tuned.predict(X_test_svr)))

<a id='5.3'></a>
### 5.3) Doğrusal Olmayan SVR Modeli

#### Modelin Oluşturulması

In [None]:
# Standardise the predictors: the scaler is fitted on the training set only
# and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# RBF-kernel SVR fitted on the standardised training predictors.
# `kernel` is passed by keyword: SVR's constructor parameters are keyword-only
# in current scikit-learn.
svr_model_d = SVR(kernel="rbf").fit(X_train_scaled, y_train)

print("Sabit Katsayı: ", svr_model_d.intercept_)


In [None]:
# RMSE of the RBF SVR. The model was trained on the standardised predictors,
# so it must also be evaluated on them; the original predicted on the raw
# X_train / X_test, which are on a different scale.
y_pred = svr_model_d.predict(X_train_scaled)
y_pred_t = svr_model_d.predict(X_test_scaled)

print("Eğitim Seti Hata Katsayısı: ", np.sqrt(mean_squared_error(y_train, y_pred)))
print("Test Seti Hata Katsayısı: ", np.sqrt(mean_squared_error(y_test, y_pred_t)))

In [None]:
# 10-fold grid search over C = 1 .. 19 for the RBF SVR, on the standardised
# training predictors. NOTE: this re-assigns svr_params from the linear section.
svr_params = {"C": np.arange(1,20,1)}
svr_cv_model_d = GridSearchCV(svr_model_d, svr_params, cv = 10, n_jobs=-1)
svr_cv_model_d.fit(X_train_scaled, y_train)

In [None]:
# Refit the RBF SVR with the C selected by the grid search.
# Fixes: C is the scalar best_params_["C"] (not a Series built from the dict),
# `kernel` is passed by keyword, and the model is fitted and evaluated on the
# standardised predictors the grid search was run on.
svr_tuned = SVR(kernel="rbf",
                C = svr_cv_model_d.best_params_["C"])
svr_tuned.fit(X_train_scaled, y_train)

np.sqrt(mean_squared_error(y_test, svr_tuned.predict(X_test_scaled)))

<a id='5.4'></a>
### 5.4) Çok Katmanlı Algılayıcı Modeli

#### Modelin Oluşturulması

In [None]:
# Multi-layer perceptron with two hidden layers (100 and 20 units), fitted on
# the standardised training predictors. No random_state is set, so results
# vary between runs.
mlp_model = MLPRegressor(hidden_layer_sizes=(100,20)).fit(X_train_scaled, y_train)


In [None]:
# RMSE of the MLP. The network was trained on the standardised predictors, so
# it is evaluated on them as well (the original predicted on the raw data).
y_pred = mlp_model.predict(X_train_scaled)
y_pred_t = mlp_model.predict(X_test_scaled)

print("Eğitim Seti Hata Katsayısı: ", np.sqrt(mean_squared_error(y_train, y_pred)))
print("Test Seti Hata Katsayısı: ", np.sqrt(mean_squared_error(y_test, y_pred_t)))

In [None]:
# 10-fold grid search over the regularisation strength and the activation.
# hidden_layer_sizes must be a list of candidates: the bare tuple (20,20) was
# read as two candidates, each the scalar 20 (one hidden layer, tried twice),
# rather than one architecture with two hidden layers of 20 units.
mlp_params = {"alpha": [10,20,30,40,50],
              "hidden_layer_sizes": [(20,20)],
              "activation": ["relu","logistic"]}
mlp_cv_model = GridSearchCV(mlp_model, mlp_params, cv = 10, n_jobs=-1)
mlp_cv_model.fit(X_train_scaled, y_train)

In [None]:
# Refit the MLP with the parameters selected by the grid search.
# The search ran on the standardised predictors, so the final model is fitted
# and evaluated on them too (the original used the raw X_train / X_test).
mlp_tuned = MLPRegressor(alpha = mlp_cv_model.best_params_["alpha"],
                        hidden_layer_sizes = mlp_cv_model.best_params_["hidden_layer_sizes"],
                        activation = mlp_cv_model.best_params_["activation"]).fit(X_train_scaled, y_train)

np.sqrt(mean_squared_error(y_test, mlp_tuned.predict(X_test_scaled)))

<a id ='5.5'></a>
### 5.5) CART Modeli

#### Modelin Oluşturulması

In [None]:
# NOTE(review): X_train and X_test are re-assigned here to the single "Hits"
# column. Every later cell that uses X_train / X_test therefore works with
# that one feature only; re-run the train/test split cell to restore all columns.
X_train = pd.DataFrame(X_train["Hits"])
X_test = pd.DataFrame(X_test["Hits"])
# Regression tree fitted on Hits alone.
cart_model = DecisionTreeRegressor(min_samples_split = 2).fit(X_train, y_train)


#### Tahmin

In [None]:
# RMSE of the regression tree on the training and the test partition.
y_pred, y_pred_t = cart_model.predict(X_train), cart_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

#### Model Doğrulama

In [None]:
# 10-fold grid search over min_samples_split (2 .. 199) and max_leaf_nodes
# (2 .. 9), using all cores.
cart_params = {"min_samples_split": range(2,200),
               "max_leaf_nodes": range(2,10)}
cart_cv_model = GridSearchCV(cart_model, cart_params, cv = 10, n_jobs=-1)
cart_cv_model.fit(X_train, y_train)

In [None]:
# Refit the tree with the two parameters selected by the grid search
# (max_leaf_nodes and min_samples_split) and report its test RMSE.
cart_tuned = DecisionTreeRegressor(**cart_cv_model.best_params_).fit(X_train, y_train)

cart_test_mse = mean_squared_error(y_test, cart_tuned.predict(X_test))
np.sqrt(cart_test_mse)

#### Grafik

In [None]:
# Prediction curve of the tree over an evenly spaced grid of predictor values.
# Assumes X_train holds a single column at this point.
# The bounds are taken with the array's own min()/max(), which return scalars;
# the built-in min()/max() over a 2-D array returned one-element arrays and
# relied on NumPy's deprecated array-to-scalar conversion inside np.arange.
x_values = np.asarray(X_train)
X_grid = np.arange(x_values.min(), x_values.max(), 2)
X_grid = X_grid.reshape((len(X_grid),1))

plt.scatter(X_train, y_train, color="red")
plt.plot(X_grid, cart_model.predict(X_grid), color ="blue")
plt.title("CART AĞAÇ MODELİ")
plt.xlabel("Diğer Faktörler")
plt.ylabel("Maaş");

<a id ='5.6'></a>
### 5.6) Bagged Trees Regresyon Modeli

#### Modelin Oluşturulması

In [None]:
# Bagging regressor with feature bootstrapping enabled, other settings default.
bagged_model = BaggingRegressor(bootstrap_features=True).fit(X_train, y_train)


In [None]:
# RMSE of the bagging model on the training and the test partition.
y_pred, y_pred_t = bagged_model.predict(X_train), bagged_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

In [None]:
# 10-fold grid search over the number of base estimators (2 .. 19), all cores.
bagged_params = {"n_estimators": range(2,20)}
bagged_cv_model = GridSearchCV(bagged_model, bagged_params, cv = 10, n_jobs=-1)
bagged_cv_model.fit(X_train, y_train)

In [None]:
# Refit the bagging model with the n_estimators selected by the grid search.
# bootstrap_features=True is kept so the final model has the same
# configuration as the estimator that was cross-validated.
bagged_tuned = BaggingRegressor(n_estimators = bagged_cv_model.best_params_["n_estimators"],
                                bootstrap_features = True).fit(X_train, y_train)

np.sqrt(mean_squared_error(y_test, bagged_tuned.predict(X_test)))

#### Grafik

In [None]:
# Prediction curve of the bagging model over an evenly spaced grid of
# predictor values. Assumes X_train holds a single column at this point.
# The bounds are taken with the array's own min()/max(), which return scalars;
# the built-in min()/max() over a 2-D array returned one-element arrays and
# relied on NumPy's deprecated array-to-scalar conversion inside np.arange.
x_values = np.asarray(X_train)
X_grid = np.arange(x_values.min(), x_values.max(), 1)
X_grid = X_grid.reshape((len(X_grid),1))

plt.scatter(X_train, y_train, color="red")
plt.plot(X_grid, bagged_model.predict(X_grid), color ="blue")
plt.title("BAGGED AĞAÇ MODELİ")
plt.xlabel("Diğer Faktörler")
plt.ylabel("Maaş");

<a id ='5.7'></a>
### 5.7) Random Forests Modeli

In [None]:
# Random forest with default settings and a fixed seed.
randomf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)


#### Tahmin

In [None]:
# RMSE of the random forest on the training and the test partition.
y_pred, y_pred_t = randomf_model.predict(X_train), randomf_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

#### Model Doğrulama

In [None]:
# 10-fold grid search over tree depth (1 .. 19), features per split (1 .. 8)
# and forest size, using all cores.
# NOTE(review): if X_train still holds only the "Hits" column here, the
# max_features candidates above 1 exceed the number of features - confirm
# that this grid is intended.
randomf_params = {"max_depth": list(range(1,20)) ,
                  "max_features": [1,2,3,4,5,6,7,8],
                  "n_estimators": [100,200,300,400,500]}
randomf_cv_model = GridSearchCV(randomf_model, randomf_params, cv = 10, n_jobs=-1)
randomf_cv_model.fit(X_train, y_train)

In [None]:
# Tree depth selected by the grid search.
randomf_cv_model.best_params_["max_depth"]

In [None]:
# Refit the forest with the parameters selected by the grid search.
# random_state=42 is kept from the searched estimator so the final model is
# reproducible and matches the configuration that was cross-validated.
randomf_tuned = RandomForestRegressor(n_estimators = randomf_cv_model.best_params_["n_estimators"],
                                      max_depth = randomf_cv_model.best_params_["max_depth"],
                                      max_features = randomf_cv_model.best_params_["max_features"],
                                      random_state = 42).fit(X_train, y_train)

np.sqrt(mean_squared_error(y_test, randomf_tuned.predict(X_test)))

#### Grafik

In [None]:
# Prediction curve of the random forest over an evenly spaced grid of
# predictor values. Assumes X_train holds a single column at this point.
# The bounds are taken with the array's own min()/max(), which return scalars;
# the built-in min()/max() over a 2-D array returned one-element arrays and
# relied on NumPy's deprecated array-to-scalar conversion inside np.arange.
x_values = np.asarray(X_train)
X_grid = np.arange(x_values.min(), x_values.max(), 1)
X_grid = X_grid.reshape((len(X_grid),1))

plt.scatter(X_train, y_train, color="red")
plt.plot(X_grid, randomf_model.predict(X_grid), color ="blue")
plt.title("RANDOM FORESTS MODELİ")
plt.xlabel("Diğer Faktörler")
plt.ylabel("Maaş");

<a id='5.8'></a>
### 5.8) Gradient Boosting Machines Modeli

In [None]:
# Gradient boosting regressor with the estimator's default settings.
gradient_model = GradientBoostingRegressor().fit(X_train, y_train)

#### Tahmin

In [None]:
# RMSE of the gradient boosting model on the training and the test partition.
y_pred, y_pred_t = gradient_model.predict(X_train), gradient_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

#### Model Doğrulama

In [None]:
# 10-fold grid search over depth, number of trees, learning rate and the
# fraction of rows sampled per tree, using all cores.
# subsample is a fraction and must lie in (0, 1]; the invalid candidate 2 has
# been removed, since every fit with it fails.
gradient_params = {"max_depth": list(range(1,10)) ,
                   "n_estimators": [100,200],
                   "learning_rate": [0.1,0.2,0.3,0.4,0.5],
                   "subsample": [0.5,1.0]}
gradient_cv_model = GridSearchCV(gradient_model, gradient_params, cv = 10, n_jobs=-1)
gradient_cv_model.fit(X_train, y_train)

In [None]:
# Refit gradient boosting with the four parameters selected by the grid
# search (n_estimators, max_depth, learning_rate, subsample) and report its
# test RMSE.
gradient_tuned = GradientBoostingRegressor(**gradient_cv_model.best_params_).fit(X_train, y_train)

gradient_test_mse = mean_squared_error(y_test, gradient_tuned.predict(X_test))
np.sqrt(gradient_test_mse)

<a id='5.9'></a>
### 5.9) XGBoost Modeli

In [None]:
# XGBoost regressor with the estimator's default settings.
xgb_model = XGBRegressor().fit(X_train, y_train)

#### Tahmin

In [None]:
# RMSE of the XGBoost model on the training and the test partition.
y_pred, y_pred_t = xgb_model.predict(X_train), xgb_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

#### Model Doğrulama

In [None]:
# 10-fold grid search over column sampling, number of trees, learning rate
# and depth; verbose=2 prints progress for each fit.
xgb = XGBRegressor()
xgb_params = {"colsample_bytree": [0.5,1],
              "n_estimators": [100,200,300],
              "learning_rate": [0.1,0.3,0.5],
              "max_depth": [2,4,6]}
xgb_cv_model = GridSearchCV(xgb, param_grid=xgb_params, cv = 10, verbose =2)
xgb_cv_model = xgb_cv_model.fit(X_train, y_train)

In [None]:
# Refit XGBoost with the four parameters selected by the grid search
# (n_estimators, max_depth, learning_rate, colsample_bytree) and report its
# test RMSE.
xgb_tuned = XGBRegressor(**xgb_cv_model.best_params_).fit(X_train, y_train)

xgb_test_mse = mean_squared_error(y_test, xgb_tuned.predict(X_test))
np.sqrt(xgb_test_mse)

<a id='5.10'></a>
### 5.10) Light GBM Modeli

#### Modelin Oluşturulması

In [None]:
# LightGBM regressor with the estimator's default settings.
lgbm_model = LGBMRegressor().fit(X_train, y_train)

#### Tahmin

In [None]:
# RMSE of the LightGBM model on the training and the test partition.
y_pred, y_pred_t = lgbm_model.predict(X_train), lgbm_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

#### Model Doğrulama

In [None]:
# 10-fold grid search over column sampling, number of trees, learning rate
# and depth, using all cores; verbose=2 prints progress for each fit.
lgbm = LGBMRegressor()
lgbm_params = {"colsample_bytree": [0.3,0.5,0.7,1],
              "n_estimators": [100,200,300,400,500],
              "learning_rate": [0.1,0.2,0.3,0.4,0.5],
              "max_depth": [2,4,6]}
lgbm_cv_model = GridSearchCV(lgbm, lgbm_params, cv = 10, n_jobs=-1, verbose =2 )
lgbm_cv_model.fit(X_train, y_train)

In [None]:
# Refit LightGBM with the parameters selected by the grid search.
# The original built an XGBRegressor here, so the reported "tuned LightGBM"
# score came from a different library than the one that was cross-validated.
lgbm_tuned = LGBMRegressor(n_estimators = lgbm_cv_model.best_params_["n_estimators"],
                           max_depth = lgbm_cv_model.best_params_["max_depth"],
                           learning_rate = lgbm_cv_model.best_params_["learning_rate"],
                           colsample_bytree = lgbm_cv_model.best_params_["colsample_bytree"]).fit(X_train, y_train)

np.sqrt(mean_squared_error(y_test, lgbm_tuned.predict(X_test)))

<a id='5.11'></a>
### 5.11) CatBoost Modeli

#### Model Oluşturma

In [None]:
# CatBoost regressor with default settings; the trailing ';' suppresses the
# cell's return-value display.
cat_model = CatBoostRegressor().fit(X_train, y_train);

#### Tahmin

In [None]:
# RMSE of the CatBoost model on the training and the test partition.
y_pred, y_pred_t = cat_model.predict(X_train), cat_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_t))
print("Eğitim Seti Hata Katsayısı: ", train_rmse)
print("Test Seti Hata Katsayısı: ", test_rmse)

#### Modelin Doğruluğu

In [None]:
# 10-fold grid search over the number of iterations, learning rate and tree
# depth, using all cores; verbose=3 prints progress for each fit.
cat = CatBoostRegressor()
cat_params = {"iterations": [100,150,200,250,300],
              "learning_rate": [0.1,0.2,0.3,0.4,0.5],
              "depth": [2,4,6]}
cat_cv_model = GridSearchCV(cat, cat_params, cv = 10, n_jobs=-1, verbose = 3).fit(X_train, y_train)


In [None]:
# Refit CatBoost with the parameters selected by the grid search.
# The original built an XGBRegressor here; `iterations` and `depth` are
# CatBoost parameter names, so the final model must be a CatBoostRegressor for
# the selected values to take effect.
cat_tuned = CatBoostRegressor(iterations = cat_cv_model.best_params_["iterations"],
                              learning_rate = cat_cv_model.best_params_["learning_rate"],
                              depth = cat_cv_model.best_params_["depth"]).fit(X_train, y_train)

np.sqrt(mean_squared_error(y_test, cat_tuned.predict(X_test)))