# Light GBM modeling for Hitters Data Set( RMSE: 183.9 )

In [None]:
# Library

# Firstly used libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Warnings

from warnings import filterwarnings
filterwarnings('ignore')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Data Preprocessing

from sklearn.neighbors import LocalOutlierFactor 
from sklearn import preprocessing

# Modeling

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost
from xgboost import XGBRegressor
!pip install lightgbm
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Model Tuning

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
# read the data
hitters=pd.read_csv("../input/hitters/Hitters.csv")
hitters.head()

# DATA UNDERSTANDING

In [None]:
# There are 322 observations and int-float-object types of features in this data set.

df=hitters.copy()
print(df.shape)
df.info()

In [None]:
#There are 59 null values in Hitters data set
df.isnull().sum().sum()

In [None]:
# All these NA values comes from "Salary" feature
df.isnull().sum()

In [None]:
# Author's reading of the heatmap: several features are highly correlated with
# one another, which is treated as acceptable for the models used below, and no
# independent feature is highly correlated with the target (Salary).
# Only numeric columns are passed to corr(): League/Division/NewLeague are still
# string-valued here, and pandas >= 2.0 raises on such columns instead of
# silently dropping them as older versions did.
plt.figure(figsize=(14,12))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, cmap="BuPu");

In [None]:
# Had the missing values been in a feature rather than in Salary (the target),
# group means like the ones printed here could have been used for imputation:
# the categorical variables appear related to Salary, e.g. the mean differs
# between Division E and Division W.

# (printed label, column, category value) for every group mean to report.
salary_mean_groups = [
    ("New League= A", "NewLeague", "A"),
    ("New League= N", "NewLeague", "N"),
    ("League= A", "League", "A"),
    ("League= N", "League", "N"),
    ("Division= E", "Division", "E"),
    ("Division= W", "Division", "W"),
]
for group_label, group_column, group_value in salary_mean_groups:
    print(group_label, df[df[group_column] == group_value].agg({"Salary": "mean"}))

# DATA PREPROCESSING

## 1st Trial : df3

* **df1-->df2-->df3**

* drop NA values
* log transformation
* detect outliers and drop them
* size= 322-->261

In [None]:
#drop NA values

df1=df.dropna()
df1.shape

In [None]:
# Inspect the skewness of every numeric feature (values between -1 and 1 are
# treated as acceptable).
# Negative skewness: the longer tail of the distribution is on the left-hand side.
# Positive skewness: the longer tail of the distribution is on the right-hand side.
# numeric_only=True skips the string-valued League/Division/NewLeague columns,
# which pandas >= 2.0 would otherwise fail on.
df1.skew(axis = 0, skipna = True, numeric_only = True)

In [None]:
# Keep only the features whose skewness lies outside [-1, 1].
# The skewness is computed once (instead of three times) and restricted to
# numeric columns so the string-valued categorical columns do not break it.
skewness = df1.skew(axis = 0, skipna = True, numeric_only = True)
skewness[(skewness > 1) | (skewness < -1)]

In [None]:
# Applying log transformation for right skewed features and applying exponential for left skewed features
sns.distplot(df1["CAtBat"], hist=False);

In [None]:
df1["CAtBat"]= np.log(df1["CAtBat"])

In [None]:
sns.distplot(df1["CHits"], hist=False);

In [None]:
df1["CHits"]= np.log(df1["CHits"])

In [None]:
sns.distplot(df1["CHmRun"], hist=False);

In [None]:
df1["CHmRun"]=np.log(df1["CHmRun"])

In [None]:
sns.distplot(df1["CRuns"], hist=False);

In [None]:
df1["CRuns"]= np.log(df1["CRuns"])

In [None]:
sns.distplot(df1["CRBI"], hist=False);

In [None]:
df1["CRBI"]= np.log(df1["CRBI"])

In [None]:
sns.distplot(df1["CWalks"], hist=False);

In [None]:
df1["CWalks"]= np.log(df1["CWalks"])

In [None]:
sns.distplot(df1["PutOuts"], hist=False);

In [None]:
df1["PutOuts"]= np.log(df1["PutOuts"])

In [None]:
sns.distplot(df1["Assists"], hist=False);

In [None]:
df1["Assists"]= np.log(df1["Assists"])

In [None]:
df1.head()

In [None]:
# get dummies

df1 =pd.get_dummies(df1,columns= ["League","Division","NewLeague"], drop_first=True)
df1.head(2)

In [None]:
numeric_df1=df1.loc[:, "AtBat":"Errors"]
cat_df1=df1.loc[:, "League_N":"NewLeague_N"]
y_df1= df1["Salary"]

In [None]:
y_df1

In [None]:
# Per-column flag: does the column contain -inf (produced by np.log on a 0)?
# Compared against the float -np.inf directly; matching on the string '-inf'
# only works when pandas coerces it to a float, which is version-dependent.
numeric_df1.eq(-np.inf).any()

In [None]:
# Row labels where CHmRun is -inf; a numeric comparison replaces the
# str-conversion trick (second character of "-inf" == "i").
numeric_df1[numeric_df1["CHmRun"] == -np.inf].index

In [None]:
# Full rows where CHmRun is -inf; a numeric comparison replaces the
# str-conversion trick (second character of "-inf" == "i").
numeric_df1[numeric_df1["CHmRun"] == -np.inf]

In [None]:
numeric_df1["CHmRun"].describe()

In [None]:
# Replace the -inf values in CHmRun (log of 0) with the column median.
# The affected rows are found with a mask and the median is computed from the
# data, instead of hard-coding row labels and a rounded constant. The median is
# evaluated before the assignment, i.e. with the -inf entries still present,
# which is the same figure describe() reports as "50%".

chmrun_neg_inf = numeric_df1["CHmRun"] == -np.inf
numeric_df1.loc[chmrun_neg_inf, "CHmRun"] = numeric_df1["CHmRun"].median()
numeric_df1["CHmRun"].describe()

In [None]:
# Row labels where PutOuts is -inf; a numeric comparison replaces the
# str-conversion trick (second character of "-inf" == "i").
numeric_df1[numeric_df1["PutOuts"] == -np.inf].index

In [None]:
numeric_df1["PutOuts"].describe()

In [None]:
# Replace the -inf values in PutOuts (log of 0) with the column median.
# The affected rows are found with a mask and the median is computed from the
# data, instead of hard-coding row labels and a rounded constant. The median is
# evaluated before the assignment, i.e. with the -inf entries still present,
# which is the same figure describe() reports as "50%".

putouts_neg_inf = numeric_df1["PutOuts"] == -np.inf
numeric_df1.loc[putouts_neg_inf, "PutOuts"] = numeric_df1["PutOuts"].median()
numeric_df1["PutOuts"].describe()

In [None]:
# Row labels where Assists is -inf; a numeric comparison replaces the
# str-conversion trick (second character of "-inf" == "i").
numeric_df1[numeric_df1["Assists"] == -np.inf].index

In [None]:
numeric_df1["Assists"].describe()

In [None]:
# Replace the -inf values in Assists (log of 0) with the column median.
# The affected rows are found with a mask and the median is computed from the
# data, instead of hard-coding row labels and a rounded constant. The median is
# evaluated before the assignment, i.e. with the -inf entries still present,
# which is the same figure describe() reports as "50%".

assists_neg_inf = numeric_df1["Assists"] == -np.inf
numeric_df1.loc[assists_neg_inf, "Assists"] = numeric_df1["Assists"].median()
numeric_df1["Assists"].describe()

In [None]:
# Work on a copy so the outlier handling does not touch numeric_df1.
df2=numeric_df1.copy()

# LOF  Outlier Detection

# Local Outlier Factor with 20 neighbours; contamination=0.1 only affects the
# labels returned by fit_predict, which are discarded here.
clf= LocalOutlierFactor(n_neighbors = 20, contamination = 0.1)
clf.fit_predict(df2)

# Per-row scores from the fitted estimator; the ten smallest are shown so a
# cut-off can be chosen by eye.
df2_scores=clf.negative_outlier_factor_
np.sort(df2_scores)[0:10]

In [None]:
sns.boxplot(df2_scores);

In [None]:
outlier_indexes=df2.loc[df2_scores< -1.73878565]
outlier_indexes

In [None]:
# Throw away outliers from Salary feature also according to these indexes .

In [None]:
# Remove the LOF outlier rows from the target as well. The labels come from
# outlier_indexes (the rows selected by the score threshold) instead of being
# hard-coded, so the target stays in sync if the data or the threshold changes.
y_df1=pd.DataFrame(y_df1).drop(index=outlier_indexes.index)
y_df1=y_df1.reset_index(drop=True)
print(y_df1.shape )
y_df1.head(2)

In [None]:
df2= df2.loc[df2_scores> -1.73878565]
df2=df2.reset_index(drop=True)
print(df2.shape)
df2.head(2)

In [None]:
# Throw away outliers from dummies also according to these indexes .

In [None]:
# Remove the LOF outlier rows from the dummy columns as well. The labels come
# from outlier_indexes (the rows selected by the score threshold) instead of
# being hard-coded, so the dummies stay in sync with the numeric features.
cat_df1=pd.DataFrame(cat_df1).drop(index=outlier_indexes.index)
cat_df1=cat_df1.reset_index(drop=True)
print(cat_df1.shape)
cat_df1.head(2)

In [None]:
y_df1

In [None]:
df3= pd.concat([df2,y_df1,cat_df1], axis=1)
df3

## 2nd Trial : df4

* **df2-->df10-->df4**

* drop NA values
* log transformation
* detect outliers and drop them
* standardize df2
* size= 322-->261

In [None]:
df2.head(2)

In [None]:
df2_columns=df2.columns
standardized_df2=preprocessing.scale(df2)
standardized_df2=pd.DataFrame(standardized_df2, columns=df2_columns)
standardized_df2.head(2)

In [None]:
# Rebuild the target for the standardized trial: Salary from the NA-free data,
# minus the LOF outlier rows. The labels come from outlier_indexes instead of
# being hard-coded, so they match the rows removed from the features.
y_df10=hitters.dropna()["Salary"]
y_df10=pd.DataFrame(y_df10).drop(index=outlier_indexes.index)
y_df10=y_df10.reset_index(drop=True)
y_df10.shape

In [None]:
cat_df10=cat_df1.reset_index(drop=True)

In [None]:
df4= pd.concat([standardized_df2,y_df10,cat_df10], axis=1)
df4

## 3rd Trial : df5

* **df4-->df5**

* drop NA values
* log transformation
* detect outliers and drop them
* standardize df2
* generating variables
* size= 322-->261

In [None]:
df5=df4.copy()

In [None]:
plt.figure(figsize=(12,10))
sns.heatmap(df5.corr(), annot=True, cmap=plt.cm.Reds);

In [None]:
# Derive ratio features on df5 (a copy of df4).
# NOTE(review): df4 holds the standardized numeric columns, so numerators and
# denominators here are centred around 0; the ratios can be very large when a
# denominator is close to 0 and do not have the per-year / per-career meaning
# their names suggest. Confirm this is intended.
df5["walks/cwalks"]= df5["Walks"]/df5["CWalks"]
df5["CAtBat/Years"]= df5["CAtBat"]/df5["Years"]
df5["CHits/Years"]= df5["CHits"]/df5["Years"]
df5["CHmRun/Years"]= df5["CHmRun"]/df5["Years"]
df5["Hits/CHits"]= df5["Hits"]/df5["CHits"]
df5["Assists/Errors"]= df5["Assists"]/df5["Errors"]
df5["CHits/CRBI"]= df5["CHits"]/df5["CRBI"]
df5["HmRun/CHmRun"]= df5["HmRun"]/df5["CHmRun"]
df5["CRBI/RBI"]= df5["CRBI"]/df5["RBI"]
df5["CRuns/CHits"]= df5["CRuns"]/df5["CHits"]
df5["AtBat/PutOuts"]= df5["AtBat"]/df5["PutOuts"]
df5["Walks/Years"]=df5["Walks"]/df5["Years"]

# Correlation heatmap including the newly generated features.
plt.figure(figsize=(14,12))
sns.heatmap(df5.corr(), annot=True, cmap=plt.cm.Blues);

## 4th Trial : df6

* **df3-->df6**

* drop NA values
* log transformation
* detect outliers and drop them
* generating variables
* size= 322-->261

In [None]:
# 4th trial: start from df3 (log-transformed, outliers removed, not
# standardized) and add ratio features.
df6=df3.copy()

df6["walks/cwalks"]= df6["Walks"]/df6["CWalks"]
df6["CAtBat/Years"]= df6["CAtBat"]/df6["Years"]
df6["CHits/Years"]= df6["CHits"]/df6["Years"]
df6["CHmRun/Years"]= df6["CHmRun"]/df6["Years"]
# Fixed: the denominator previously came from df5 (the standardized trial),
# mixing raw Hits with standardized CHits. Scores reported for df6/df7 were
# obtained with the old feature and need to be re-run.
df6["Hits/CHits"]= df6["Hits"]/df6["CHits"]
df6["CHits/CRBI"]= df6["CHits"]/df6["CRBI"]
df6["CRBI/RBI"]= df6["CRBI"]/df6["RBI"]
df6["CRuns/CHits"]= df6["CRuns"]/df6["CHits"]
df6["AtBat/PutOuts"]= df6["AtBat"]/df6["PutOuts"]
df6["Walks/Years"]=df6["Walks"]/df6["Years"]

# MODELING

## df3 modeling

In [None]:
# Baseline comparison on df3: 80/20 hold-out split, every model fitted with its
# default settings, test RMSE printed per model.
# NOTE(review): apart from the split, no random_state is fixed, so the scores
# of the randomized models can vary between runs.
y = df3["Salary"]
X = df3.drop("Salary", axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=46
)

# (display name, estimator) pairs, in the order they are evaluated.
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

## df4 modeling

In [None]:
# Baseline comparison on df4: 80/20 hold-out split, every model fitted with its
# default settings, test RMSE printed per model.
# NOTE(review): apart from the split, no random_state is fixed, so the scores
# of the randomized models can vary between runs.
y = df4["Salary"]
X = df4.drop("Salary", axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=46
)

# (display name, estimator) pairs, in the order they are evaluated.
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

## df5 modeling

In [None]:
# Baseline comparison on df5: 80/20 hold-out split, every model fitted with its
# default settings, test RMSE printed per model.
# NOTE(review): apart from the split, no random_state is fixed, so the scores
# of the randomized models can vary between runs.
y = df5["Salary"]
X = df5.drop("Salary", axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=46
)

# (display name, estimator) pairs, in the order they are evaluated.
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

## df6 modeling

In [None]:
# Baseline comparison on df6: 80/20 hold-out split, every model fitted with its
# default settings, test RMSE printed per model.
# NOTE(review): apart from the split, no random_state is fixed, so the scores
# of the randomized models can vary between runs.
y = df6["Salary"]
X = df6.drop("Salary", axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=46
)

# (display name, estimator) pairs, in the order they are evaluated.
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

# TUNING

## Feature Importance

In [None]:
# LightGBM feature importance on df6: fit a default LGBMRegressor on the train
# split, report the hold-out RMSE, then plot the per-feature importances.


y=df6["Salary"]
X=df6.drop("Salary", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=46)

lgb_model = LGBMRegressor().fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so the score was silently lost before.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


# One row per feature; the factor 100 only rescales the axis of the plot.
Importance = pd.DataFrame({'Importance':lgb_model.feature_importances_*100}, 
                          index = X_train.columns)


# Horizontal bar chart, least important feature at the bottom.
Importance.sort_values(by = 'Importance', 
                       axis = 0, 
                       ascending = True).plot(kind = 'barh', 
                                              color = 'r', figsize=(14,12))

plt.xlabel('Variable Importance')
plt.gca().legend_ = None

### LGB Feature Selection and LGBM : df7

In [None]:
# df7

# Feature Selection( throw away the features which are not so important for LGBM)

df7=df6.copy()
df7= df7.drop("Division_W", axis=1)
df7= df7.drop("League_N", axis=1)

In [None]:
y=df7["Salary"]
X=df7.drop("Salary", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=46)

lgb_model = LGBMRegressor().fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
?LGBMRegressor

# Default Parameters

boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    subsample_for_bin=200000,
    objective=None,
    class_weight=None,
    min_split_gain=0.0,
    min_child_weight=0.001,
    min_child_samples=20,
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=None,
    n_jobs=-1,
    silent=True,
    importance_type='split',
    kwargs,

In [None]:
lgbm_params= { "boosting_type" : ["dart"],
              "learning_rate": [0.09, 0.1,0.11, 0.2],
              "n_estimators": [90,100,110,150],
              "num_leaves" :[30,31,32],
              "max_depth": [7,10],
              "colsample_bytree": [1,0.8,0.5,0.4]}

In [None]:
# Exhaustive 10-fold grid search over lgbm_params on the training split.
# The grid has 1*4*4*3*2*4 = 384 combinations, i.e. 3840 fits; n_jobs=-1 is
# passed to run them in parallel.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score rather than by RMSE, the metric reported in this
# notebook — confirm this is intended.
lgbm_cv_model = GridSearchCV(lgb_model, 
                             lgbm_params, 
                             cv = 10, 
                             n_jobs = -1, 
                             verbose =2).fit(X_train, y_train)

In [None]:
lgbm_cv_model.best_params_

In [None]:
tuned_lgbm= LGBMRegressor(**lgbm_cv_model.best_params_).fit(X_train, y_train)
y_pred= tuned_lgbm.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))


In [None]:
# Check the training error to look for overfitting: refit with the best grid
# search parameters and score on the data the model was trained on.

tuned_lgbm2= LGBMRegressor(**lgbm_cv_model.best_params_).fit(X_train, y_train)
# Fixed: predict with the model fitted in this cell (tuned_lgbm2); the previous
# code used tuned_lgbm from another cell, leaving tuned_lgbm2 unused.
y_pred2= tuned_lgbm2.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred2))

In [None]:
# LightGBM feature importance for the final data frame (df7) and the tuned
# model: refit with the best grid search parameters, report the hold-out RMSE,
# then plot the per-feature importances.


y=df7["Salary"]
X=df7.drop("Salary", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=46)

lgb_model = LGBMRegressor(**lgbm_cv_model.best_params_).fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so the score was silently lost before.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


# One row per feature; the factor 100 only rescales the axis of the plot.
Importance = pd.DataFrame({'Importance':lgb_model.feature_importances_*100}, 
                          index = X_train.columns)


# Horizontal bar chart, least important feature at the bottom.
Importance.sort_values(by = 'Importance', 
                       axis = 0, 
                       ascending = True).plot(kind = 'barh', 
                                              color = 'r', figsize=(14,12))

plt.xlabel('Variable Importance')
plt.gca().legend_ = None

# REPORTING


## 1. Data Understanding
> 1. Kütüphaneler import edildi.Hitters veri setindeki gözlem sayısı, değişken türleri, eksik değerler ve değişkenler arası korelasyonlar incelendi.

> 2. Salary değişkenine atama yapılabilecek bir ilişki var mı diye araştırıldı, Division kategorilerinin maaşlarında anlamlı farklılık olduğu gözlemlense de, hedef değişken olması sebebiyle atama yapılmadı.


## 2. Data Preprocessing
> 1. Veri seti,birbirinden farklı 4 ön işleme sürecinden geçti. Nihai olarak 4 data frame oluşturuldu. (df3-df4-df5-df6)

> 2. df3 -> NA değerleri atıldı.Skewness değeri -1 ile 1 arasında olmayanların sağa çarpık oldukları distplot ile gözlemlendi.

> 3. df3 -> CAtBat - CHits - CHmRun - CRuns - CRBI - CWalks - PutOuts - Assists değişkenlerine log transformation yapıldı. -∞ (-inf) gelen değerlere median ataması yapıldı.Çarpıklıkları giderildi.

> 4. df3 -> df1 veri seti, kategorik, numerik ve hedef değişkene göre split edildi. Kategorik değişkenlere dummy dönüşümü yapıldı.

> 5. df3 -> LOF (Local Outlier Factor) ile outlierlar tespit edildi. Boxplot ile incelendi, diğer skorlardan çok uzakta kalan 2 gözlemin indexi belirlendikten sonra kategorik, numerik ve hedef değişken içeren dflerden ayrı ayrı atıldı.

> 6. df3 -> concat ile kategorik, numerik ve hedef değişken içeren df ler birleştirildi. 261 gözlem içeren nihai df oluşturuldu ve df3 ismi verildi.

> 7. df4 -> Yukarıda oluşturulan df3 standardize edildi. (mean 0, std=1)

> 8. df5 -> Yukarıda oluşturulan df4 üzerine yeni değişkenler eklendi. Bunlar: walks/cwalks, CAtBat/Years, CHits/Years , CHmRun/Years, Hits/CHits, Assists/Errors, CHits/CRBI, HmRun/CHmRun, CRBI/RBI, CRuns/CHits, AtBat/PutOuts, Walks/Years

> 9. df6 -> Yukarıda oluşturulan df3 üzerine yeni değişkenler eklendi. Bunlar: walks/cwalks, CAtBat/Years, CHits/Years , CHmRun/Years, Hits/CHits, CHits/CRBI, CRBI/RBI, CRuns/CHits, AtBat/PutOuts, Walks/Years   


## 3. Modeling

> 1. Yukarıda belirtilen 4 data frame modellere fit ettirildi. ( Regression-Ridge- Lasso-Elastic Net- KNN - CART - RF- SVR - GBM - XGBoost - LightGBM - CatBoost)

> 2. **df3 -> en iyi RMSE -> LightGBM 191.9**

> 3. **df4 -> en iyi RMSE -> LightGBM 194.7**

> 4. **df5 -> en iyi RMSE -> CatBoost 198.8**

> 5. **df6 -> en iyi RMSE -> LightGBM 189.4**

> 6. 4 data setinde genel olarak Light GBM ile daha düşük hata elde edildi. En iyi sonuç ise 189 ile df6ya fit edilen Light GBM ile elde edildi. 


## 4. Tuning

> 1. Light GBM feature selection ile önemi çok az olan 2 değişkeni attım -> **df7-> LightGBM 186.8**

> 2. Hiperparametrelerden, boosting_type, learning_rate, n_estimators, num_leaves, max_depth, colsample_bytree değerleri denenerek en iyi model bulunmaya çalışıldı -> **LightGBM 183.9**


# Sonuç:
> **Nihai RMSE 183.9 olarak hesaplandı.**

> **Train verisi üzerinde overfitting kontrolü yapıldı ve olmadığı gözlemlendi.**

> **Nihai modele göre en önemli değişkenler saptandı.**
