In [None]:
# Library

# Firstly used libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Warnings

from warnings import filterwarnings
# Silences every warning for the rest of the session, including deprecation
# notices emitted by the libraries used below.
filterwarnings('ignore')

# None removes the row limit: displaying a DataFrame prints all of its rows.
pd.set_option('display.max_rows', None)

# Data Preprocessing

from sklearn.neighbors import LocalOutlierFactor 
from sklearn import preprocessing

# Modeling

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost
from xgboost import XGBRegressor
!pip install lightgbm
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Model Tuning

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
# Load the raw Hitters table and preview its first rows
hitters = pd.read_csv(filepath_or_buffer="../input/hitters/Hitters.csv")
hitters.head(5)

# DATA UNDERSTANDING

In [None]:
# Work on a copy so the frame loaded from disk stays untouched
df = hitters.copy(deep=True)
print(df.shape)
df.info()

# DATA PREPROCESSING

## 1st Trial : df3

* **df1-->df2-->df3**

* drop NA values
* train-test split
* log and log(1+x) transformation on train data
* detect outliers and drop them on train data

In [None]:
# Remove every row that contains at least one missing value
df1 = df.dropna(how="any")
df1.shape

In [None]:
# Separate the target (Salary) from the features, then hold out 20% of the rows for testing
X = df1.drop(columns="Salary")
y = df1["Salary"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=46
)
print(X_train.shape)
X_train.head()

In [None]:
# From here on df1 no longer refers to the NA-free full table: it is rebound to a copy of
# the training features, so the transforms below do not modify X_train itself.
# Side effect of the rebinding: df1 has no "Salary" column any more, so the split cell
# above cannot be re-run without first re-running the dropna cell.
df1=X_train.copy()

In [None]:
# Skewness of each numeric feature (values between -1 and 1 are treated as acceptable).
# Negative skewness: the longer tail is on the left-hand side of the distribution.
# Positive skewness: the longer tail is on the right-hand side of the distribution.
# numeric_only=True skips the string columns (League, Division, NewLeague); recent pandas
# versions raise a TypeError on them instead of silently dropping them.
df1.skew(axis=0, skipna=True, numeric_only=True)

In [None]:
# Features that will be transformed: those whose skewness lies outside [-1, 1].
# The skewness is computed once and reused; numeric_only=True keeps the string
# columns from raising a TypeError on recent pandas versions.
skewness = df1.skew(axis=0, skipna=True, numeric_only=True)
skewness[(skewness > 1) | (skewness < -1)]

In [None]:
# Per column: does it contain at least one zero?
# (HmRun, CHmRun, PutOuts, Assists, Errors features have zero values.)
df1.isin([0]).any(axis=0)

In [None]:
# numpy.log1p will be used to deal with these zeros
# (because of the zero values in the CHmRun, PutOuts and Assists features).
# Show, for every row holding a zero anywhere, the columns from CHmRun through Assists.
df1.loc[df1.isin([0]).any(axis=1), "CHmRun":"Assists"]

In [None]:
# Right-skewed features get a log transform below: log1p where the column contains
# zeros, plain log otherwise. Density of CHmRun before its transform:
# (kdeplot replaces the deprecated distplot(..., hist=False).)
sns.kdeplot(df1["CHmRun"]);

In [None]:
# CHmRun contains zeros, hence log(1 + x); the column keeps its position in the frame
df1 = df1.assign(CHmRun=np.log1p(df1["CHmRun"]))

In [None]:
# Density of CRuns before its transform (kdeplot replaces the deprecated distplot(..., hist=False))
sns.kdeplot(df1["CRuns"]);

In [None]:
# Plain log for CRuns; the column keeps its position in the frame
df1 = df1.assign(CRuns=np.log(df1["CRuns"]))

In [None]:
# Density of CRBI before its transform (kdeplot replaces the deprecated distplot(..., hist=False))
sns.kdeplot(df1["CRBI"]);

In [None]:
# Plain log for CRBI; the column keeps its position in the frame
df1 = df1.assign(CRBI=np.log(df1["CRBI"]))

In [None]:
# Density of CWalks before its transform (kdeplot replaces the deprecated distplot(..., hist=False))
sns.kdeplot(df1["CWalks"]);

In [None]:
# Plain log for CWalks; the column keeps its position in the frame
df1 = df1.assign(CWalks=np.log(df1["CWalks"]))

In [None]:
# Density of PutOuts before its transform (kdeplot replaces the deprecated distplot(..., hist=False))
sns.kdeplot(df1["PutOuts"]);

In [None]:
# PutOuts contains zeros, hence log(1 + x); the column keeps its position in the frame
df1 = df1.assign(PutOuts=np.log1p(df1["PutOuts"]))

In [None]:
# Density of Assists before its transform (kdeplot replaces the deprecated distplot(..., hist=False))
sns.kdeplot(df1["Assists"]);

In [None]:
# Assists contains zeros, hence log(1 + x); the column keeps its position in the frame
df1 = df1.assign(Assists=np.log1p(df1["Assists"]))

In [None]:
# The target (Salary) is skewed as well: print its skewness and plot its density.
# kdeplot replaces the deprecated distplot(..., hist=False).
print(y_train.skew(axis=0, skipna=True))
sns.kdeplot(y_train);

In [None]:
# y_train= np.log(y_train)

In [None]:
df1.head(3)

In [None]:
# One-hot encode the three categorical columns; drop_first keeps a single
# indicator column for each of them.
df1 = pd.get_dummies(
    df1,
    columns=["League", "Division", "NewLeague"],
    drop_first=True,
)
df1.head(2)

In [None]:
# Split the encoded training frame by column range: the numeric block
# (AtBat .. Errors) and the dummy-variable block (League_N .. NewLeague_N).
numeric_df1 = df1.loc[:, slice("AtBat", "Errors")]
cat_df1 = df1.loc[:, slice("League_N", "NewLeague_N")]

In [None]:
# LOF outlier detection on a copy of the numeric training features
df2 = numeric_df1.copy()

clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
# The predicted labels are discarded; only the fitted scores are used below.
clf.fit_predict(df2)

# More negative score = more outlying; show the ten lowest scores
df2_scores = clf.negative_outlier_factor_
np.sort(df2_scores)[:10]

In [None]:
# Box plot of the LOF scores. Passing x= explicitly pins the horizontal layout:
# the meaning of the first positional argument differs between seaborn versions.
sns.boxplot(x=df2_scores);

In [None]:
# Rows whose LOF score is below the -2 cut-off.
# Despite its name, this variable holds the rows themselves, not only their index labels.
outlier_indexes = df2[df2_scores < -2]
outlier_indexes

In [None]:
# Throw away outliers from df1 and Salary feature also according to these indexes .

In [None]:
# Remove the rows flagged by the LOF cut-off from both the training features and the target.
# The labels are taken from `outlier_indexes` rather than hard-coded, so the cell stays
# correct if the split, the threshold or the data change.
outlier_labels = outlier_indexes.index

# Both frames are re-indexed 0..n-1 afterwards so that they line up row by row.
# NOTE: because of the reset, this cell must not be re-run on its own — the old labels
# would then point at different rows.
df1 = df1.drop(index=outlier_labels).reset_index(drop=True)

# y_train becomes a one-column DataFrame here (it was a Series before).
y_train = pd.DataFrame(y_train).drop(index=outlier_labels).reset_index(drop=True)

print(df1.shape)
print(y_train.shape)

In [None]:
df1.head(2)

In [None]:
# df3: training features and target side by side (column-wise concatenation)
df3 = pd.concat(objs=[df1, y_train], axis="columns")
print(df3.shape)
df3.head()

In [None]:
# df3 is our final data frame. df1 means X train data,  y_train means y train data

In [None]:
y_train[0:5]

In [None]:
X_test.head()

In [None]:
# Encode the categorical columns the same way as for the training frame.
X_test = pd.get_dummies(X_test, columns=["League", "Division", "NewLeague"], drop_first=True)

# Apply the same log / log1p transforms that the training features (df1) received.
# Without them the models are fitted on log-scaled columns but scored on raw ones.
for col in ["CHmRun", "PutOuts", "Assists"]:
    X_test[col] = np.log1p(X_test[col])
for col in ["CRuns", "CRBI", "CWalks"]:
    # NOTE(review): np.log gives -inf for a zero. The training rows had no zeros in
    # these columns; confirm the same holds for the test rows.
    X_test[col] = np.log(X_test[col])

X_test.head(2)

In [None]:
y_test[0:5]

In [None]:
df1.head(2)

In [None]:
# df1a (all numeric features are transformed)

In [None]:
# df1a: like df1, but with log(1 + x) also applied to the remaining numeric columns
df1a = df1.copy()
for column in ("AtBat", "Hits", "HmRun", "Runs", "RBI",
               "Walks", "Years", "CAtBat", "CHits", "Errors"):
    df1a[column] = np.log1p(df1a[column])
df1a.head(2)

## 2nd Trial : df4

* **df1-->df4**

* drop NA values
* train-test split
* log and log(1+x) transformation on train data
* detect outliers and drop them on train data
* standardization

In [None]:
df1.head(3)

In [None]:
# Dummy-variable block of the training features (League_N .. NewLeague_N); left unscaled
cat_df4 = df1.loc[:, slice("League_N", "NewLeague_N")]

In [None]:
# Numeric block of the training features (AtBat .. Errors); this is what gets standardized
numeric_df4 = df1.loc[:, slice("AtBat", "Errors")]
numeric_df4.head()

In [None]:
# Standardize the numeric block and wrap the resulting array back into a
# DataFrame carrying the original column names.
numeric_df4_columns = numeric_df4.columns
standardized_numeric_df4 = pd.DataFrame(
    preprocessing.scale(numeric_df4),
    columns=numeric_df4_columns,
)
print(standardized_numeric_df4.shape)
standardized_numeric_df4.head(2)

In [None]:
y_train.shape

In [None]:
# df4: standardized numeric columns followed by the unscaled dummy columns
df4 = pd.concat([standardized_numeric_df4, cat_df4], axis=1)
print(df4.shape)
# Show a preview only: display.max_rows is None, so a bare `df4` would print every row.
df4.head()

In [None]:
# Here X_train is df4

# MODELING

## df3 modeling

In [None]:
# Candidate regressors, all with their default hyper-parameters
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

# Fit every model on the df3 training features (df1) and print its test-set RMSE
for name, model in models:
    model.fit(df1, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

## df1a (all numeric features are transformed) modeling

In [None]:
# Candidate regressors, all with their default hyper-parameters
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

# df1a was derived from df1 by applying log1p to these ten columns. The test features
# need the same step; otherwise the models are scored on inputs that are on a different
# scale from the ones they were fitted on.
# NOTE(review): this mirrors only the df1 -> df1a step. X_test must also carry the six
# log/log1p transforms that df1 itself received — confirm.
df1a_log1p_columns = ["AtBat", "Hits", "HmRun", "Runs", "RBI",
                      "Walks", "Years", "CAtBat", "CHits", "Errors"]
X_test_df1a = X_test.copy()
X_test_df1a[df1a_log1p_columns] = np.log1p(X_test_df1a[df1a_log1p_columns])

# Fit every model on df1a and print its test-set RMSE
for name, model in models:
    model.fit(df1a, y_train)
    y_pred = model.predict(X_test_df1a)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

## df4 modeling

In [None]:
# Candidate regressors, all with their default hyper-parameters
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

# df4 = standardized numeric columns of df1 + its dummy columns. The test features must go
# through the same mapping, using the *training* mean and standard deviation: a
# StandardScaler fitted on numeric_df4 reproduces the statistics behind df4.
# NOTE(review): X_test must also carry the six log/log1p transforms that df1 received
# for the scaling to be meaningful — confirm.
scaler = preprocessing.StandardScaler().fit(numeric_df4)
X_test_df4 = pd.concat(
    [
        pd.DataFrame(
            scaler.transform(X_test[numeric_df4.columns]),
            columns=numeric_df4.columns,
            index=X_test.index,
        ),
        X_test[cat_df4.columns],
    ],
    axis=1,
)

# Fit every model on df4 and print its test-set RMSE
for name, model in models:
    model.fit(df4, y_train)
    y_pred = model.predict(X_test_df4)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

In [None]:
#the best model is knn for df3: 442 RMSE



# Versions with log-transformed Salary (y_train only) for df3, df1a and df4

## df3 modeling

In [None]:
# Density of the test target (kdeplot replaces the deprecated distplot(..., hist=False))
sns.kdeplot(y_test);

In [None]:
y_test.skew()

In [None]:
# Log-transform the training target.
# NOTE: this rebinds y_train, so re-running the cell would apply the log a second time.
y_train = np.log(y_train)

# Candidate regressors, all with their default hyper-parameters
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

for name, model in models:
    model.fit(df1, y_train)
    # The models now predict log(Salary) while y_test is still on the original scale.
    # Undo the log with exp so the RMSE compares like with like.
    y_pred = np.exp(model.predict(X_test))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

## df1a (all numeric features are transformed) modeling

In [None]:
# Candidate regressors, all with their default hyper-parameters
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

# df1a was derived from df1 by applying log1p to these ten columns; the test features
# need the same step so that the models are scored on the scale they were fitted on.
# NOTE(review): this mirrors only the df1 -> df1a step. X_test must also carry the six
# log/log1p transforms that df1 itself received — confirm.
df1a_log1p_columns = ["AtBat", "Hits", "HmRun", "Runs", "RBI",
                      "Walks", "Years", "CAtBat", "CHits", "Errors"]
X_test_df1a = X_test.copy()
X_test_df1a[df1a_log1p_columns] = np.log1p(X_test_df1a[df1a_log1p_columns])

for name, model in models:
    model.fit(df1a, y_train)
    # y_train is log(Salary) at this point while y_test is still on the original scale:
    # undo the log with exp so the RMSE compares like with like.
    y_pred = np.exp(model.predict(X_test_df1a))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

## df4 modeling

In [None]:
# Candidate regressors, all with their default hyper-parameters
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

# df4 = standardized numeric columns of df1 + its dummy columns. Map the test features the
# same way, using the *training* mean and standard deviation (a StandardScaler fitted on
# numeric_df4 reproduces the statistics behind df4).
# NOTE(review): X_test must also carry the six log/log1p transforms that df1 received
# for the scaling to be meaningful — confirm.
scaler = preprocessing.StandardScaler().fit(numeric_df4)
X_test_df4 = pd.concat(
    [
        pd.DataFrame(
            scaler.transform(X_test[numeric_df4.columns]),
            columns=numeric_df4.columns,
            index=X_test.index,
        ),
        X_test[cat_df4.columns],
    ],
    axis=1,
)

for name, model in models:
    model.fit(df4, y_train)
    # y_train is log(Salary) at this point while y_test is still on the original scale:
    # undo the log with exp so the RMSE compares like with like.
    y_pred = np.exp(model.predict(X_test_df4))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

# Versions with log-transformed Salary (y_train and y_test) for df3, df1a and df4

## df3 modeling

In [None]:
# Put the test target on the log scale as well, so that the RMSE is measured in log units.
# This rebinds y_test: re-running the cell would apply the log a second time.
y_test = np.log(y_test)

# Candidate regressors, all with their default hyper-parameters
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

# Fit every model on the df3 training features (df1) and print its test-set RMSE
for name, model in models:
    model.fit(df1, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

## df1a (all numeric features are transformed) modeling

In [None]:
# Candidate regressors, all with their default hyper-parameters
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

# df1a was derived from df1 by applying log1p to these ten columns; the test features
# need the same step so that the models are scored on the scale they were fitted on.
# NOTE(review): this mirrors only the df1 -> df1a step. X_test must also carry the six
# log/log1p transforms that df1 itself received — confirm.
df1a_log1p_columns = ["AtBat", "Hits", "HmRun", "Runs", "RBI",
                      "Walks", "Years", "CAtBat", "CHits", "Errors"]
X_test_df1a = X_test.copy()
X_test_df1a[df1a_log1p_columns] = np.log1p(X_test_df1a[df1a_log1p_columns])

# Both targets are on the log scale in this section, so the RMSE is in log units
for name, model in models:
    model.fit(df1a, y_train)
    y_pred = model.predict(X_test_df1a)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)

## df4 modeling

In [None]:
# Candidate regressors, all with their default hyper-parameters
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

# df4 = standardized numeric columns of df1 + its dummy columns. Map the test features the
# same way, using the *training* mean and standard deviation (a StandardScaler fitted on
# numeric_df4 reproduces the statistics behind df4).
# NOTE(review): X_test must also carry the six log/log1p transforms that df1 received
# for the scaling to be meaningful — confirm.
scaler = preprocessing.StandardScaler().fit(numeric_df4)
X_test_df4 = pd.concat(
    [
        pd.DataFrame(
            scaler.transform(X_test[numeric_df4.columns]),
            columns=numeric_df4.columns,
            index=X_test.index,
        ),
        X_test[cat_df4.columns],
    ],
    axis=1,
)

# Both targets are on the log scale in this section, so the RMSE is in log units
for name, model in models:
    model.fit(df4, y_train)
    y_pred = model.predict(X_test_df4)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(name, rmse)