In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(10,6)})
sns.set(font_scale=1.3)
plt.style.use('fivethirtyeight')

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression, chi2

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the Hitters dataset from the relative input directory into `data`.
data = pd.read_csv('../input/hitters/Hitters.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Summarise the columns of the frame before any cleaning is applied.
data.info()

In [None]:
# Percentage of missing values per column (mean of the null mask, scaled to %).
data.isnull().mean() * 100

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna()

In [None]:
# Re-inspect the frame after the rows with missing values were dropped.
data.info()

# <p style="background-color:#80ccff; font-family:'Times New Roman', serif; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Exploratory Data Analysis (EDA)</p>

In [None]:
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(15,15))
cmap = sns.diverging_palette(0, 230, 90, 60, as_cmap=True)
# Select numeric columns explicitly: at this point the frame has not been
# label-encoded yet, and DataFrame.corr() on pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them.
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap=cmap, vmin=-1, vmax=1, cbar_kws={'shrink': .8}, square=True, annot=True, fmt='.2f',
            linewidths=.8)
plt.show()

In [None]:
# Histogram with a fitted normal curve for each of the first 16 columns.
columns = data.columns[:16]
length = len(columns)

# plt.figure (not plt.subplots): plt.subplots would also create one
# full-figure Axes that ends up sitting behind the grid built below.
plt.figure(figsize=(20,30))

# Matplotlib requires an integer row count; `length/2` is a float and is
# rejected by current releases. Floor division keeps the same grid shape.
n_rows = length // 2

for position, column in enumerate(columns, start=1):
    plt.subplot(n_rows, 3, position)
    sns.distplot(data[column], kde=False, fit=stats.norm, hist_kws=dict(edgecolor="black", linewidth=2))
    plt.title(column)

# Spacing is a figure-level setting, so it is applied once instead of per panel.
plt.subplots_adjust(wspace=0.2, hspace=0.5)

In [None]:
# Box plot for each of the first 16 columns.
columns = data.columns[:16]
length = len(columns)

# plt.figure (not plt.subplots): plt.subplots would also create one
# full-figure Axes that ends up sitting behind the grid built below.
plt.figure(figsize=(20,30))

# Matplotlib requires an integer row count; `length/2` is a float and is
# rejected by current releases. Floor division keeps the same grid shape.
n_rows = length // 2

for position, column in enumerate(columns, start=1):
    plt.subplot(n_rows, 3, position)
    sns.boxplot(y=data[column])
    plt.title(column)

# Spacing is a figure-level setting, so it is applied once instead of per panel.
plt.subplots_adjust(wspace=.2, hspace=.5)

In [None]:
# Scatter plot with a regression line of each of the first 16 columns against Salary.
columns = data.columns[:16]
length = len(columns)

# plt.figure (not plt.subplots): plt.subplots would also create one
# full-figure Axes that ends up sitting behind the grid built below.
plt.figure(figsize=(20,30))

# Matplotlib requires an integer row count; `length/2` is a float and is
# rejected by current releases. Floor division keeps the same grid shape.
n_rows = length // 2

for position, column in enumerate(columns, start=1):
    plt.subplot(n_rows, 3, position)
    sns.regplot(x=data[column], y=data['Salary'])
    plt.title(column)

# Spacing is a figure-level setting, so it is applied once instead of per panel.
plt.subplots_adjust(wspace=0.2, hspace=0.5)

In [None]:
# Keep only rows with CAtBat below 12000 (presumably an outlier cut read off
# the plots above — confirm the threshold). The explicit .copy() makes data2
# an independent frame, so the later column assignment in encoder() does not
# write into a slice of `data`.
data2 = data[data['CAtBat'] < 12000].copy()

In [None]:
# On top of the data2 filter, keep only rows with Years below 25 (presumably
# an outlier cut — confirm the threshold). The explicit .copy() makes data3
# an independent frame, so the later column assignment in encoder() does not
# write into a slice of data2.
data3 = data2[data2['Years'] < 25].copy()

In [None]:
# On top of the data3 filter, keep only rows with CHmRun below 500 (presumably
# an outlier cut — confirm the threshold). The explicit .copy() makes data4
# an independent frame, so the later column assignment in encoder() does not
# write into a slice of data3.
data4 = data3[data3['CHmRun'] < 500].copy()

In [None]:
# On top of the data4 filter, keep only rows with CWalks below 1200 (presumably
# an outlier cut — confirm the threshold). The explicit .copy() makes data5
# an independent frame, so the later column assignment in encoder() does not
# write into a slice of data4.
data5 = data4[data4['CWalks'] < 1200].copy()

In [None]:
def encoder(data):
    """Label-encode every object-dtype column of ``data``.

    The columns are overwritten on the frame that was passed in, and that
    same frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns are replaced by integer labels.

    Returns
    -------
    pandas.DataFrame
        The input frame, with its object columns encoded.
    """
    object_columns = data.select_dtypes('object').columns
    for column in object_columns:
        # fit_transform refits on each column, so a fresh encoder per column
        # yields the same labels as reusing a single instance.
        data[column] = LabelEncoder().fit_transform(data[column])
    return data

# <p style="background-color:#80ccff; font-family:'Times New Roman', serif; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Model</p>

In [None]:
# Seed shared by the train/test splits of the benchmark cells.
random_state = 5

# Empty results table: one row per (dataset variant, model) with its test RMSE.
colunas = ['Data', 'Modelo', 'RMSE']
resultado = pd.DataFrame(columns=colunas)

In [None]:
# Benchmark every candidate regressor on the unfiltered frame; the rows are
# stored in `resultado` under the label 'Data 0'.
data = encoder(data)
x = data.drop(columns=['Salary'])
y = data['Salary']
# NOTE(review): every feature is divided by the constant 255, which is not a
# per-feature standardisation. Kept as-is; confirm the intended scaling.
x = x/255

x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=.2, random_state=random_state)

# The scikit-learn tree-based estimators default to random_state=None, so they
# are seeded with the shared random_state to make the reported RMSE reproducible.
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=random_state)),
    ('RF', RandomForestRegressor(random_state=random_state)),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor(random_state=random_state)),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose = False)),
]

# Collect plain rows and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every call.
rows = []
for name, model in models:
    model.fit(x_treino, y_treino)
    y_pred = model.predict(x_teste)
    rmse = np.sqrt(mean_squared_error(y_teste, y_pred))
    rows.append(['Data 0', name, rmse])

resultado = pd.concat([resultado, pd.DataFrame(rows, columns=colunas)], ignore_index=True)

In [None]:
# Benchmark every candidate regressor on data2 (rows with CAtBat < 12000);
# the rows are stored in `resultado` under the label 'Data 2'.
data2 = encoder(data2)
x = data2.drop(columns=['Salary'])
y = data2['Salary']
# NOTE(review): every feature is divided by the constant 255, which is not a
# per-feature standardisation. Kept as-is; confirm the intended scaling.
x = x/255

x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=.2, random_state=random_state)

# The scikit-learn tree-based estimators default to random_state=None, so they
# are seeded with the shared random_state to make the reported RMSE reproducible.
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=random_state)),
    ('RF', RandomForestRegressor(random_state=random_state)),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor(random_state=random_state)),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose = False)),
]

# Collect plain rows and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every call.
rows = []
for name, model in models:
    model.fit(x_treino, y_treino)
    y_pred = model.predict(x_teste)
    rmse = np.sqrt(mean_squared_error(y_teste, y_pred))
    rows.append(['Data 2', name, rmse])

resultado = pd.concat([resultado, pd.DataFrame(rows, columns=colunas)], ignore_index=True)

In [None]:
# Benchmark every candidate regressor on data3 (data2 further restricted to
# Years < 25); the rows are stored in `resultado` under the label 'Data 3'.
data3 = encoder(data3)
x = data3.drop(columns=['Salary'])
y = data3['Salary']
# NOTE(review): every feature is divided by the constant 255, which is not a
# per-feature standardisation. Kept as-is; confirm the intended scaling.
x = x/255

x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=.2, random_state=random_state)

# The scikit-learn tree-based estimators default to random_state=None, so they
# are seeded with the shared random_state to make the reported RMSE reproducible.
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=random_state)),
    ('RF', RandomForestRegressor(random_state=random_state)),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor(random_state=random_state)),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose = False)),
]

# Collect plain rows and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every call.
rows = []
for name, model in models:
    model.fit(x_treino, y_treino)
    y_pred = model.predict(x_teste)
    rmse = np.sqrt(mean_squared_error(y_teste, y_pred))
    rows.append(['Data 3', name, rmse])

resultado = pd.concat([resultado, pd.DataFrame(rows, columns=colunas)], ignore_index=True)

In [None]:
# Benchmark every candidate regressor on data4 (data3 further restricted to
# CHmRun < 500); the rows are stored in `resultado` under the label 'Data 4'.
data4 = encoder(data4)
x = data4.drop(columns=['Salary'])
y = data4['Salary']
# NOTE(review): every feature is divided by the constant 255, which is not a
# per-feature standardisation. Kept as-is; confirm the intended scaling.
x = x/255

x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=.2, random_state=random_state)

# The scikit-learn tree-based estimators default to random_state=None, so they
# are seeded with the shared random_state to make the reported RMSE reproducible.
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=random_state)),
    ('RF', RandomForestRegressor(random_state=random_state)),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor(random_state=random_state)),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose = False)),
]

# Collect plain rows and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every call.
rows = []
for name, model in models:
    model.fit(x_treino, y_treino)
    y_pred = model.predict(x_teste)
    rmse = np.sqrt(mean_squared_error(y_teste, y_pred))
    rows.append(['Data 4', name, rmse])

resultado = pd.concat([resultado, pd.DataFrame(rows, columns=colunas)], ignore_index=True)

In [None]:
# Benchmark every candidate regressor on data5 (data4 further restricted to
# CWalks < 1200); the rows are stored in `resultado` under the label 'Data 5',
# and the accumulated table is then ordered by RMSE.
data5 = encoder(data5)
x = data5.drop(columns=['Salary'])
y = data5['Salary']
# NOTE(review): every feature is divided by the constant 255, which is not a
# per-feature standardisation. Kept as-is; confirm the intended scaling.
x = x/255

x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=.2, random_state=random_state)

# The scikit-learn tree-based estimators default to random_state=None, so they
# are seeded with the shared random_state to make the reported RMSE reproducible.
models = [
    ('Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=random_state)),
    ('RF', RandomForestRegressor(random_state=random_state)),
    ('SVR', SVR()),
    ('GBM', GradientBoostingRegressor(random_state=random_state)),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose = False)),
]

# Collect plain rows and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every call.
rows = []
for name, model in models:
    model.fit(x_treino, y_treino)
    y_pred = model.predict(x_teste)
    rmse = np.sqrt(mean_squared_error(y_teste, y_pred))
    rows.append(['Data 5', name, rmse])

resultado = pd.concat([resultado, pd.DataFrame(rows, columns=colunas)], ignore_index=True)

# Best (lowest RMSE) combinations first.
resultado = resultado.sort_values(by=['RMSE'])