In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading Data

In [None]:
# Importing the libraries necessary for the exercise.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn import preprocessing
from lightgbm import LGBMRegressor
from sklearn.metrics import *
from sklearn.model_selection import *

In [None]:
# Load the Hitters CSV from the Kaggle input directory into the working DataFrame `df`.
# Every later cell reads from (and several mutate) this same object.
df = pd.read_csv("/kaggle/input/hitters/Hitters.csv")

In [None]:
# Show the first 5 rows (the default for `head`) as a quick sanity check of the load.
df.head()

## Exploratory Data Analysis

In [None]:
# First look at the dataset
def check_df(dataframe):
    """Print a quick structural summary of ``dataframe``.

    Three sections are written to stdout: the shape, the per-column dtypes and a
    transposed table of the 0/5/50/95/99/100 % quantiles of the numeric columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Quantiles #####################")
    # numeric_only=True is required on pandas >= 2.0, where quantile() no longer
    # drops object columns silently and raises TypeError on string data instead.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

# Summarise the loaded dataset.
check_df(df)

In [None]:
# Correlation heatmap of the numeric columns only. The frame also holds string
# columns (League, Division, NewLeague); on pandas >= 2.0 a bare df.corr()
# raises on them instead of dropping them silently.
corr = df.select_dtypes(include="number").corr()
plt.figure(figsize=(18,10))
sns.heatmap(corr, annot=True)
plt.show()

In [None]:
# Mean salary split by the league the player will play in next season (NewLeague).
for label, league in (("New League A: ", "A"), ("New League N: ", "N")):
    in_league = df["NewLeague"] == league
    print(label, df[in_league].agg({"Salary": "mean"}))

In [None]:
# Mean salary split by the league the player played in during the season (League).
for label, league in (("League= A", "A"), ("League= N", "N")):
    in_league = df["League"] == league
    print(label, df[in_league].agg({"Salary": "mean"}))

In [None]:
# Mean salary split by the player's division (Division column, values E / W).
for label, division in (("Division= E", "E"), ("Division= W", "W")):
    in_division = df["Division"] == division
    print(label, df[in_division].agg({"Salary": "mean"}))

In [None]:
# Distribution of Salary, the column later used as the model target.
sns.histplot(df["Salary"]);

In [None]:
# Split the columns of a frame into categorical and numerical groups.
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Classify the columns of ``dataframe`` by dtype and cardinality.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified. It is not modified.
    cat_th : int, default 10
        A non-object column with fewer than ``cat_th`` unique values is treated
        as categorical ("numeric but categorical").
    car_th : int, default 20
        An object column with more than ``car_th`` unique values is treated as
        high-cardinality ("categorical but cardinal").

    Returns
    -------
    cat_cols : list
        Object columns plus numeric-but-categorical columns, minus the
        high-cardinality ones.
    cat_but_car : list
        High-cardinality object columns.
    num_cols : list
        Non-object columns that are not numeric-but-categorical.
    num_but_cat : list
        Non-object columns with fewer than ``cat_th`` unique values.

    A short count summary is also printed to stdout.
    """
    # Gather dtype flag and cardinality once per column, then partition.
    n_unique = {name: dataframe[name].nunique() for name in dataframe.columns}
    object_cols, non_object_cols = [], []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            non_object_cols.append(name)

    num_but_cat = [name for name in non_object_cols if n_unique[name] < cat_th]
    cat_but_car = [name for name in object_cols if n_unique[name] > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_columns = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_columns}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, cat_but_car, num_cols, num_but_cat

In [None]:
# Classify the columns of df with the default thresholds (cat_th=10, car_th=20).
# num_cols is reused below by the outlier cells.
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)

## Outliers

In [None]:
# Setting an upper and lower limit for outliers
def outlier_thresholds(dataframe, variable, q1=0.10, q3=0.90):
    """Return the ``(low_limit, up_limit)`` outlier fence for one column.

    The fence is the range between the ``q1`` and ``q3`` quantiles, widened by
    1.5 times that range on each side. The defaults (10th / 90th percentile)
    reproduce the notebook's original, deliberately wide fence; pass
    ``q1=0.25, q3=0.75`` for the classic IQR rule.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    variable : str
        Name of the numeric column to analyse.
    q1, q3 : float
        Lower and upper quantile, with ``0 <= q1 < q3 <= 1``.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.

    Raises
    ------
    ValueError
        If ``q1`` / ``q3`` are not ordered quantiles in ``[0, 1]``.
    """
    if not 0 <= q1 < q3 <= 1:
        raise ValueError(f"expected 0 <= q1 < q3 <= 1, got q1={q1}, q3={q3}")
    lower_quantile = dataframe[variable].quantile(q1)
    upper_quantile = dataframe[variable].quantile(q3)
    spread = upper_quantile - lower_quantile
    up_limit = upper_quantile + 1.5 * spread
    low_limit = lower_quantile - 1.5 * spread
    return low_limit, up_limit

In [None]:
# Report whether a column has any value outside the thresholds from outlier_thresholds.
def check_outlier(dataframe, col_name):
    """Return True if ``dataframe[col_name]`` has at least one outlier.

    A value is an outlier when it lies strictly above the upper or strictly
    below the lower limit returned by ``outlier_thresholds``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Name of the numeric column to check.

    Returns
    -------
    bool
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    # Test the mask itself. Calling .any(axis=None) on the filtered rows asks
    # "is any cell in those rows truthy", which is False when the outlier rows
    # hold only zeros even though outliers exist.
    is_outlier = (values > up_limit) | (values < low_limit)
    return bool(is_outlier.any())

In [None]:
# Print, for each numeric column, whether check_outlier flags it.
for col in num_cols:
    print(col, check_outlier(dataframe=df, col_name=col))

In [None]:
# Replacing outliers with upper and lower limit
def replace_with_thresholds(dataframe, variable):
    """Cap ``dataframe[variable]`` to its outlier thresholds, in place.

    Values above the upper limit are set to the upper limit and values below
    the lower limit are set to the lower limit; missing values are left as
    they are. The limits come from ``outlier_thresholds``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify. The column is overwritten on this object.
    variable : str
        Name of the numeric column to cap.

    Returns
    -------
    None
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # The original only applied the upper limit and left low_limit unused.
    # clip() applies both and replaces the whole column, which also avoids
    # writing float limits into individual cells of an integer column.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

In [None]:
# Apply replace_with_thresholds to every numeric column; df is modified in place.
for col in num_cols:
    replace_with_thresholds(dataframe=df, variable=col)

## Missing Values

In [None]:
# missingno bar chart of df, drawn before rows with missing values are dropped
# (presumably one bar per column showing how complete it is; see missingno docs).
msno.bar(df)
plt.show()

In [None]:
# Drop every row that contains at least one missing value.
# NOTE: inplace=True mutates df itself, so the frame shown by earlier cells is no
# longer what later cells work on; re-run from the load cell to get the raw data back.
df.dropna(inplace=True)

## Feature Engineering

In [None]:
# Season value divided by the matching career total (the C-prefixed column).
df['NEW_Hits'] = df['Hits'] / df['CHits']

# NEW_RBI was computed twice with the same formula; it is kept once.
df['NEW_RBI'] = df['RBI'] / df['CRBI']

df['NEW_Walks'] = df['Walks'] / df['CWalks']

# Hand-weighted score built from the season statistics; Errors count negatively.
df["Player_Season_Success"] = (df["AtBat"] * 4 / 100 + df["Hits"] * 10 / 100 + df["HmRun"] * 12 / 100 +
                               df["Runs"] * 12 / 100 + df["RBI"] * 10 / 100 + df["Walks"] * 12 / 100 + df["Assists"] * 10 / 100 +
                               df["PutOuts"] * 10 / 100 - df["Errors"] * 20 / 100)

df['NEW_PutOuts'] = df['PutOuts'] * df['Years']

# Hits as a percentage of at-bats.
df["Hits_Success"] = (df["Hits"] / df["AtBat"]) * 100

df["NEW_CRBI*CATBAT"] = df['CRBI'] * df['CAtBat']

# NEW_Chits used to be assigned twice (CHits / Years, then CHits * Years), so the
# first value was silently overwritten. NEW_Chits keeps the value it ended up
# with (the product); the per-year average gets its own column further down.
df["NEW_Chits"] = df["CHits"] * df["Years"]

df["NEW_CHmRun"] = df["CHmRun"] * df["Years"]

df["NEW_CRuns"] = df["CRuns"] / df["Years"]

df["NEW_CHits_Per_Year"] = df["CHits"] / df["Years"]

df["NEW_RW"] = df["RBI"] * df["Walks"]

df["NEW_RBWALK"] = df["RBI"] / df["Walks"]

df["NEW_CH_CB"] = df["CHits"] / df["CAtBat"]

df["NEW_CHm_CAT"] = df["CHmRun"] / df["CAtBat"]

# A zero denominator yields +/-inf in the ratio features. Turn those into NaN so
# they are handled as missing values instead of as extreme magnitudes.
ratio_cols = ["NEW_Hits", "NEW_RBI", "NEW_Walks", "Hits_Success", "NEW_CRuns",
              "NEW_CHits_Per_Year", "NEW_RBWALK", "NEW_CH_CB", "NEW_CHm_CAT"]
df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)

## Label Encoding

In [None]:
def label_encoder(dataframe, binary_col):
    """Overwrite ``dataframe[binary_col]`` with integer codes from a LabelEncoder.

    A fresh ``preprocessing.LabelEncoder`` is fitted on the column and its
    transformed output is written back into the same column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify in place.
    binary_col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    encoded = preprocessing.LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded
    return dataframe

In [None]:
# Object-dtype columns that hold exactly two distinct values.
binary_cols = [
    col
    for col in df.columns
    if df[col].dtypes == "O" and df[col].unique().size == 2
]

In [None]:
# Encode only the two-level string columns collected in binary_cols.
# Looping over df.columns, as before, also label-encoded every numeric feature
# and the Salary target, replacing their values with ordinal rank codes.
for col in binary_cols:
    label_encoder(df, col)

## Model

In [None]:
# Salary is the target and every other column is a feature; 30% of the rows are
# held out for testing, with a fixed seed so the split is reproducible.
y = df["Salary"]
X = df.drop(columns=["Salary"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=17
)

In [None]:
# Baseline: LightGBM with default hyper-parameters, scored by RMSE on the test split.
lgb_model = LGBMRegressor()
lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

In [None]:
# Rebind lgb_model to a fresh, unfitted estimator; this is the object handed to
# the grid search below.
lgb_model = LGBMRegressor()

In [None]:
# Hyper-parameter grid for the search: 4 * 4 * 3 * 3 = 144 combinations.
lgbm_params = dict(
    learning_rate=[0.01, 0.1, 0.3, 0.5],
    n_estimators=[500, 800, 1200, 2000],
    max_depth=[3, 5, 8],
    colsample_bytree=[1, 0.8, 0.5],
)

In [None]:
# Exhaustive search over lgbm_params with 10-fold cross-validation on the
# training split. n_jobs=-1 runs the fits in parallel; verbose=2 logs each fit.
lgbm_cv_model = GridSearchCV(
    estimator=lgb_model,
    param_grid=lgbm_params,
    cv=10,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

In [None]:
# Show the parameter combination the grid search selected.
lgbm_cv_model.best_params_

In [None]:
# Refit on the training split with the parameters chosen by the grid search and
# report RMSE on the same test split as the baseline.
lgbm_tuned = LGBMRegressor(**lgbm_cv_model.best_params_)
lgbm_tuned.fit(X_train, y_train)
y_pred = lgbm_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))