# HITTERS:

# Baseball Data
# Description
Major League Baseball Data from the 1986 and 1987 seasons.

# Usage
Hitters

# Format

A data frame with 322 observations of major league players on the following 20 variables.

* AtBat: Number of times at bat in 1986

* Hits: Number of hits in 1986

* HmRun: Number of home runs in 1986

* Runs: Number of runs in 1986

* RBI: Number of runs batted in in 1986

* Walks: Number of walks in 1986

* Years: Number of years in the major leagues

* CAtBat: Number of times at bat during his career

* CHits: Number of hits during his career

* CHmRun: Number of home runs during his career

* CRuns: Number of runs during his career

* CRBI: Number of runs batted in during his career

* CWalks: Number of walks during his career

* League: A factor with levels A and N indicating player's league at the end of 1986

* Division: A factor with levels E and W indicating player's division at the end of 1986

* PutOuts: Number of put outs in 1986

* Assists: Number of assists in 1986

* Errors: Number of errors in 1986

* Salary: 1987 annual salary on opening day in thousands of dollars

* NewLeague: A factor with levels A and N indicating player's league at the beginning of 1987

# Source
This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University. This is part of the data that was used in the 1988 ASA Graphics Section Poster Session. The salary data were originally from Sports Illustrated, April 20, 1987. The 1986 and career statistics were obtained from The 1987 Baseball Encyclopedia Update published by Collier Books, Macmillan Publishing Company, New York.

In [None]:
#####################
# Importing Library:
#####################

import warnings
warnings.simplefilter(action='ignore', category=Warning)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor,ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate,validation_curve
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV, validation_curve
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder

In [None]:
###########
# Setting:
###########

# Pandas display configuration, applied option by option:
# show all columns, print at most 50 rows, use a 500-character wide console,
# never wrap a frame across several blocks, and format floats with 5 decimals.
_display_options = {
    'display.max_columns': None,
    'display.max_rows': 50,
    'display.width': 500,
    'display.expand_frame_repr': False,
    'display.float_format': lambda x: '%.5f' % x,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [None]:
############################################
# Some sets of functions for EDA processing:
############################################

#1

def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    Printed sections, in order: shape, column dtypes, the first ``head`` rows,
    the last ``head`` rows, the number of missing values per column, and the
    0/5/50/95/99/100 % quantiles of the numeric columns (one row per column).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    head : int, optional
        Number of rows shown in the head and tail sections.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # numeric_only=True is required explicitly: since pandas 2.0 the default
    # no longer skips non-numeric columns, and quantiles of object columns
    # (e.g. string-valued factors) raise a TypeError.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)
    
#2

def check_outlier(dataframe, col_name):
    """Tell whether a column contains values outside its outlier limits.

    The limits come from ``outlier_thresholds(dataframe, col_name)``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Name of the column to inspect.

    Returns
    -------
    bool
        True if at least one value is above the upper limit or below the
        lower limit, False otherwise.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    outside_limits = (column > up_limit) | (column < low_limit)
    # Test the mask itself. Calling .any() on the *selected rows* instead
    # checks the truthiness of their cell values, so outlier rows made up of
    # zeros/False would wrongly be reported as "no outlier".
    return bool(outside_limits.any())
#3

def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Compute lower and upper outlier limits for one column.

    The limits lie 1.5 times the inter-quantile range below the ``q1``
    quantile and above the ``q3`` quantile of ``dataframe[col_name]``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Name of the column to analyse.
    q1 : float, optional
        Lower quantile (fraction between 0 and 1).
    q3 : float, optional
        Upper quantile (fraction between 0 and 1).

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    column = dataframe[col_name]
    lower_quantile = column.quantile(q1)
    upper_quantile = column.quantile(q3)
    whisker = 1.5 * (upper_quantile - lower_quantile)
    return lower_quantile - whisker, upper_quantile + whisker

#4

def replace_with_thresholds(dataframe, variable):
    """Cap the values of one column at its outlier limits, in place.

    Values below the lower limit returned by
    ``outlier_thresholds(dataframe, variable)`` are set to that limit and
    values above the upper limit are set to the upper limit. Missing values
    are left as they are.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; its ``variable`` column is overwritten.
    variable : str
        Name of the column to cap.

    Returns
    -------
    None
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip() builds the capped column (upcasting integer data when a limit is
    # fractional) and the whole column is then replaced. Writing fractional
    # limits into an integer column through .loc relies on implicit upcasting,
    # which pandas deprecated in 2.1.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)
    
#5

def missing_values_table(dataframe, na_name=False):
    """Print a table of missing-value counts and percentages per column.

    Only columns with at least one missing value are listed. The table has a
    ``n_miss`` column (count) and a ``ratio`` column (percentage of rows,
    rounded to two decimals), each sorted in descending order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    na_name : bool, optional
        When true, the names of the columns with missing values are returned.

    Returns
    -------
    list or None
        Column names with missing values if ``na_name`` is true, else None.
    """
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().any()]
    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, ratio.round(2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")
    return na_columns if na_name else None
    
#6

def high_correlated_cols(dataframe, plot=False, corr_th=0.90):
    """List columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of the numeric (and boolean) columns is
    computed. A column is reported when its absolute correlation with any
    column positioned before it exceeds ``corr_th``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to analyse; non-numeric columns are ignored.
    plot : bool, optional
        When true, a heatmap of the correlation matrix is shown.
    corr_th : float, optional
        Absolute-correlation threshold above which a column is reported.

    Returns
    -------
    list
        Names of the columns suggested for removal, in column order.
    """
    # Select numeric data explicitly: since pandas 2.0, corr() no longer
    # skips object columns silently and fails on string values.
    corr = dataframe.select_dtypes(include=["number", "bool"]).corr()
    cor_matrix = corr.abs()
    # Strict upper triangle, so every pair is looked at once and the later
    # column of the pair is the one reported. The builtin bool replaces the
    # np.bool alias, which was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1)
    upper_triangle_matrix = cor_matrix.where(upper_mask)
    drop_list = [col for col in upper_triangle_matrix.columns
                 if (upper_triangle_matrix[col] > corr_th).any()]
    if plot:
        # Plotting libraries are imported only when a plot is requested.
        import seaborn as sns
        import matplotlib.pyplot as plt
        sns.set(rc={'figure.figsize': (15, 15)})
        sns.heatmap(corr, cmap="RdBu")
        plt.show()
    return drop_list

# 7

def label_encoder(dataframe, binary_col):
    """Replace one column with its integer label encoding.

    A fresh ``LabelEncoder`` is fitted on ``dataframe[binary_col]`` and the
    encoded values are written back into the same column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the column is overwritten in place.
    binary_col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

# 8

def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of a DataFrame into categorical, numeric and cardinal.

    Object-typed columns are categorical unless they have more than ``car_th``
    distinct values, in which case they are "categorical but cardinal".
    Non-object columns with fewer than ``cat_th`` distinct values are treated
    as categorical too (numeric-looking categoricals); the remaining
    non-object columns are numeric.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose column names are to be grouped.
    cat_th : int, optional
        Distinct-value threshold below which a non-object column counts as
        categorical.
    car_th : int, optional
        Distinct-value threshold above which an object column counts as
        cardinal.

    Returns
    -------
    cat_cols : list
        Categorical column names (object columns first, then numeric-looking
        categoricals, each in column order).
    num_cols : list
        Numeric column names.
    cat_but_car : list
        Names of object columns with high cardinality.

    Examples
    --------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))

    Notes
    -----
    The three returned lists together contain every column exactly once:
    len(cat_cols) + len(num_cols) + len(cat_but_car) == number of columns.
    The numeric-looking categoricals (num_but_cat) are part of cat_cols; only
    their count is printed.
    """
    object_cols = []
    num_but_cat = []
    num_cols = []
    cat_but_car = []

    # One pass over the columns, classifying each by dtype and cardinality.
    for col in dataframe.columns:
        series = dataframe[col]
        distinct = series.nunique()
        if series.dtype == "O":
            object_cols.append(col)
            if distinct > car_th:
                cat_but_car.append(col)
        elif distinct < cat_th:
            num_but_cat.append(col)
        else:
            num_cols.append(col)

    # Object columns come first, then numeric-looking categoricals; cardinal
    # columns are reported separately and therefore removed here.
    cat_cols = [col for col in object_cols + num_but_cat if col not in cat_but_car]

    summary_lines = [
        f"Observations: {dataframe.shape[0]}",
        f"Variables: {dataframe.shape[1]}",
        f'cat_cols: {len(cat_cols)}',
        f'num_cols: {len(num_cols)}',
        f'cat_but_car: {len(cat_but_car)}',
        f'num_but_cat: {len(num_but_cat)}',
    ]
    for line in summary_lines:
        print(line)
    return cat_cols, num_cols, cat_but_car


In [None]:
# Load the Hitters dataset from the Kaggle input directory and preview it.
HITTERS_CSV_PATH = '/kaggle/input/hitters/hitters.csv'
df = pd.read_csv(HITTERS_CSV_PATH)
df.head()

In [None]:
################
# EDA
################
# Notes recorded for this dataset:
#   - shape is (322, 20)
#   - NewLeague, Division and League have dtype "object"
#   - Salary has 59 missing values

check_df(df)

# Determine the categorical and numeric columns.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

# Fill the missing Salary values with KNN imputation over the int64/float64
# columns.
numeric_df = df.select_dtypes(include=["float64", "int64"])
imputer = KNNImputer(n_neighbors=15)
# Label the imputed array with the columns and index of the frame that was
# actually imputed. num_cols leaves out numeric columns with few distinct
# values, so it may not match the imputed columns, and a fresh 0..n-1 index
# would misalign the assignment below if df is not indexed that way.
df_knn = pd.DataFrame(imputer.fit_transform(numeric_df),
                      columns=numeric_df.columns,
                      index=numeric_df.index)
df["Salary"] = df_knn["Salary"]

In [None]:
# Histogram of every numeric column.
for col in num_cols:
    plt.hist(df[col], align='mid', color="skyblue")
    plt.title(col)
    plt.show()

# Several distributions are right-skewed. Based on visual inspection of the
# histograms, only rows satisfying all of the following are kept; the rest
# are treated as outliers:
#   Salary < 1500
#   CRBI   < 1500
#   CRuns  < 1500
#   CHits  < 3000
keep_rows = (
    (df['Salary'] < 1500)
    & (df['CHits'] < 3000)
    & (df["CRBI"] < 1500)
    & (df["CRuns"] < 1500)
)
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments done in later cells do not write into a slice
# (pandas SettingWithCopyWarning).
df = df[keep_rows].copy()

In [None]:
########################################
# New Feature Engineering:
########################################

# Each feature is created exactly once; the original cell assigned
# 'AtBat*RBI', 'Walks*Years' and 'CHmRun/CRBI' twice. The pairs are listed in
# the order in which the new columns are appended to the frame.

# Interaction features: product of two columns, named "<left>*<right>".
product_pairs = [
    ('AtBat', 'RBI'),
    ('Walks', 'Years'),
]
for left, right in product_pairs:
    df[f'{left}*{right}'] = df[left] * df[right]

# Ratio features: quotient of two columns, named "<numerator>/<denominator>".
ratio_pairs = [
    ('AtBat', 'Hits'),
    ('AtBat', 'Runs'),
    ('Hits', 'Runs'),
    ('HmRun', 'RBI'),
    ('Runs', 'RBI'),
    ('Years', 'CAtBat'),
    ('Years', 'CHits'),
    ('Years', 'CHmRun'),
    ('Years', 'CRuns'),
    ('Years', 'CRBI'),
    ('CAtBat', 'CHits'),
    ('CAtBat', 'CRuns'),
    ('CAtBat', 'CRBI'),
    ('CAtBat', 'CWalks'),
    ('CHits', 'CRuns'),
    ('CHits', 'CRBI'),
    ('CHits', 'CWalks'),
    ('CHmRun', 'CRuns'),
    ('CHmRun', 'CRBI'),
    ('CHmRun', 'CWalks'),
    ('CRuns', 'CRBI'),
    ('CRuns', 'CWalks'),
]
for numerator, denominator in ratio_pairs:
    df[f'{numerator}/{denominator}'] = df[numerator] / df[denominator]

# A non-zero value divided by zero gives +/-inf; those are set to 0 here.
# 0/0 gives NaN, which this replacement leaves untouched.
df.replace([np.inf, -np.inf], 0, inplace=True)

In [None]:
# Re-derive the categorical / numeric / cardinal column groups from the
# current state of the frame.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
# Determine which numeric columns contain outliers.
# check_outlier is evaluated once per column (the original called it a second
# time just to print its result) and its boolean is used directly instead of
# being compared with True.
outlier_list = [col for col in num_cols if check_outlier(df, col)]
for col in outlier_list:
    print(col, True)

# Check step:

outlier_list

# Cap every numeric column at its outlier limits.
for col in num_cols:
    replace_with_thresholds(df, col)

In [None]:
# Report the columns that still contain missing values.

missing_values_table(df)

# Output recorded for this dataset:
#               n_miss   ratio
#HmRun/RBI           2 0.65000
#CHmRun/CRBI         1 0.33000
#CHmRun/CWalks       1 0.33000

# Only a few rows are affected, so they are dropped.

ratio_cols_with_na = ["HmRun/RBI", "CHmRun/CWalks", "CHmRun/CRBI"]
df.dropna(subset=ratio_cols_with_na, axis=0, inplace=True)

In [None]:
# Check for multicollinearity among the features.
# The target column "Salary" is left out of the correlation check: otherwise
# it could end up in the drop list itself, and any feature positioned after
# it could be dropped merely for being strongly correlated with the target.
drop_list = high_correlated_cols(df.drop(columns=["Salary"]), corr_th=0.80)

# Inspect the columns selected for removal:
drop_list

# Remove them from the frame.
df.drop(drop_list, axis=1, inplace=True)

In [None]:
### Label encoding ###
# Two-valued columns that are neither integer- nor float-typed get label
# encoded. The dtype is tested with the pandas type predicates because
# `dtype not in [int, float]` only recognises the platform-default widths
# (so an int32 or float32 column would be treated as non-numeric).
binary_cols = [
    col for col in df.columns
    if not pd.api.types.is_integer_dtype(df[col])
    and not pd.api.types.is_float_dtype(df[col])
    and df[col].nunique() == 2
]
# ['League', 'Division', 'NewLeague']
for col in binary_cols:
    df = label_encoder(df, col)

In [None]:
# Robust scaling: every numeric feature is scaled independently with its own
# RobustScaler.
cat_cols, num_cols, cat_but_car = grab_col_names(df)
num_cols.remove("Salary")  # the target column is excluded from scaling

for col in num_cols:
    transformer = RobustScaler()
    df[col] = transformer.fit_transform(df[[col]])

In [None]:
# Final look at the prepared DataFrame: dimensions, per-column dtypes and
# non-null counts (printed by df.info()), and the first rows.
# NOTE(review): this looks like a notebook cell; if so, only the last
# expression (df.head()) is rendered and the bare df.shape shows nothing.
# Confirm, and wrap it in print() if the shape should be visible.
df.shape
df.info()
df.head()

In [None]:
# Result of the preparation steps: split the frame into the target vector y
# and the feature matrix X.
target_col = "Salary"
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
######################################################
# Automated Hyperparameter Optimization
######################################################

# Hyperparameter grids, one per model.

cart_params = {'max_depth': range(1, 20),
               "min_samples_split": range(2, 30)}

# max_features=1.0 (use every feature) replaces the legacy "auto" value. For
# regressors "auto" meant exactly that, and scikit-learn 1.3 removed it.
rf_params = {"max_depth": [5, 8, 15, None],
             "max_features": [5, 7, 1.0],
             "min_samples_split": [8, 15, 20],
             "n_estimators": [200, 500]}


lightgbm_params = {"learning_rate": [0.001, 0.01, 0.1],
                   "n_estimators": [500, 1000],
                   "colsample_bytree": [0.1, 0.3, 0.7, 1]}


catboost_params = {"iterations": [500, 1000],
                   "learning_rate": [0.01, 0.1],
                   "depth": [3, 6]}

# NOTE(review): warm_start only matters when an already fitted estimator is
# fitted again; it presumably has no effect inside the search and merely
# doubles the grid. Confirm before removing.
extraTrees_params = {
    'n_estimators': [500, 1000],
    'max_depth': [2, 16, 50],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 2],
    'max_features': [1.0, 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'warm_start': [True, False],
}


# (display name, estimator, parameter grid)
regressors = [("CART", DecisionTreeRegressor(), cart_params),
              ("RF", RandomForestRegressor(), rf_params),
              ('LightGBM', LGBMRegressor(), lightgbm_params),
              ('Catboost', CatBoostRegressor(verbose=False), catboost_params),
              ('ExtraTrees', ExtraTreesRegressor(), extraTrees_params),
              ]
#################################################################################################


def _cv_rmse(model, features, target, folds=10):
    """Return the RMSE of ``model`` averaged over ``folds`` cross-validation folds."""
    fold_mse = -cross_val_score(model, features, target, cv=folds,
                                scoring="neg_mean_squared_error")
    return np.mean(np.sqrt(fold_mse))


# Tuned estimator per model name.
best_models = {}

for name, regressor, params in regressors:
    print(f"########## {name} ##########")
    # Baseline: cross-validated RMSE with the estimator as constructed above.
    rmse = _cv_rmse(regressor, X, y)
    print(f"RMSE: {round(rmse, 4)} ({name}) ")

    # NOTE(review): no `scoring` is passed, so the search ranks candidates by
    # the estimator's default score rather than by the RMSE reported here.
    gs_best = GridSearchCV(regressor, params, cv=3, n_jobs=-1, verbose=False).fit(X, y)

    # Cross-validated RMSE after applying the best parameters found.
    # set_params updates `regressor` itself and returns it.
    final_model = regressor.set_params(**gs_best.best_params_)
    rmse = _cv_rmse(final_model, X, y)
    print(f"RMSE (After): {round(rmse, 4)} ({name}) ")

    print(f"{name} best params: {gs_best.best_params_}", end="\n\n")

    best_models[name] = final_model


# The Results:

* **########## CART ##########**
* RMSE: 287.532 (CART) 
* RMSE (After): 248.1274 (CART) 
* CART best params: {'max_depth': 5, 'min_samples_split': 17}

* **########## RF ##########**
* RMSE: 193.4727 (RF) 
* RMSE (After): 190.6059 (RF) 
* RF best params: {'max_depth': 8, 'max_features': 7, 'min_samples_split': 8, 'n_estimators': 200}

* **########## LightGBM ##########**
* RMSE: 196.0429 (LightGBM) 
* RMSE (After): 188.5556 (LightGBM) 
* LightGBM best params: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'n_estimators': 500}

* **########## Catboost ##########**
* RMSE: 185.9416 (Catboost) 
* RMSE (After): 186.1933 (Catboost) 
* Catboost best params: {'depth': 6, 'iterations': 1000, 'learning_rate': 0.01}

* **########## ExtraTrees ##########**
* RMSE: 182.8924 (ExtraTrees) 
* RMSE (After): 178.7743 (ExtraTrees) 
* ExtraTrees best params: {'bootstrap': False, 'max_depth': 50, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000, 'warm_start': True}