# Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings('ignore')
import missingno as msno
import numpy as np
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.impute import KNNImputer
import statsmodels.api as sm
from sklearn.neighbors import LocalOutlierFactor
from lightgbm import LGBMRegressor
import re

In [None]:
#Read the Data

df=pd.read_csv("/kaggle/input/hitters-baseball-data/Hitters.csv")

In [None]:
data=df.copy()
df.head()

# DATA UNDERSTANDING

In [None]:
def data_understanding(df):
    """Print a quick structural overview of ``df``.

    Prints, in order and each under its own banner line: the shape, the
    column dtypes, the first rows (``DataFrame.head``), the
    ``DataFrame.info`` summary and the number of distinct values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    """
    print('############shape##############')
    print(df.shape)
    print('############types##############')
    print(df.dtypes)
    print('############head###############')
    print(df.head())
    print('############info###############')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line after the summary.
    df.info()
    print('############nunique###############')
    print(df.nunique())

In [None]:
# The data set has 322 observations with integer, float and object features.
data_understanding(df)

In [None]:
print("Num of Object Variables:", df.select_dtypes(object).shape[1])
print("Num of Integer Variables:", df.select_dtypes("integer").shape[1])
print("Num of Float Variables:", df.select_dtypes("float").shape[1])

In [None]:
df["League"].value_counts()

In [None]:
df["League"].value_counts().plot.barh()

In [None]:
df["NewLeague"].value_counts()

In [None]:
df["NewLeague"].value_counts().plot.barh()

In [None]:
df["Division"].value_counts()

In [None]:
df["Division"].value_counts().plot.barh()

In [None]:
# Distribution of the target: density-scaled histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot (already used further down
# in this notebook) is its documented replacement.
sns.histplot(df['Salary'], kde=True, stat="density")

# Data Preprocessing

In [None]:
# There are 59 null values in the Hitters data set.
df.isnull().sum().sum()

In [None]:
# All of these NA values come from the "Salary" feature.
df.isnull().sum()

In [None]:
# Preview the first rows whose Salary is missing.
df[df['Salary'].isnull()].head()

In [None]:
msno.bar(df)

# Outliers

In [None]:
#Statistical view for all features
df.describe().T

In [None]:
# Descriptive Analysis
df.describe([0.05,0.25,0.50,0.75,0.95,0.99]).T

In [None]:
sns.boxplot(x = df["Salary"])
plt.show()

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    """Return the ``(low_limit, up_limit)`` outlier fences of one column.

    The fences sit 1.5 times the inter-quantile range below the ``q1``
    quantile and above the ``q3`` quantile of ``dataframe[col_name]``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Name of the column to analyse.
    q1, q3 : float
        Lower and upper quantile levels (defaults 0.05 and 0.95).

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    lower_q, upper_q = (dataframe[col_name].quantile(level) for level in (q1, q3))
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin

In [None]:
# First, with the 1% and 99% quantiles as thresholds, no outliers were found.
lower, upper=outlier_thresholds(df, 'Salary', q1=0.01, q3=0.99)
print(df[(df['Salary']<lower) | (df['Salary']>upper)].shape[0])

In [None]:
# Next, with the 25% and 75% quartiles as thresholds, outliers were found.
# Conclusion: outliers in the dependent variable are assessed with the 25%/75% quartiles.
# Observations that are plausible from a business (domain) standpoint may be left untouched.
lower, upper=outlier_thresholds(df, 'Salary', q1=0.25, q3=0.75)
print(df[(df['Salary']<lower) | (df['Salary']>upper)].shape[0])

In [None]:
# Finally, with the 5% and 95% quantiles as thresholds, no outliers were found.
lower, upper=outlier_thresholds(df, 'Salary')
print(df[(df['Salary']<lower) | (df['Salary']>upper)].shape[0])

In [None]:
# numerical variables
def numeric_cols(df):
    """Return the names of the columns of ``df`` whose dtype is not object.

    Column order follows ``df.columns``.
    """
    selected = []
    for name in df.columns:
        if df[name].dtypes != "O":
            selected.append(name)
    return selected

In [None]:
# Count the outlier observations in every numeric variable, using the 25% and 75% quartiles as thresholds.
for col in numeric_cols(df):
    # Fences based on the 25%/75% quartiles of this column.
    lower, upper = outlier_thresholds(df, col, 0.25, 0.75)
    outside = (df[col] < lower) | (df[col] > upper)
    count = df[outside].shape[0]
    if count == 0:
        print(col, 'no')
    else:
        print(col, 'yes')
        print(count)

In [None]:
def replace_with_thresholds(dataframe, col_name):
    """Cap the values of ``col_name`` at its outlier fences, in place.

    The fences come from ``outlier_thresholds`` with its default quantiles.
    Values above the upper fence are always set to the upper fence; values
    below the lower fence are set to the lower fence only when that fence is
    positive.

    Returns the same (mutated) ``dataframe`` object.
    """
    floor, ceiling = outlier_thresholds(dataframe, col_name)
    if floor > 0:
        below = dataframe[col_name] < floor
        dataframe.loc[below, col_name] = floor
    above = dataframe[col_name] > ceiling
    dataframe.loc[above, col_name] = ceiling
    return dataframe

In [None]:
df=replace_with_thresholds(df, 'Salary')

In [None]:
# Pass the values explicitly as x=, like the Salary boxplot drawn before the
# capping step, so both plots share the same orientation on every seaborn version.
sns.boxplot(x=df['Salary'])

In [None]:
for i in numeric_cols(df):
    # One row of three panels per numeric column: histogram, boxplot, KDE.
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))

    sns.histplot(df[i], bins=10, ax=axes[0])
    axes[0].set_title(i)

    # x= makes the variable assignment explicit instead of relying on how a
    # given seaborn version interprets the first positional argument.
    sns.boxplot(x=df[i], ax=axes[1])
    axes[1].set_title(i)

    sns.kdeplot(df[i], ax=axes[2])
    axes[2].set_title(i)
    plt.show()

In [None]:
# correlation analysis
# Correlate the numeric/boolean columns only: pandas >= 2.0 no longer drops
# text columns (League, Division, NewLeague) in corr() and raises instead.
df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
def correlation(df, size=(20, 15)):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; only its numeric and boolean columns are correlated.
    size : sequence of two numbers, default (20, 15)
        Figure size passed to ``plt.subplots`` as ``figsize``.
    """
    # Honour the caller's size (it used to be ignored in favour of a
    # hard-coded value); the default is a tuple so no mutable object is
    # shared between calls.
    fig, ax = plt.subplots(figsize=size)
    # Select the numeric/boolean columns explicitly: pandas >= 2.0 raises on
    # text columns in corr() instead of silently dropping them.
    numeric = df.select_dtypes(include=["number", "bool"])
    sns.heatmap(numeric.corr(), annot=True, fmt=".2f", ax=ax, cmap="magma")
    ax.set_title("Correlation Matrix", fontsize=20)
    plt.show()

In [None]:
correlation(df)

In [None]:
# Correlation analysis of numerical variables was performed.
def find_corr(df, num_col_names, limit=0.55):
    """Collect the columns whose correlation with ``Salary`` exceeds ``limit``.

    Every column in ``num_col_names`` other than ``'Salary'`` is correlated
    with ``df['Salary']``; the column name and coefficient are printed, and
    the pair is kept when the absolute coefficient is greater than ``limit``.

    Returns
    -------
    dict
        ``{column name: correlation}`` for the retained columns.
    """
    strong = {}
    for name in num_col_names:
        if name == 'Salary':
            continue
        coef = df[[name, 'Salary']].corr().loc[name, 'Salary']
        print(name, coef)
        if abs(coef) > limit:
            strong[name] = coef
    return strong

In [None]:
high_corrs = find_corr(df, numeric_cols(df))

In [None]:
#Two variables with high correlation.
print(high_corrs)

In [None]:
sns.scatterplot(x= df['CRuns'], y=df.Salary)

In [None]:
sns.scatterplot(x= df['CRBI'], y=df.Salary)

# Local Outlier Factor

In [None]:
def lof_scores(df):
    """Fit a Local Outlier Factor model on ``df`` and return its scores.

    The model uses 20 neighbours and a contamination of 0.1. A boxplot of the
    resulting ``negative_outlier_factor_`` values is shown before they are
    returned.
    """
    detector = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    # Only the fitted scores are needed; the inlier/outlier labels are unused.
    detector.fit(df)
    scores = detector.negative_outlier_factor_
    sns.boxplot(scores)
    plt.show()
    return scores
    
def lof(df, df_scores, threshold):
    """Suppress LOF outliers by replacing them with the threshold observation.

    Rows whose score is strictly greater than ``threshold`` are kept as they
    are. Every other row is treated as an outlier: rows scoring exactly
    ``threshold`` are kept, the remaining ones are overwritten with the first
    row that scores exactly ``threshold``. The result lists the kept rows
    first and the outlier block second (each in original order) under a fresh
    ``0..n-1`` index. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations, one row per score.
    df_scores : array-like
        One outlier score per row of ``df``.
    threshold : float
        Cut-off score; expected to be one of the values in ``df_scores``.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns and number of rows as ``df``.

    Raises
    ------
    ValueError
        If there are outliers but no row scores exactly ``threshold``.
    """
    scores = np.asarray(df_scores)
    positions = np.arange(len(df))

    is_inlier = scores > threshold
    inlier_pos = positions[is_inlier]
    outlier_pos = positions[~is_inlier]

    if outlier_pos.size:
        anchor_pos = positions[scores == threshold]
        if anchor_pos.size == 0:
            raise ValueError(
                "threshold must equal one of the scores to provide a replacement row"
            )
        # Rows tied at the threshold keep their own values; anything scoring
        # lower takes the first threshold row. Working with row positions
        # avoids the record-array broadcast, which failed on tied scores.
        outlier_pos = np.where(
            scores[outlier_pos] == threshold, outlier_pos, anchor_pos[0]
        )

    ordered = np.concatenate([inlier_pos, outlier_pos])
    return df.iloc[ordered].reset_index(drop=True)

We will build a separate data set for each preprocessing scenario and use each of them for salary estimation.

# First option

In [None]:
#drop NA values
df1=df.dropna()
df1.shape

In [None]:
#Min-Max Scaler
def minmax_scaler(dataframe, col_names, feature_range=(0,1)):
    """Min-max scale the given columns of ``dataframe`` in place.

    ``"Salary"`` is removed from ``col_names`` before scaling, so the target
    keeps its original values. The remaining columns are overwritten with
    their ``MinMaxScaler(feature_range=feature_range)`` transform.

    Returns the same (mutated) ``dataframe`` object.
    """
    features = []
    for name in col_names:
        if name != "Salary":
            features.append(name)
    scaler = MinMaxScaler(feature_range=feature_range)
    dataframe[features] = scaler.fit_transform(dataframe[features])
    return dataframe

In [None]:
df1=minmax_scaler(df1, numeric_cols(df1))

In [None]:
df1.isnull().sum().sum()

In [None]:
# Variables with 2 categories
def var_two_cat(df):
    """Return the object-dtype columns of ``df`` with exactly two distinct values."""
    binary = []
    for name in df.columns:
        series = df[name]
        if series.dtype == 'O' and series.nunique() == 2:
            binary.append(name)
    return binary

In [None]:
print(var_two_cat(df1))

In [None]:
def label_encoder(df, bins_cols):
    """Label-encode each column in ``bins_cols`` of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted per column and the column is
    overwritten with the encoded values. Returns the same (mutated) ``df``.
    """
    for name in bins_cols:
        df[name] = LabelEncoder().fit_transform(df[name])
    return df

In [None]:
df1=label_encoder(df1, var_two_cat(df1))

In [None]:
df1.name='df1'
df1.head()

# Second option

In [None]:
# Second option: fill the NA values with the mean.
df2=df.copy()

In [None]:
# New variables are derived as ratios of the existing ones.
# The data set contains each player's 1986 statistics, career totals and years of experience.
# We add the per-year averages of the career totals and the ratio of each 1986 statistic to its career total.
def new_var(df):
    """Add ratio features to ``df`` in place and return it.

    Two families of columns are appended, in this order:

    * ``<stat>_new``: the season statistic divided by its career counterpart
      (``C<stat>``). For HmRun, RBI and Walks, missing ratios are replaced
      with 0.
    * ``<career stat>_rate``: the career statistic divided by ``Years``.
    """
    # (season statistic, whether NaN ratios are replaced with 0)
    season_stats = (
        ("AtBat", False),
        ("Hits", False),
        ("HmRun", True),
        ("Runs", False),
        ("RBI", True),
        ("Walks", True),
    )
    for stat, zero_fill in season_stats:
        share = df[stat] / df["C" + stat]
        df[stat + "_new"] = share.fillna(0) if zero_fill else share

    # Output names are spelled out because they are not uniformly derived
    # from the source column (note "Cruns_rate").
    career_rates = {
        "CAtBat": "CAtBat_rate",
        "CHits": "CHits_rate",
        "CHmRun": "CHmRun_rate",
        "CRuns": "Cruns_rate",
        "CRBI": "CRBI_rate",
        "CWalks": "CWalks_rate",
    }
    for total, rate_name in career_rates.items():
        df[rate_name] = df[total] / df["Years"]

    return df

In [None]:
def new_year(df):
    """Add a ``New_Year`` column that bins ``Years`` into experience bands.

    The bands are the right-closed intervals produced by ``pd.cut`` with the
    edges 0, 3, 6, 10, 15, 19 and 24; the column is stored with object dtype.
    ``df`` is modified in place and returned.
    """
    edges = [0, 3, 6, 10, 15, 19, 24]
    experience_band = pd.cut(x=df['Years'], bins=edges)
    df['New_Year'] = experience_band.astype("O")
    return df

In [None]:
df2=new_year(df2)

In [None]:
df2['New_Year'].value_counts().plot.barh()

In [None]:
df2.isnull().sum().sum()

In [None]:
msno.bar(df2)

In [None]:
df2['Salary']=df2['Salary'].fillna(df2.groupby(['New_Year', "League", 'Division'])['Salary'].transform('mean'))

In [None]:
df2.isnull().sum().sum()

In [None]:
df2.head()

In [None]:
df2=minmax_scaler(df2, numeric_cols(df2))

In [None]:
df2=label_encoder(df2, var_two_cat(df2))

In [None]:
df2.head()

In [None]:
def one_hot_cols(df):
    """Return the columns of ``df`` having more than 2 and at most 10 distinct values."""
    picked = []
    for name in df.columns:
        distinct = df[name].nunique()
        if 2 < distinct <= 10:
            picked.append(name)
    return picked
print(one_hot_cols(df2))

In [None]:
df2 = pd.get_dummies(df2, columns=one_hot_cols(df2), drop_first=True)

In [None]:
df2.head()
df2.name='df2'

# Third option

In [None]:
df3=df.copy()

In [None]:
df3=new_year(df3)

In [None]:
df3=minmax_scaler(df3, numeric_cols(df3))

In [None]:
df3=label_encoder(df3, var_two_cat(df3))

In [None]:
print(one_hot_cols(df3))

In [None]:
df3 = pd.get_dummies(df3, columns=one_hot_cols(df3), drop_first=True)

In [None]:
df3.head()

In [None]:
# We fill in the missing observations with the KNN algorithm and create the dataset named 'df_knn_imp':
def knn_imputer(df, n):
    """Return a copy of ``df`` with missing values filled by ``KNNImputer``.

    ``n`` is the number of neighbours. The result is a new DataFrame built
    from the imputed array with the column labels of ``df`` and a default
    index.
    """
    filled = KNNImputer(n_neighbors=n).fit_transform(df)
    return pd.DataFrame(filled, columns=df.columns)

In [None]:
df3=knn_imputer(df3, 4)
df3.head()
df3.name='df3'

In [None]:
df3.isnull().sum().sum()

# Fourth option

In [None]:
#Filling Missing Data with KNN and Suppressing Outliers to create 'df4'
df4=df.copy()

In [None]:
df4.head()

In [None]:
df4=new_year(df4)

In [None]:
df4=minmax_scaler(df4, numeric_cols(df4))

In [None]:
df4=label_encoder(df4, var_two_cat(df4))

In [None]:
df4 = pd.get_dummies(df4, columns=one_hot_cols(df4), drop_first=True)

In [None]:
df4.head()

In [None]:
df4=knn_imputer(df4, 4)

In [None]:
array=np.sort(lof_scores(df4))

array_res=array[array>array[63]]

In [None]:
sns.boxplot(array_res)

In [None]:
df_scores=lof_scores(df4)
df4=lof(df4, df_scores, np.sort(df_scores)[63])

In [None]:
df4.isnull().sum().sum()

In [None]:
df4.name='df4'
df4.head()

# Fifth option

In [None]:
df5=df.copy()

In [None]:
df5=new_year(df5)
df5=new_var(df5)

In [None]:
df5=label_encoder(df5, var_two_cat(df5))

In [None]:
print(one_hot_cols(df5))

In [None]:
df5 = pd.get_dummies(df5, columns=one_hot_cols(df5), drop_first=True)

In [None]:
df5=knn_imputer(df5, 4)

In [None]:
df_scores=lof_scores(df5)

In [None]:
df5=lof(df5, df_scores, np.sort(df_scores)[110])   #90

In [None]:
df5=minmax_scaler(df5, numeric_cols(df5))

In [None]:
df5.isnull().sum().sum()

In [None]:
df5.name='df5'
df5.head()

# Model

In [None]:
def reg_model(df, Y, algo, test_size=0.20):
    """Fit ``algo`` on a train/test split of ``df`` and report both RMSEs.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set; its ``name`` attribute is read for reporting.
    Y : str
        Name of the target column.
    algo : estimator
        Object exposing ``fit`` and ``predict``.
    test_size : float
        Share of rows held out for testing (split uses ``random_state=42``).

    Returns
    -------
    tuple
        ``(df.name, estimator class name, train RMSE, test RMSE)``.
    """
    features = df.drop(Y, axis=1)
    target = df[[Y]]
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, test_size=test_size, random_state=42
    )
    model = algo.fit(X_train, Y_train)
    algo_name = type(model).__name__

    def rmse(X_part, Y_part):
        # Root mean squared error of the fitted model on one partition.
        return np.sqrt(mean_squared_error(Y_part, model.predict(X_part)))

    train_rmse = rmse(X_train, Y_train)
    print(df.name)
    print(algo_name)
    print(f"Train RMSE: {train_rmse}")

    test_rmse = rmse(X_test, Y_test)
    print(f"Test RMSE: {test_rmse}")
    print('###################################')
    return (df.name, algo_name, train_rmse, test_rmse)

In [None]:
models=[LinearRegression(), Ridge(), Lasso(), ElasticNet()]
dataframes=[df1, df2, df3, df4, df5]
results={'frame':[], 'model':[], 'train_error':[], 'test_error':[]}

In [None]:
# Evaluate every baseline model on every data set and collect the metrics.
for frame in dataframes:
    for model in models:
        res = reg_model(frame, 'Salary', model)
        # res is (frame name, model name, train error, test error).
        for key, value in zip(('frame', 'model', 'train_error', 'test_error'), res):
            results[key].append(value)

In [None]:
results=pd.DataFrame(results)
results

In [None]:
sns.barplot(x=results['frame'], y=results['test_error'], hue=results['model'])

# MODEL TUNING

In [None]:
def model_tuning(df, Y, algo_cv, algo, alphas, test_size=0.20, cv=10):
    """Pick ``alpha`` by cross-validation, refit and report both RMSEs.

    ``algo_cv(alphas=alphas, cv=cv)`` is fitted on the training split, its
    ``alpha_`` is handed to ``algo(alpha=...)``, and that model is fitted on
    the same training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set; its ``name`` attribute is read for reporting.
    Y : str
        Name of the target column.
    algo_cv, algo : estimator classes
        Cross-validated searcher and the matching plain estimator.
    alphas : sequence of float
        Candidate regularisation strengths.
    test_size : float
        Share of rows held out for testing (split uses ``random_state=42``).
    cv : int
        Value passed as ``cv`` to ``algo_cv``.

    Returns
    -------
    tuple
        ``(df.name, estimator class name, train RMSE, test RMSE)``.
    """
    features = df.drop(Y, axis=1)
    target = df[[Y]]
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, random_state=42, test_size=test_size
    )

    searcher = algo_cv(alphas=alphas, cv=cv)
    searcher.fit(X_train, Y_train)

    model_tuned = algo(alpha=searcher.alpha_)
    model_tuned.fit(X_train, Y_train)
    algo_name = type(model_tuned).__name__

    def rmse(X_part, Y_part):
        # Root mean squared error of the tuned model on one partition.
        return np.sqrt(mean_squared_error(Y_part, model_tuned.predict(X_part)))

    print(df.name)
    print(algo_name)
    train_rmse = rmse(X_train, Y_train)
    print(f"Train RMSE:{train_rmse}")
    test_rmse = rmse(X_test, Y_test)
    print(f"Test RMSE:{test_rmse}")
    print('#####################')
    return (df.name, algo_name, train_rmse, test_rmse)

In [None]:
models={Ridge: RidgeCV, Lasso:LassoCV, ElasticNet:ElasticNetCV}
results_tuned={'frame':[], 'model':[], 'train_rmse':[], 'test_rmse':[]}
alphas = [0.1,0.01, 0.005, 0.05, 0.001,0.2,0.3,0.5,0.8,0.9,1]

In [None]:
# Tune every (estimator, CV searcher) pair on every data set and collect the metrics.
for frame in dataframes:
    for model in models:
        res = model_tuning(frame, 'Salary', models[model], model, alphas)
        # res is (frame name, model name, train RMSE, test RMSE).
        for key, value in zip(('frame', 'model', 'train_rmse', 'test_rmse'), res):
            results_tuned[key].append(value)

In [None]:
results_tuned=pd.DataFrame(results_tuned)
results_tuned

In [None]:
def light_gbm(df, Y):
    """Fit a default ``LGBMRegressor`` on an 80/20 split and print the test RMSE.

    Column labels are first stripped of every character other than ASCII
    letters, digits and underscores; ``df`` itself is left untouched because
    the renaming works on a copy. ``Y`` is the name of the target column.
    """
    def sanitize(label):
        # Keep only ASCII letters, digits and underscores in a column label.
        return re.sub('[^A-Za-z0-9_]+', '', label)

    cleaned = df.rename(columns=sanitize)
    features = cleaned.drop(Y, axis=1)
    target = cleaned[[Y]]
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, random_state=42, test_size=0.20
    )

    booster = LGBMRegressor()
    booster.fit(X_train, Y_train)
    predicted = booster.predict(X_test, num_iteration=booster.best_iteration_)

    rmse = np.sqrt(mean_squared_error(Y_test, predicted))
    print(rmse)

In [None]:
light_gbm(df4, 'Salary')

In [None]:
light_gbm(df5, 'Salary')

In [None]:
def light_gbm_tuning(df, Y):
    """Grid-search a ``LGBMRegressor`` and print the tuned model's test RMSE.

    Column labels are first stripped of every character other than ASCII
    letters, digits and underscores (on a renamed copy of ``df``). A 10-fold
    ``GridSearchCV`` over ``colsample_bytree``, ``learning_rate``,
    ``n_estimators`` and ``max_depth`` is run on the training part of an
    80/20 split (``random_state=42``). The best parameters found are printed,
    followed by the RMSE of the best estimator on the held-out part.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set including the target column.
    Y : str
        Name of the target column.
    """
    df = df.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    X = df.drop(Y, axis=1)
    Y = df[[Y]]
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, random_state=42, test_size=0.20
    )

    lgbm_grid = {
        'colsample_bytree': [0.4, 0.5, 0.6, 0.9, 1],
        'learning_rate': [0.01, 0.1, 0.5, 1],
        'n_estimators': [20, 40, 100, 200, 500, 1000],
        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
    }

    lgbm_cv_model = GridSearchCV(LGBMRegressor(), lgbm_grid, cv=10,
                                 n_jobs=-1, verbose=2)
    lgbm_cv_model.fit(X_train, Y_train)
    print(lgbm_cv_model.best_params_)

    # Use the search result instead of hard-coded hyper-parameters. With the
    # default refit=True, best_estimator_ has already been refitted on the
    # whole training split with best_params_, so no extra fit is needed.
    lgbm_tuned = lgbm_cv_model.best_estimator_

    Y_pred = lgbm_tuned.predict(X_test)

    print(np.sqrt(mean_squared_error(Y_test, Y_pred)))

In [None]:
light_gbm_tuning(df4, 'Salary')

In [None]:
light_gbm_tuning(df5, 'Salary')