# REGRESSION WITH HITTERS DATASET

**Purpose**: Carry out a machine learning project that estimates baseball players' salaries from their 1986 season statistics, career statistics and salary information.

In [None]:
import pandas as pd
import numpy as np

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Notebook-wide pandas display settings: show every column, cap printed rows
# at 20, print floats with three decimals, and widen the output to 170 chars.
display_settings = {
    'display.max_columns': None,
    'display.max_rows': 20,
    'display.float_format': lambda x: '%.3f' % x,
    'display.width': 170,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# Read the Hitters CSV file into the working DataFrame `df`.
df=pd.read_csv("../input/hitters-baseball-data/Hitters.csv")

In [None]:
# Show the first rows of `df` as a quick sanity check.
df.head()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

* DELETING NULL VALUES

In [None]:
# Remove every row that has at least one missing value, modifying `df` in place.
df.dropna(inplace=True)

* FEATURE ENGINEERING

AtBat: Number of times at bat in the 1986 season
Hits: Number of hits in the 1986 season
HmRun: Number of home runs in the 1986 season
Runs: Number of runs the player scored for his team in the 1986 season
RBI: Number of runs batted in in the 1986 season (runs scored by teammates as a result of the player's batting)
Walks: Number of walks in the 1986 season (times the player reached first base because of the opposing pitcher's errant pitches)
Years: Number of years the player has played in the major leagues
CAtBat: Number of times at bat during the player's career
CHits: Number of hits during the player's career
CHmRun: Number of home runs during the player's career
CRuns: Number of runs scored during the player's career
CRBI: Number of runs batted in during the player's career
CWalks: Number of walks during the player's career
League: A factor with levels A and N indicating the player's league at the end of 1986
Division: A factor with levels E and W indicating the player's division at the end of 1986
PutOuts: Number of put outs in the 1986 season
Assists: Number of assists in the 1986 season
Errors: Number of errors the player made in the 1986 season
Salary: The player's 1987 annual salary on opening day (in thousands of dollars)
NewLeague: A factor with levels A and N indicating the player's league at the start of the 1987 season

**dependent variable** : Salary

In [None]:
# Difference between the Walks and Errors columns.
df["col_fea"] = df["Walks"] - df["Errors"]
# Binary flag: 1 where the difference is strictly positive, 0 otherwise
# (a missing difference also yields 0). Computed as one vectorised
# comparison and stored as int64.
df["col_fea_2"] = (df["col_fea"] > 0).astype("int64")

In [None]:
# Ratio of Hits to AtBat for each row.
df["hit_rate"]=df["Hits"]/df["AtBat"]
# Display the largest ratio found.
df["hit_rate"].max()

In [None]:
# Interaction term: RBI multiplied by Walks.
df["feature"]=df["RBI"]*df["Walks"]

In [None]:
# Per-year averages: each career total divided by the Years column, stored
# under the career column's name with an "_n" suffix.
for career_stat in ("CAtBat", "CHits", "CHmRun", "CRuns", "CRBI", "CWalks"):
    df[career_stat + "_n"] = df[career_stat] / df["Years"]

In [None]:
# Ratio of each season figure to the matching career total (the career
# column is the season column's name prefixed with "C").
season_to_ratio = {
    "Walks": "n_Walks",
    "AtBat": "n_Atbat",
    "Hits": "n_Hits",
    "Runs": "n_Runs",
    "RBI": "n_RBI",
}
for season_stat, ratio_col in season_to_ratio.items():
    df[ratio_col] = df[season_stat] / df["C" + season_stat]

In [None]:
# Runs minus Errors.
df["new_feat"]=df["Runs"]-df["Errors"]
# Assists minus Errors.
df["new_feat2"]=df["Assists"]-df["Errors"]
# Runs divided by the product of CRuns and Years.
df['New_Feature_4'] = df['Runs'] / (df['CRuns'] * df['Years'])

In [None]:
# Flag rows by comparing Runs with CRuns_n (CRuns divided by Years):
# 1 when Runs is strictly greater, 0 when it is less than or equal.
df.loc[(df["Runs"] > df["CRuns_n"]),"new_runs"]=1
df.loc[(df["Runs"] <= df["CRuns_n"]),"new_runs"]=0

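This function combines the values of two columns into a single new column named `<col1>_<col2>`, whose values have the form `<value1>_<value2>`.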

In [None]:
def twice_col(df,col1,col2):
    """Add a column that combines the values of ``col1`` and ``col2``.

    For every pair of distinct values ``(a, b)`` taken from ``df[col1]`` and
    ``df[col2]``, the rows where ``df[col1] == a`` and ``df[col2] == b``
    receive the string ``"<a>_<b>"`` in a column named ``"<col1>_<col2>"``.

    Args:
        df: DataFrame to modify in place.
        col1: Name of the column supplying the first half of the label.
        col2: Name of the column supplying the second half of the label.

    Returns:
        None. The new column is written directly into ``df``.

    Note:
        Rows are selected with ``==``, so a row holding NaN in either column
        matches no pair and is left without a label.
    """
    combined_name = str(col1) + "_" + str(col2)
    first_values = df[col1].unique().tolist()
    second_values = df[col2].unique().tolist()
    # Visit each value pair exactly once; repeating the same assignment for
    # every element of both value lists would not change the result.
    for first in first_values:
        for second in second_values:
            pair_mask = (df[col1] == first) & (df[col2] == second)
            df.loc[pair_mask, combined_name] = str(first) + "_" + str(second)


In [None]:
# Build the "League_NewLeague" column from the League and NewLeague values.
twice_col(df,"League","NewLeague")

In [None]:
# Rebuild a contiguous 0..n-1 row index after rows were dropped.
# drop=True discards the old row labels instead of inserting them as an
# "index" column, which would otherwise be treated as a numeric feature
# (scaled and fed to the model) further down.
df.reset_index(drop=True, inplace=True)

In [None]:
# League_NewLeague takes the values 'A_A', 'N_N', 'N_A' and 'A_N'.
# status is 0.0 for 'N_N' and 'A_N' and 1.0 for every other value.
# Computed in one vectorised step, so it does not rely on the row labels
# being 0..n-1; the result is kept as float.
df["status"] = (~df["League_NewLeague"].isin(['N_N','A_N'])).astype(float)

In [None]:
def generate(df,c,a,b,t,y,y1,y2):
    """Bucket column ``c`` into three labels stored in column ``t``.

    Args:
        df: DataFrame to modify in place.
        c: Name of the column whose values are bucketed.
        a: Lower threshold; values below it receive ``y``.
        b: Upper threshold; values from ``a`` up to (not including) ``b``
            receive ``y1``, values at or above ``b`` receive ``y2``.
        t: Name of the column that receives the labels.
        y: Label for values below ``a``.
        y1: Label for values in ``[a, b)``.
        y2: Label for values at or above ``b``.

    Returns:
        The label column ``df[t]``.
    """
    # Each rule pairs a row selector with the label it assigns; the selector
    # is evaluated against the column right before its own assignment.
    rules = (
        (lambda values: values < a, y),
        (lambda values: (values >= a) & (values < b), y1),
        (lambda values: values >= b, y2),
    )
    for selects, label in rules:
        df.loc[selects(df[c]), t] = label
    return df[t]

In [None]:
# Summary statistics of the Years column.
df["Years"].describe()
# Bucket Years into "experience": below 4 -> beginner, 4 up to (not
# including) 11 -> intermediate, 11 and above -> senior.
generate(df,"Years",4,11,"experience","beginner","intermediate","senior")

In [None]:
# Bucket Errors into "condition": below 3 -> verygood, 3 up to (not
# including) 11 -> good, 11 and above -> bad.
generate(df,"Errors",3,11,"condition","verygood","good","bad")
# Show the first rows with the new columns.
df.head()

In [None]:
# Build the "experience_condition" column from the two bucket columns.
twice_col(df,"experience","condition")

In [None]:
# Interaction term: RBI multiplied by HmRun.
df["new_col"]=df["RBI"]*df["HmRun"]

# Ratio of HmRun to Hits.
df["New_col2"]=df["HmRun"]/df['Hits']

# Product of the six career counters. The first factor is cast to float64 so
# the whole chain is evaluated in floating point: multiplying six integer
# columns can exceed the int64 range (about 9.2e18), and integer overflow
# wraps around silently instead of raising.
df["feature_t"]=(
    df["CAtBat"].astype("float64")
    *df["CHits"]*df["CHmRun"]*df["CRuns"]*df["CRBI"]*df["CWalks"]
)

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Sort the columns of ``dataframe`` into categorical and numeric groups.

    Args:
        dataframe: DataFrame whose columns are classified.
        cat_th: A non-object column with fewer distinct values than this is
            treated as categorical.
        car_th: An object column with more distinct values than this is
            treated as high-cardinality and excluded from the categoricals.

    Returns:
        A tuple ``(cat_cols, num_cols, cat_but_car)`` of column-name lists:
        categorical columns (object columns followed by low-cardinality
        non-object columns, minus the high-cardinality ones), numeric columns,
        and high-cardinality object columns. A short summary is also printed.
    """
    # First pass: separate object-typed columns from everything else.
    object_cols, non_object_cols = [], []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            non_object_cols.append(name)

    # Second pass: apply the two cardinality thresholds.
    num_but_cat = [name for name in non_object_cols
                   if dataframe[name].nunique() < cat_th]
    cat_but_car = [name for name in object_cols
                   if dataframe[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat
                if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_columns = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_columns}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

In [None]:
# Split the columns of `df` into categorical, numeric and high-cardinality
# categorical name lists.
cat_cols, num_cols, cat_but_car = grab_col_names(df)
# Display the numeric column names.
num_cols

* ENCODING

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Columns whose dtype does not compare equal to the builtin int or float
# types and that hold exactly two distinct values.
binary_cols = [col for col in df.columns if df[col].dtype not in [int, float]
               and df[col].nunique() == 2]

In [None]:
# Single encoder instance reused for the binary columns.
labelencoder = LabelEncoder()

In [None]:
# Replace each binary column with its label-encoded values; the encoder is
# re-fitted on every column.
for i in binary_cols:
    df[i] = labelencoder.fit_transform(df[i])

In [None]:
# Columns with more than 2 and at most 10 distinct values, regardless of dtype.
ohe_cols = [col for col in df.columns if 10 >= df[col].nunique() > 2]

In [None]:
# One-hot encode the selected columns, dropping the first level of each.
df=pd.get_dummies(df, columns=ohe_cols, drop_first=True)

* FEATURE SCALING (MIN-MAX NORMALIZATION)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scale every column listed in num_cols, one column at a time, with
# a fresh scaler fitted on that column alone.
# NOTE(review): num_cols is built from the numeric columns of df, so it
# presumably still contains the target "Salary" - confirm that scaling the
# target (and reporting errors in scaled units) is intended.
for col in num_cols:
    transformer = MinMaxScaler()
    df[col] = transformer.fit_transform(df[[col]])

* SET UP MODEL

In [None]:
from sklearn.linear_model import LinearRegression
# Target vector and feature matrix.
y = df["Salary"]
X = df.drop(["Salary"], axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20, random_state=1)

# Fit on the training split only. Fitting on the full X, y would let the
# model see the held-out rows and make the test RMSE / R-squared optimistic.
reg_model = LinearRegression().fit(X_train, y_train)
reg_model.intercept_
reg_model.coef_

In [None]:
# Train RMSE: root mean squared error of the predictions on the training split.
y_pred = reg_model.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred))

In [None]:
# Test RMSE: root mean squared error of the predictions on the test split.
y_pred = reg_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Test R-squared: the model's score on the test split.
reg_model.score(X_test, y_test)

In [None]:
# 10-fold cross-validated RMSE on the full data: each fold's negative MSE is
# negated and square-rooted, then the fold values are averaged.
np.mean(np.sqrt(-cross_val_score(reg_model,
                                 X,
                                 y,
                                 cv=10,
                                 scoring="neg_mean_squared_error")))