In [1]:
import pandas as pd
import numpy as np

In [83]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

**Data Cleaning**

In [2]:
# Load the Hitters data; the path is relative, so the CSV must sit in the working directory.
df_baseball = pd.read_csv("Hitters.csv")

In [3]:
# Preview the first five rows to check the column names and spot missing values.
df_baseball.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


We have missing data in the salary column. There are several routes we could take here. If we really prioritized each data point, and we assumed that the NAs in salary are randomly distributed in the population, we could treat the observations with missing salaries as the test data and fit the model on the other data.

However, in this case we want to tune our model with GridSearchCV, so we drop every row with a missing salary: cross-validation uses each row for both training and validation at some point, and a row without a salary cannot be used to fit or score the model.

In [9]:
# Count the rows whose Salary (the regression target) is missing.
df_baseball["Salary"].isna().sum()

59

In [12]:
# Rows without a recorded Salary are intentionally discarded here (59 of them).

df_baseball = df_baseball.loc[df_baseball["Salary"].notna()]
df_baseball.shape[0]

263

**Part I: Different Model Specs**

A. Regression without regularization

In [19]:
# Separate the response (Salary) from the predictors.
y = df_baseball["Salary"]
X = df_baseball.drop(columns=["Salary"])

In [53]:
def roee_pipeline(model=None):
  """
  Build an unfitted preprocessing + regression pipeline.

  Object-dtype columns are one-hot encoded (first level dropped), numeric columns are
  standardized, and any remaining columns are passed through unchanged.

  Parameters
  ----------
  model: scikit-learn estimator, optional
    estimator used as the final "model" step; defaults to a new LinearRegression()

  Return
  ------
  unfitted sklearn Pipeline with the steps "preprocessing" and "model"
  """
  # Create the default inside the call. A default written in the signature is built once,
  # at definition time, so every pipeline would share (and refit) the same estimator object.
  if model is None:
    model = LinearRegression()

  ct = ColumnTransformer(
    [
      ("dummify",
      OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"),
      make_column_selector(dtype_include=object)),
      ("standardize",
      StandardScaler(),
      make_column_selector(dtype_include=np.number))
    ],
    remainder = "passthrough"
  )

  # The step names are part of the interface: grid-search keys such as "model__alpha"
  # address the final estimator through the "model" step name.
  pipeline = Pipeline(
    [("preprocessing", ct),
    ("model", model)])

  return pipeline

In [None]:
# NOTE(review): leftover scratch cell. Its original content, a bare `.drop("y_var", axis=1)`,
# is not valid Python (nothing precedes the dot) and would stop Restart & Run All.
# This cell can be deleted; X is already built with df_baseball.drop("Salary", axis=1) above.

In [67]:
def get_coefficients(dataset, model=None, n=3, by_magnitude=False):
  """
  Fit the preprocessing + model pipeline on a dataset and return its top coefficients.

  Parameters
  ----------
  dataset: pandas DataFrame
    data holding the predictors plus a "Salary" column, which is used as the target
  model: scikit-learn estimator exposing coef_ after fitting, optional
    model specification; defaults to a new LinearRegression()
  n: int
    number of rows to return (default 3)
  by_magnitude: bool
    if True, rank by absolute value so that large negative coefficients are included;
    if False (default), rank by signed value, largest first

  Return
  ------
  DataFrame with the columns "Predictor" and "Coefficient", limited to the top n rows
  """
  # Create the default per call so repeated calls do not share one fitted estimator.
  if model is None:
    model = LinearRegression()

  X = dataset.drop("Salary", axis = 1)
  y = dataset["Salary"]

  # Reuse the shared builder so the preprocessing here cannot drift away from the
  # preprocessing that is cross-validated elsewhere in the notebook.
  pipeline_fit = roee_pipeline(model=model).fit(X, y)

  # Pipeline.fit fits its steps in place, so the output names can be read from the fitted
  # transformer directly instead of refitting it on X.
  var_names = pipeline_fit.named_steps["preprocessing"].get_feature_names_out()
  coefficients = pipeline_fit.named_steps["model"].coef_

  df_coef = pd.DataFrame({
    "Predictor": var_names,
    "Coefficient": coefficients
  })

  # key=None keeps the plain signed ordering; np.abs ranks by coefficient size instead.
  sort_key = np.abs if by_magnitude else None
  return df_coef.sort_values(by="Coefficient", ascending=False, key=sort_key).head(n=n)

In [68]:
# Three largest fitted coefficients (signed, largest first) for the default linear regression.
get_coefficients(dataset=df_baseball)

Unnamed: 0,Predictor,Coefficient
13,standardize__CRuns,480.747135
4,standardize__Hits,337.830479
14,standardize__CRBI,260.689886


In [69]:
# Standard deviation of CRuns, used to express its standardized coefficient in original units.
# pandas .std() is the sample SD (ddof=1); StandardScaler divides by the population SD
# (ddof=0), so this is a close approximation rather than the exact scaling factor.
df_baseball["CRuns"].std()

331.19857059564885

In [70]:
# Standard deviation of Hits, used to express its standardized coefficient in original units.
# pandas .std() is the sample SD (ddof=1); StandardScaler divides by the population SD
# (ddof=0), so this is a close approximation rather than the exact scaling factor.
df_baseball["Hits"].std()

45.12532592258135

In [71]:
# Standard deviation of CRBI, used to express its standardized coefficient in original units.
# pandas .std() is the sample SD (ddof=1); StandardScaler divides by the population SD
# (ddof=0), so this is a close approximation rather than the exact scaling factor.
df_baseball["CRBI"].std()

323.3676681827309

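Career runs, hits in the previous season, and career RBIs have the three largest positive standardized coefficients. (The table is sorted by signed value, so any large negative coefficients are not shown, and "largest" here refers to coefficient size, not statistical significance.)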

A one-standard-deviation increase in career runs (about 331 runs) is associated with an average salary increase of about $481,000, holding all other variables constant. (Salary is recorded in thousands of dollars.)

A one-standard-deviation increase in hits from the previous season (about 45 hits) is associated with an average salary increase of about $338,000, holding all other variables constant.

A one-standard-deviation increase in career RBIs (about 323 RBIs) is associated with an average salary increase of about $261,000, holding all other variables constant.

In [77]:
# 5-fold cross-validated MSE of the unregularized linear model. scikit-learn reports the
# negated MSE (so that larger is better), hence the sign flip.
linear_pipeline = roee_pipeline()
-cross_val_score(
  linear_pipeline, X, y, scoring="neg_mean_squared_error", cv=5
).mean()

121136.31031816879

B. Ridge regression

In [79]:
# Same preprocessing as the linear model, with a Ridge estimator as the final "model" step.
ridge_pipeline = roee_pipeline(model=Ridge())

In [86]:
# Candidate ridge penalties; the "model__" prefix routes alpha to the pipeline's "model" step.
alphas = {"model__alpha": [0.001, 0.01, 0.1, 1, 10]}

# 5-fold grid search scored on negated MSE (scikit-learn maximizes the score).
gscv = GridSearchCV(
  estimator=ridge_pipeline,
  param_grid=alphas,
  scoring="neg_mean_squared_error",
  cv=5,
)

gscv_fitted = gscv.fit(X, y)