In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
scaler = StandardScaler()
normalizer = preprocessing.Normalizer()

# 1. Get to know the data

In [2]:
# That is the list of our first and basic models
modelList={"LinearReg":LinearRegression(),"Ridge":Ridge(),"Lasso":Lasso(),"ElasticNet":ElasticNet()}

In [3]:
# lets import our data
hitters = pd.read_csv("data/Hitters.csv")
df = hitters.copy()
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [4]:
# Check the distribution of values
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AtBat,322.0,380.928571,153.404981,16.0,255.25,379.5,512.0,687.0
Hits,322.0,101.024845,46.454741,1.0,64.0,96.0,137.0,238.0
HmRun,322.0,10.770186,8.709037,0.0,4.0,8.0,16.0,40.0
Runs,322.0,50.909938,26.024095,0.0,30.25,48.0,69.0,130.0
RBI,322.0,48.02795,26.166895,0.0,28.0,44.0,64.75,121.0
Walks,322.0,38.742236,21.639327,0.0,22.0,35.0,53.0,105.0
Years,322.0,7.444099,4.926087,1.0,4.0,6.0,11.0,24.0
CAtBat,322.0,2648.68323,2324.20587,19.0,816.75,1928.0,3924.25,14053.0
CHits,322.0,717.571429,654.472627,4.0,209.0,508.0,1059.25,4256.0
CHmRun,322.0,69.490683,86.266061,0.0,14.0,37.5,90.0,548.0


In [5]:
# And basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      322 non-null    int64  
 1   Hits       322 non-null    int64  
 2   HmRun      322 non-null    int64  
 3   Runs       322 non-null    int64  
 4   RBI        322 non-null    int64  
 5   Walks      322 non-null    int64  
 6   Years      322 non-null    int64  
 7   CAtBat     322 non-null    int64  
 8   CHits      322 non-null    int64  
 9   CHmRun     322 non-null    int64  
 10  CRuns      322 non-null    int64  
 11  CRBI       322 non-null    int64  
 12  CWalks     322 non-null    int64  
 13  League     322 non-null    object 
 14  Division   322 non-null    object 
 15  PutOuts    322 non-null    int64  
 16  Assists    322 non-null    int64  
 17  Errors     322 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  322 non-null    object 
dtypes: float64

In [6]:
# Lets see how many null we have
df.isnull().sum()

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64

# 2. Manipulate data get ready for modelling

In [7]:
# It is not wise to drop null values in small data. But for now we will focus on the models and their result.
df.dropna(inplace = True)
df.shape

(263, 20)

In [8]:
# Change the categorical data to numeric data
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)

In [9]:
# Spllit data as data and label
y = df["Salary"]
X = df.drop('Salary', axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=46)

In [11]:
# That method will return the a dataframe with our modellist and result(MSE) which we created at the beginning
def modelResult(data,label):
    df_model=pd.DataFrame()
    for name,model in modelList.items():
        X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=46)
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        mse=np.sqrt(mean_squared_error(y_test, y_pred))
        mseCV = np.sqrt(np.mean(-cross_val_score(model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error")))
        index=[]
        index.append(name)
        df_temp=pd.DataFrame({"MSE":mse,"MSECV":mseCV},index=index)
        df_model=df_model.append(df_temp)
    return df_model

In [12]:
# As we are going to use same steps many times lets just create a method for better understanding less confusion 
# and to focus more on the point
# That method will return MSE and MSECV(contains cross validation) based on our data and model
def meanSquaredError(model,data,label):
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=46)
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    mse=np.sqrt(mean_squared_error(y_test, y_pred))
    mseCV = np.sqrt(np.mean(-cross_val_score(model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error")))
    display("MSE > "+str(mse))
    display("MSECV > "+str(mseCV))
    

# 3. Basic result for initial models

In [13]:
# The result for initial modellist
modelResult(X,y)

Unnamed: 0,MSE,MSECV
LinearReg,479.071504,307.225761
Ridge,479.225773,306.767049
Lasso,478.317972,305.898477
ElasticNet,486.183422,302.740981


In [14]:
# The result with initial model list with scalled data.
modelResult(scaler.fit_transform(X),y)

Unnamed: 0,MSE,MSECV
LinearReg,479.071504,307.225761
Ridge,458.751443,301.30845
Lasso,461.242286,301.120525
ElasticNet,463.935294,299.848428


# 4. Result for CV models

In [15]:
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV

In [16]:
# We will try different alpha values to find the best alpha value
alphas1 = np.random.randint(0,1000,100)
alphas2 = 10**np.linspace(10,-2,100)*0.5
alphas3 = np.linspace(0,1,1000)

In [17]:
# Method for ridgeCV
def ridgeAlpha(alphas):
    ridge_cv = RidgeCV(alphas = alphas, scoring = "neg_mean_squared_error", cv = 10, normalize = True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=46)
    ridge_cv.fit(X_train,y_train)
    return ridge_cv.alpha_

In [18]:
display(ridgeAlpha(alphas1))

5

In [19]:
ridge_tuned=Ridge(alpha=ridgeAlpha(alphas1)).fit(X_train,y_train)
meanSquaredError(ridge_tuned,X,y)

'MSE > 479.9344707778003'

'MSECV > 305.814003141136'

In [20]:
ridge_tuned=Ridge(alpha=ridgeAlpha(alphas2)).fit(X_train,y_train)
meanSquaredError(ridge_tuned,X,y)

'MSE > 479.13582858479754'

'MSECV > 307.006018529552'

In [21]:
ridge_tuned=Ridge(alpha=ridgeAlpha(alphas3)).fit(X_train,y_train)
meanSquaredError(ridge_tuned,X,y)

'MSE > 479.12948330123106'

'MSECV > 307.0254782944463'

In [22]:
# Method for lassoCV
def lassoAlpha(alphas):
    lasso_cv = LassoCV(alphas = alphas, cv = 10, normalize = True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=46)
    lasso_cv.fit(X_train,y_train)
    return lasso_cv.alpha_

In [23]:
lasso_tuned=Lasso(alpha=lassoAlpha(alphas1)).fit(X_train,y_train)
meanSquaredError(lasso_tuned,X,y)

'MSE > 482.8315006422545'

'MSECV > 303.01774941438657'

In [24]:
lasso_tuned=Lasso(alpha=lassoAlpha(alphas2)).fit(X_train,y_train)
meanSquaredError(lasso_tuned,X,y)

'MSE > 477.7153213422907'

'MSECV > 306.5321428787491'

In [25]:
lasso_tuned=Lasso(alpha=lassoAlpha(alphas3)).fit(X_train,y_train)
meanSquaredError(lasso_tuned,X,y)

'MSE > 477.72670212416784'

'MSECV > 306.51782879160237'

### Lets add all together

In [26]:
def dfScore(name,model,data,label):
    df_model=pd.DataFrame()
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=46)
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    mse=np.sqrt(mean_squared_error(y_test, y_pred))
    mseCV = np.sqrt(np.mean(-cross_val_score(model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error")))
    index=[]
    index.append(name)
    df_temp=pd.DataFrame({"MSE":mse,"MSECV":mseCV},index=index)
    df_model=df_model.append(df_temp)
    return df_model
    
    

In [27]:
df_Models = modelResult(X,y)
df_Models = df_Models.append(dfScore("Ridge_Tuned",ridge_tuned,X,y))
df_Models = df_Models.append(dfScore("Lasso_Tuned",lasso_tuned,X,y))
df_Models

Unnamed: 0,MSE,MSECV
LinearReg,479.071504,307.225761
Ridge,479.225773,306.767049
Lasso,478.317972,305.898477
ElasticNet,486.183422,302.740981
Ridge_Tuned,479.129483,307.025478
Lasso_Tuned,477.726702,306.517829


### Lets add ElasticNet

In [28]:
from sklearn.model_selection import GridSearchCV
enet_params = {"l1_ratio": [0.1,0.2,0.4,0.5,0.6,0.8,1],
              "alpha":[0.1,0.01,0.001,0.2,0.3,0.5,0.8,0.9,1]}
enet_model = ElasticNet()

In [29]:
gs_cv_enet = GridSearchCV(enet_model, enet_params, cv=10).fit(X_train,y_train)

In [30]:
gs_cv_enet.best_params_

{'alpha': 1, 'l1_ratio': 0.1}

In [31]:
enet_tuned = ElasticNet(**gs_cv_enet.best_params_).fit(X, y)

In [32]:
df_Models = df_Models.append(dfScore("Enet_Tuned",enet_tuned,X,y))
df_Models

Unnamed: 0,MSE,MSECV
LinearReg,479.071504,307.225761
Ridge,479.225773,306.767049
Lasso,478.317972,305.898477
ElasticNet,486.183422,302.740981
Ridge_Tuned,479.129483,307.025478
Lasso_Tuned,477.726702,306.517829
Enet_Tuned,487.574519,302.279132


### Lets use scalled data with Enet_Tuned

In [33]:
df_Models = df_Models.append(dfScore("Enet_Tuned_Scalled",enet_tuned,scaler.fit_transform(X),y))
df_Models

Unnamed: 0,MSE,MSECV
LinearReg,479.071504,307.225761
Ridge,479.225773,306.767049
Lasso,478.317972,305.898477
ElasticNet,486.183422,302.740981
Ridge_Tuned,479.129483,307.025478
Lasso_Tuned,477.726702,306.517829
Enet_Tuned,487.574519,302.279132
Enet_Tuned_Scalled,463.196037,301.184559


In [34]:
# If we do changes like that MSE values will be lesser for sure. But will that be better for unseen data or not ? 
Q1 = df.Salary.quantile(0.25)
Q3 = df.Salary.quantile(0.75)
IQR = Q3-Q1
upper = Q3 + 1.5*IQR
df.loc[df["Salary"] > upper,"Salary"] = upper

In [35]:
df_Models = df_Models.append(dfScore("Enet_Tuned_Scalled2",enet_tuned,scaler.fit_transform(X),y))
df_Models

Unnamed: 0,MSE,MSECV
LinearReg,479.071504,307.225761
Ridge,479.225773,306.767049
Lasso,478.317972,305.898477
ElasticNet,486.183422,302.740981
Ridge_Tuned,479.129483,307.025478
Lasso_Tuned,477.726702,306.517829
Enet_Tuned,487.574519,302.279132
Enet_Tuned_Scalled,463.196037,301.184559
Enet_Tuned_Scalled2,370.966398,271.085462
