# Exercise - Regularization with Cross-validation

In this assignment, you will practice the regularization skills you have acquired.
    
Your task is to create ridge and lasso regression models and to perform cross-validation to choose the tuning parameter on the 'Hitters' dataset. After that, you'll be required to make predictions and evaluate the performance of the models.

Good luck and have fun!

## Import the necessary libraries for ridge and lasso regression, including a cross-validator

In [3]:
import pandas as pd
import numpy as np
import math

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

## Load the data

In [4]:
# Read the raw Hitters data from the CSV in the working directory.
data = pd.read_csv('Hitters.csv')

# Work on an independent copy so the frame loaded above stays untouched.
df_hitters_ex = data.copy(deep=True)

# Show the working copy as the cell output.
df_hitters_ex

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700.0,N
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875.0,A
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385.0,A
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960.0,A


## Convert categorical variables into numerical and drop NaN values

In [5]:
# One-hot encode the three categorical columns. drop_first=True drops the first
# level of each, leaving a single indicator column per variable.
df_hitters_num = pd.get_dummies(
    df_hitters_ex,
    columns=['League', 'Division', 'NewLeague'],
    drop_first=True,
)
df_hitters_num

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,League_N,Division_W,NewLeague_N
0,293,66,1,30,29,14,1,293,66,1,30,29,14,446,33,20,,False,False,False
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,True,True,True
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,False,True,False
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0,True,False,True
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,325,9,3,700.0,True,False,True
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,313,381,20,875.0,False,False,False
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,37,113,7,385.0,False,True,False
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,1314,131,12,960.0,False,False,False


In [6]:
# Remove every row that contains at least one missing value.
df_hitters_num_nonull = df_hitters_num.dropna(axis=0, how='any')

## Declare the dependent and independent variables

In [7]:
# Target: the 'Salary' column. Features: every remaining column.
y = df_hitters_num_nonull['Salary']
x = df_hitters_num_nonull.drop(columns='Salary')

## Split the data into training and testing sets

* Perform a 70:30 split - 30% of the data should be dedicated to testing.
* Set the random state to 365.

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=365,
)

In [9]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(x_train)

# Note: both names are rebound to their scaled versions, so re-running this
# cell without first re-running the split would scale the data a second time.
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## Initialize a Repeated K-Fold cross-validator

* n_splits=15, n_repeats=5, random_state=365

In [10]:
# 15 folds repeated 5 times -> 75 train/validation splits per candidate;
# the fixed random_state keeps the fold assignment repeatable.
cv = RepeatedKFold(
    n_splits=15,
    n_repeats=5,
    random_state=365,
)

## Perform ridge regression with Repeated K-Fold cross-validation

* alphas = np.arange(0.1, 15, 0.1)
* scoring = 'neg_mean_absolute_error'

In [11]:
# Ridge regression whose penalty strength is chosen from the alpha grid by
# cross-validation, scored with negative mean absolute error.
ridge = RidgeCV(
    alphas=np.arange(0.1, 15, 0.1),
    scoring='neg_mean_absolute_error',
    cv=cv,
)

## Perform lasso regression with Repeated K-Fold cross-validation

* tol=1

In [12]:
# Lasso regression tuned over the same alpha grid and the same cross-validator.
# tol=1 is the value requested by the exercise.
lasso = LassoCV(
    alphas=np.arange(0.1, 15, 0.1),
    cv=cv,
    tol=1,
)

## Make predictions on the test data using the two models and evaluate their performance using the root mean squared error

In [13]:
# Run the cross-validated alpha search and fit ridge on the scaled training set.
ridge.fit(X=x_train, y=y_train)

In [14]:
# Run the cross-validated alpha search and fit lasso on the scaled training set.
lasso.fit(X=x_train, y=y_train)

In [15]:
# Ridge predictions for the scaled test features.
ridge_reg_y_pred = ridge.predict(X=x_test)

In [16]:
# Lasso predictions for the scaled test features.
lasso_reg_y_pred = lasso.predict(X=x_test)

In [17]:
# RMSE is the square root of the test-set mean squared error.
ridge_rmse = math.sqrt(mean_squared_error(y_test, ridge_reg_y_pred))
lasso_rmse = math.sqrt(mean_squared_error(y_test, lasso_reg_y_pred))

print("Ridge Regression Model RMSE is: ", ridge_rmse)
print("Lasso Regression Model RMSE is: ", lasso_rmse)

Ridge Regression Model RMSE is:  272.19199168552916
Lasso Regression Model RMSE is:  287.7136264572571
