<a href="https://colab.research.google.com/github/sdiaz8/sdiaz8/blob/main/Lasso%20Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import the usual libraries and the LASSO estimators
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from statistics import mean

In [None]:
# Read the MLB hitters table from the Colab working directory and preview its first rows.
mlbDF = pd.read_csv(filepath_or_buffer='/content/mlb.csv')
mlbDF.head(n=5)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [None]:
# Replace NaN values in the numeric columns with that column's mean.
# numeric_only=True is required: the frame also holds text columns (League,
# Division, NewLeague), and pandas >= 2.0 raises a TypeError when asked to
# average those. Reassigning instead of inplace=True keeps the cell idempotent.
# NOTE(review): this also imputes the target column (Salary) with its mean before
# the train/test split — consider dropping rows with a missing Salary instead.
mlbDF = mlbDF.fillna(mlbDF.mean(numeric_only=True))
mlbDF.head()

  


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,535.925882,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [None]:
# The regression target is the Salary column.
y = mlbDF.loc[:, 'Salary']

In [None]:
# Build the feature matrix: remove the three text columns (League, Division,
# NewLeague) and the target itself (Salary), leaving only numeric predictors.
newMLB = mlbDF.drop(columns=['League', 'Division', 'NewLeague', 'Salary'])

In [None]:
# Hold out 25% of the rows as a test set; the remaining 75% are used for fitting.
# random_state pins the shuffle so the scores, coefficients and cross-validated
# alpha reported further down are reproducible on a fresh Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    newMLB, y, test_size=0.25, random_state=42
)

In [None]:
# Fill any NaN left in the test split using statistics computed on the training
# split only (feature column means for X_test, mean salary for y_test), so no
# test-set information is used for imputation.
# Reassigning instead of inplace=True avoids mutating the objects returned by
# train_test_split and keeps the cell safe to re-run.
# NOTE(review): the whole frame was already mean-imputed before the split, so this
# is normally a no-op safeguard — confirm whether it is still needed.
X_test = X_test.fillna(X_train.mean())
y_test = y_test.fillna(y_train.mean())

In [None]:
# Fit a LASSO regression (alpha = 2) on the training split, then report its
# R^2 on the held-out test split.
lasso = Lasso(max_iter=10000, alpha=2).fit(X_train, y_train)
print(lasso.score(X_test, y_test))

0.47380841975482146


In [None]:
# LASSO coefficients (alpha = 2), labelled with the feature each one belongs to.
# A bare lasso.coef_ array forces the reader to count columns to match values to
# features; indexing by X_train.columns makes the mapping explicit.
pd.Series(lasso.coef_, index=X_train.columns, name='coefficient')

array([-1.45781584,  4.20878269,  0.52105439, -0.63560598,  1.40280572,
        4.91684906, -4.465754  , -0.12711537,  0.02240553, -1.39899041,
        1.06869867,  0.78663999, -0.30861012,  0.13628483,  0.24585415,
       -3.89275844])

In [None]:
# Intercept (constant term) of the alpha = 2 LASSO model fitted earlier
lasso.intercept_

264.1177110940238

In [None]:
# Compare test-set R^2 for different regularisation strengths (alpha = 0, 1000, 100).
#
# alpha = 0 means no penalty, i.e. ordinary least squares. scikit-learn advises
# against running Lasso with alpha=0 (its coordinate-descent solver warns and
# converges poorly), so that case is fitted with LinearRegression instead.
lasso0 = LinearRegression().fit(X_train, y_train)
print(lasso0.score(X_test, y_test))

# Strong penalty: most coefficients are expected to be shrunk towards zero.
lasso1000 = Lasso(alpha=1000, max_iter=2600000).fit(X_train, y_train)
print(lasso1000.score(X_test, y_test))

# Moderate penalty.
lasso100 = Lasso(alpha=100, max_iter=2600000).fit(X_train, y_train)
print(lasso100.score(X_test, y_test))

  This is separate from the ipykernel package so we can avoid doing imports until
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


0.474496897296932
0.3907835115012367
0.4600241033281661


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [None]:
# Predict salaries for the held-out test rows with the alpha = 2 model and preview
# the first five predictions (first in X_test row order, not the five largest).
y_pred_test = lasso.predict(X=X_test)
y_pred_test[0:5]

array([739.55801321, 444.44315378, 283.53312577, 501.92737396,
       426.96203463])

In [None]:
# Predict salaries for the training rows with the alpha = 2 model and preview
# the first five predictions (first in X_train row order, not the five largest).
y_pred_train = lasso.predict(X=X_train)
y_pred_train[0:5]

array([382.69285647, 267.3698797 , 797.5012765 , 606.53760392,
       456.63537926])

In [None]:
# Evaluate the alpha = 2 LASSO model on the test split:
# compute both metrics first, then report them.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred_test)

print("Mean squared error of the test data is", mse)
print("R2 of the test data is", r2)

Mean squared error of the test data is 124648.63854491588
R2 of the test data is 0.47380841975482146


In [None]:
# Let LassoCV choose alpha by 10-fold cross-validation on the training split.
lassocv = LassoCV(max_iter=100000, cv=10)
lassocv.fit(X_train, y_train)

# alpha_ holds the penalty strength selected by the cross-validation.
alphaCV = lassocv.alpha_
print("Optimal alpha value of the LassoCV model is", alphaCV)

Optimal alpha value of the LassoCV model is 391.31935562261


In [None]:
# Predict test-set salaries with the cross-validated LASSO model.
# Only the first five predictions are displayed, matching the earlier prediction
# cells, rather than dumping the entire array; the full set stays available in
# y_pred_test_CV for the mean-squared-error computation in the next cell.
y_pred_test_CV = lassocv.predict(X_test)
y_pred_test_CV[:5]

array([ 660.42972416,  420.21344406,  362.56392847,  479.27995893,
        417.10440189,  325.55112081,  649.56767118,  285.89632699,
        304.3384599 ,  848.8900252 ,  881.76602268,  282.91682692,
        506.43813631,  417.65582295,  581.49680252,  959.2845349 ,
        465.76501678,  349.86816321,  660.18273477,  494.5570709 ,
        618.05000548,  424.17144481,  530.98678725,  377.35910359,
        641.19859441, 1060.72327397,  617.82445933,  339.28855813,
        457.26525866,  629.22386796,  327.88568012,  381.07109991,
        324.49862893,  366.94182268,  413.61131946,  490.19481214,
        359.9247863 ,  462.16989206,  369.92092043,  558.05323294,
       1094.98884405,  342.37210624,  803.85149425,  371.88775443,
        348.87549376,  925.72113825,  593.95507439,  876.487569  ,
        371.8537248 ,  461.11579489,  554.21686225,  478.15594179,
        780.85945215,  329.75070811,  301.95485952,  470.07741486,
        272.00713416,  340.94948044, 1136.09087389,  288.86133

In [None]:
# Mean squared error of the cross-validated LASSO model on the test split.
mseCV = mean_squared_error(y_true=y_test, y_pred=y_pred_test_CV)
print("Mean squared error of the test data is", mseCV)

Mean squared error of the test data is 135068.81071291707


In [None]:
# Coefficients of the tuned (cross-validated) model, labelled by feature name.
# Entries equal to zero are the features LASSO dropped; labelling them avoids
# having to match array positions to column names by hand.
pd.Series(lassocv.coef_, index=X_train.columns, name='coefficient')

array([-0.        ,  0.6477403 ,  0.        ,  0.        ,  0.        ,
        2.26955011, -0.        , -0.20920963,  0.64212255, -0.        ,
        0.43294804,  0.19088437,  0.02557074,  0.12852849,  0.03631867,
       -0.        ])

The variables that were not used in the model are AtBat, HmRun, Runs, RBI, Years, CHmRun, and Errors (Salary is the target, not a predictor). These variables were discarded because LASSO shrank their coefficients to exactly zero.