In [None]:
# Importing the required libraries

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn import model_selection
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the player data set from the Kaggle input directory and display it
df = pd.read_csv(
    '/kaggle/input/baseball-player-salary-prediction/Player.csv'
)
df

In [None]:
# Preview the first rows of the raw data
df.head()

In [None]:
# Dimensions of the data as a (rows, columns) tuple
df.shape

In [None]:
# Column names, data types and non-null counts
df.info()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value.
# Rebinding the name (rather than inplace=True) avoids mutating the loaded
# frame in place; the resulting rows are the same.
df = df.dropna()

# Statistical Analysis and Variable Encoding

***Statistical Data Analysis***

In [None]:

# Show floats with 3 decimal places in the summary table.
# The fully qualified key is used because the bare 'precision' pattern is
# ambiguous on pandas versions that also define 'styler.format.precision',
# where it raises an OptionError.
pd.set_option('display.precision', 3)
df.describe()

***Categorical Variable Analysis***

In [None]:
# League column: frequency of each category, printed and drawn as a pie chart
leag = df['League'].value_counts()
print(leag)
leag.plot(kind='pie')


In [None]:
# Division column: frequency of each category, printed and drawn as a pie chart
div = df['Division'].value_counts()
print(div)
div.plot(kind='pie')


In [None]:
# NewLeague column: frequency of each category, printed and drawn as a pie chart
Nleag = df['NewLeague'].value_counts()
print(Nleag)
Nleag.plot(kind='pie')


***Encode Categorical Variables***

In [None]:
# One-hot encode the three categorical columns
dummyCol = pd.get_dummies(df.loc[:, ['League', 'Division', 'NewLeague']])
dummyCol.head()

In [None]:
# Column names, data types and non-null counts of the encoded indicators
dummyCol.info()

# Preparing Input/Output Variables

***Input Variables***

In [None]:
# Numeric predictors: remove the target, the raw categorical columns and the
# 'Unnamed: 0' column (presumably a saved row index - verify against the CSV)
XTemp = df.drop(columns=['Unnamed: 0', 'Salary', 'League', 'Division', 'NewLeague'])

# Append one indicator per categorical variable to the numeric predictors
X = pd.concat([XTemp, dummyCol[['League_N', 'Division_W', 'NewLeague_N']]], axis='columns')

X.info()

***Output Variable***

In [None]:
# Target variable: the Salary column
y = df['Salary']
y.head()

# Ridge Regression

In [None]:
# Grid of 100 penalty strengths, log-spaced from 1e5 down to 1e-7
alphas = np.logspace(10, -2, 100) * 1e-5
# Container for the coefficient vector fitted at each alpha
coeffs = []
ridge = Ridge()

In [None]:
# Standardise the features once; the scaled matrix does not depend on alpha,
# so there is no need to recompute it on every iteration.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Start from an empty list so that re-running this cell does not append a
# second copy of the coefficient paths.
coeffs = []
for a in alphas:
    ridge.set_params(alpha=a)
    ridge.fit(X_scaled, y)
    coeffs.append(ridge.coef_)

In [None]:
# Shape of the collected coefficient vectors (one entry per alpha)
np.shape(coeffs)

In [None]:
# Ridge coefficient paths against the penalty strength, on a log-scaled x axis
ax = plt.gca()
ax.plot(alphas, coeffs)
ax.set_xscale('log')
ax.axis('tight')
ax.set_xlabel('Lambda')
ax.set_ylabel('Coefficients')
ax.set_title('Ridge coefficients as a function of lambda(alpha)')

In [None]:
# Show floats with 6 decimal places. The fully qualified option key is used
# because the bare 'precision' pattern is ambiguous on recent pandas versions.
pd.set_option('display.precision', 6)
# Hold out half of the rows as a test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.5, random_state=1)

In [None]:
def Ridge_Regression(alpha):
    """Fit a ridge model on the training split and report its test error.

    Reads the notebook globals X_train, y_train, X_test and y_test. The
    features are standardised with statistics learned from the training
    split only. Prints the fitted coefficients (labelled by feature name)
    and the mean squared error on the test split.

    Parameters
    ----------
    alpha : float
        Penalty strength passed to Ridge.

    Returns
    -------
    None
    """
    scaler = StandardScaler()
    ridge = Ridge(alpha=alpha)
    ridge.fit(scaler.fit_transform(X_train), y_train)
    # Reuse the training mean/std for the test set. Re-fitting the scaler on
    # X_test would put the two splits on different scales and leak test-set
    # statistics into the evaluation.
    pred = ridge.predict(scaler.transform(X_test))
    print(pd.Series(ridge.coef_, index=X_train.columns))
    print('Mean Squared Error: ', mean_squared_error(y_test, pred))

***Ridge Regression, Lambda = 0***

In [None]:
# Ridge fit with alpha = 0 (no penalty)
Ridge_Regression(0)

***Ridge Regression, Lambda = 5***

In [None]:
# Ridge fit with alpha = 5
Ridge_Regression(5)

***Ridge Regression, Lambda = 10***

In [None]:
# Ridge fit with alpha = 10
Ridge_Regression(10)

***Ridge Regression, Lambda = 100***

In [None]:
# Ridge fit with alpha = 100
Ridge_Regression(100)

***Ridge Regression, Lambda = 1000***

In [None]:
# Ridge fit with alpha = 1000
Ridge_Regression(1000)

***Ridge Regression, Lambda = 10^10***

In [None]:
# Ridge fit with a very large penalty, alpha = 10**10
Ridge_Regression(10**10)

# Cross Validation - RidgeCV

***RidgeCV (cv=None)***

In [None]:
# Choose alpha from the grid with RidgeCV's default cross-validation (cv=None),
# fitting on the standardised training split
scaler = StandardScaler()
ridgecv = RidgeCV(alphas=alphas).fit(scaler.fit_transform(X_train), y_train)
ridgecv.alpha_

***RidgeCV (cv=5)***

In [None]:
# Choose alpha from the grid with 5-fold cross-validation on the
# standardised training split
scaler = StandardScaler()
ridgecv = RidgeCV(alphas=alphas, cv=5).fit(scaler.fit_transform(X_train), y_train)
ridgecv.alpha_

***RidgeCV (cv = 10)***

In [None]:
# Choose alpha from the grid with 10-fold cross-validation on the
# standardised training split
scaler = StandardScaler()
ridgecv = RidgeCV(alphas=alphas, cv=10).fit(scaler.fit_transform(X_train), y_train)
ridgecv.alpha_

# Optimal Ridge Regression

In [None]:
# Refit ridge at the cross-validated alpha and score it on the held-out split
scaler = StandardScaler()
opt_ridge = Ridge(alpha=ridgecv.alpha_)
opt_ridge.fit(scaler.fit_transform(X_train), y_train)
# Scale the test set with the training mean/std. Re-fitting the scaler on
# X_test would evaluate the model on differently scaled features.
pred = opt_ridge.predict(scaler.transform(X_test))
mean_squared_error(y_test, pred)

In [None]:
# Refit on the full data set to report the final coefficients.
# The alpha was selected on standardised features, so the full data is
# standardised too; fitting on raw X would apply that penalty on a different
# feature scale. A fresh scaler is used so the existing `scaler` is untouched.
opt_ridge.fit(StandardScaler().fit_transform(X), y)
pd.Series(opt_ridge.coef_, index=X.columns)

# Lasso Regression

In [None]:
# Lasso setup: a scaler, an empty list for the coefficient paths, and an
# estimator with a raised iteration cap
scaler = StandardScaler()
coefs = []
lasso = Lasso(max_iter=100_000)

In [None]:
# Standardise the training features once; the result does not depend on alpha
X_train_scaled = scaler.fit_transform(X_train)

# Start from an empty list so that re-running this cell does not append a
# second copy of the coefficient paths.
coefs = []
for a in alphas:
    lasso.set_params(alpha=a)
    lasso.fit(X_train_scaled, y_train)
    coefs.append(lasso.coef_)

In [None]:
# Lasso coefficient paths on a log-scaled x axis (alphas are doubled for the
# x values, as in the original cell)
ax = plt.gca()
ax.plot(alphas*2, coefs)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('Lambda')
# Fixed the misspelled axis label ('Coddicients')
plt.ylabel('Coefficients')
plt.title('Lasso coefficients as function of Lambda(alpha)')
plt.show()

# LassoCV

***LassoCV (cv=10)***

In [None]:
# 10-fold cross-validated lasso on the standardised training split;
# alphas=None leaves the choice of the alpha grid to LassoCV
lassocv = LassoCV(max_iter=100_000, cv=10, alphas=None)
lassocv.fit(scaler.fit_transform(X_train), y_train)

In [None]:
# Penalty strength selected by cross-validation
lassocv.alpha_

# Optimal Lasso

In [None]:
# Refit lasso at the cross-validated alpha and score it on the held-out split
lasso.set_params(alpha=lassocv.alpha_)
lasso.fit(scaler.fit_transform(X_train), y_train)
# Scale the test set with the training mean/std. Re-fitting the scaler on
# X_test would evaluate the model on differently scaled features.
mean_squared_error(y_test, lasso.predict(scaler.transform(X_test)))

In [None]:
# Fitted lasso coefficients labelled by feature name
pd.Series(data=lasso.coef_, index=X.columns)

***Coefficients != 0***

In [None]:
# Keep only the features whose lasso coefficient is non-zero
pd.Series(lasso.coef_, index=X.columns).loc[lambda s: s != 0]

# Principal Component Analysis

***Import Libraries***

In [None]:
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

***Principal Components***

In [None]:
# Fit PCA on the standardised full feature matrix and keep the component scores
pca = PCA()
x_pca_reduced = pca.fit_transform(scale(X))
# Shape of the fitted components_ matrix
print(pca.components_.shape)

In [None]:
# Top-left 5 x 5 corner of the transposed components matrix
pd.DataFrame(pca.components_.T).iloc[:5, :5]

***Principal Components Variance Explained***

In [None]:
# Cumulative percentage of variance explained, from ratios rounded to 4 decimals
(np.round(pca.explained_variance_ratio_, 4) * 100).cumsum()

***Implementing PCA***

In [None]:
# Principal component regression: 10-fold CV error against the number of
# components used in the regression.
pca_t = PCA()

# Component scores of the standardised training features
X_train_reduced_pca = pca_t.fit_transform(scale(X_train))

n = len(X_train_reduced_pca)
# Number of available components, taken from the data instead of hard-coding it
n_components_total = X_train_reduced_pca.shape[1]

kfold10 = KFold(n_splits=10)

lm_regr = LinearRegression()

mse = []

# Baseline with zero components: regress the target on a constant column
tr_score = -1*cross_val_score(lm_regr, np.ones((n,1)), y_train, cv=kfold10, scoring='neg_mean_squared_error').mean()
mse.append(tr_score)

# Add one component at a time, up to every available component
for i in range(1, n_components_total + 1):
    tr_score = -1*cross_val_score(lm_regr, X_train_reduced_pca[:,:i], y_train, cv=kfold10, scoring='neg_mean_squared_error').mean()
    mse.append(tr_score)

# Position k on the x axis is the model that uses k components
plt.plot(mse, '-v')
plt.xlabel('Number of principal components in regression')
plt.ylabel('MSE')
plt.title('Player Salary')
plt.xlim(left=-1)
plt.show()

In [None]:
# Cross-validated MSE indexed by the number of principal components used.
# mse[0] is the intercept-only baseline (zero components), so the index
# starts at 0 rather than 1 to stay aligned with the component count.
mse_each_comp = pd.Series(np.array(mse).flatten(), index=np.arange(len(mse)))
mse_each_comp

In [None]:
# Smallest cross-validated MSE across all component counts
mse_each_comp.min()

In [None]:
# Regression model on the leading principal components
n_components = 7

# Standardise the test set with the TRAINING mean/std before projecting it.
# pca_t was fitted on scale(X_train); calling scale(X_test) would standardise
# with test-set statistics and project inconsistently scaled data.
# StandardScaler and scale() both use the population standard deviation.
train_scaler = StandardScaler().fit(X_train)
X_test_reduced_pca = pca_t.transform(train_scaler.transform(X_test))[:, :n_components]

regr = LinearRegression()

regr.fit(X_train_reduced_pca[:, :n_components], y_train)

pred = regr.predict(X_test_reduced_pca)

# Test-set mean squared error of the reduced model
mean_squared_error(y_test, pred)

In [None]:
# Coefficients of the regression fitted on the retained principal components
regr.coef_