## LASSO AND RIDGE Regression

In [None]:
# Importing necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the Hitters CSV from a path relative to the notebook's working directory.
data = pd.read_csv("../input/hitters/Hitters.csv")
# read_csv already returns a DataFrame; this wraps it again under the name `df`,
# which is the name every later cell uses.
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the raw dataset.
df.head()

In [None]:
col = df.columns  # pandas Index holding every column name of df (not a plain list)

In [None]:
# Print, for every column, the percentage of rows whose value is missing.

for column_name in col:
  missing_pct = df[column_name].isna().mean() * 100
  print(column_name, "\t-\t", missing_pct)


> Since no column has null values except Salary, we can move on without dropping any columns. (The null values in Salary are not treated as a problem, because the rows with a missing Salary will act as the test set here.)

> Since the given dataset contains both categorical and numerical data, we have to separate them for further analysis.

In [None]:
# Numeric columns: everything whose dtype is not `object`.
num_df = df.select_dtypes(exclude=['object'])
# Categorical columns: whatever is left once the numeric columns are removed.
# The column labels are passed explicitly instead of handing the whole num_df
# frame to `drop` and relying on it being iterated into its column names.
cat_df = df.drop(columns=num_df.columns)

In [None]:
# Preview the numeric columns.
num_df.head()

In [None]:
# Pairwise correlations between all numeric columns, drawn as an annotated heatmap
# on the axes created here.
cormap = num_df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(cormap, annot=True, ax=ax)

In [None]:
# Helper that keeps only the strongly correlated entries of one correlation column

def get_corelated_col(cor_dat, threshold):
  """Return the entries of ``cor_dat`` whose absolute value exceeds ``threshold``.

  Parameters
  ----------
  cor_dat : pandas.Series
      Correlation values indexed by label, e.g. one column of a correlation
      matrix.
  threshold : float
      Cut-off. An entry is kept only when ``abs(value) > threshold`` (strict).

  Returns
  -------
  pandas.DataFrame
      A single column named ``'corr value'``, indexed by the labels of the kept
      entries, in their original order and with their original sign.

  Notes
  -----
  NaN entries are never kept, because a comparison with NaN is False.
  An entry holding the correlation of a column with itself (1.0) is kept
  whenever ``threshold`` is below 1, so the target column stays in the result.
  """
  # Boolean mask instead of a per-label Python loop; this also works when the
  # index contains repeated labels.
  strong = cor_dat[cor_dat.abs() > threshold]
  return strong.to_frame(name='corr value')


In [None]:
# Keep the numeric columns whose absolute correlation with Salary is above 0.40.
top_corelated_values = get_corelated_col(cor_dat=cormap['Salary'], threshold=0.40)
top_corelated_values

> Since the columns listed above have an absolute correlation with Salary above 0.40, we are going to use them further.

In [None]:
# Restrict the numeric frame to the columns that passed the correlation threshold
# (Salary itself is among them).
final_num_df = num_df.loc[:, top_corelated_values.index]
final_num_df.head()

In [None]:
# Preview the non-numeric columns before they are encoded.
cat_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every column of cat_df with integer class codes. Values are cast to str
# first, so a missing entry (if any) would be encoded via its string form.
# NOTE(review): cat_col is read from cat_df at run time. A later cell adds 'Salary'
# to cat_df, so re-running this cell afterwards would label-encode Salary as well.
cat_col = cat_df.columns
for i in cat_col:
  enc = LabelEncoder()  # fresh encoder per column; only the last one remains in `enc`
  cat_df[i] = enc.fit_transform(cat_df[i].astype('str'))

In [None]:
# Preview the categorical columns after label encoding.
cat_df.head()


In [None]:
cat_df['Salary'] = df['Salary']  # add the target (Salary) so its correlation with the encoded categorical columns can be computed

In [None]:
# Correlations between the encoded categorical columns and Salary, drawn as an
# annotated heatmap on the axes created here.
cormat = cat_df.corr()
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cormat, annot=True, ax=ax)

> Since none of the categorical columns has an appreciable correlation with Salary, we are not using the categorical columns for prediction.

In [None]:
# Final feature set: the numeric columns that passed the correlation threshold.
# Salary (the target) is still one of them at this point and is split off later.
# NOTE(review): the original note counted 11 features — confirm against the displayed frame.

final_df = final_num_df
final_df.head()

> The last row of the pair plot shows each column plotted against our target column, i.e. Salary. From this we can see that none of the chosen columns shows noticeable skewness.

In [None]:
# Labeled rows (no missing value in any selected column) become the training data;
# rows with at least one missing value become the set to predict.

df_train = final_df.dropna(axis=0, how='any')
df_test = final_df.loc[final_df.isna().any(axis=1)]


In [None]:
# Separate the feature matrix from the Salary target.
X = df_train.drop(columns='Salary')
y = df_train.loc[:, 'Salary']

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with MinMaxScaler (default feature_range is [0, 1]).
scaler = MinMaxScaler()
# fit_transform returns a bare NumPy array, so rebuild the frame with the original
# column names AND the original row labels. Keeping the index means X stays
# label-aligned with y, which still carries the row labels of df_train.
# NOTE(review): the scaler is fitted before the train/test split, so its min/max
# also reflect rows that later land in X_test.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X.head()

In [None]:
# Scale the unlabeled rows with the scaler that was fitted on the labeled rows.
# The rebuilt frame gets a fresh 0..n-1 index, and the Salary column is gone
# afterwards, so this cell cannot be run a second time without rebuilding df_test.
df_test = pd.DataFrame(
    scaler.transform(df_test.drop(columns="Salary")),
    columns=X.columns,
)
df_test.head()

In [None]:
# Hold out 10% of the labeled rows to evaluate the fitted models.

from sklearn.model_selection import train_test_split
# NOTE(review): LinearRegression is not used in the cells visible here.
from sklearn.linear_model import LinearRegression

# A fixed random_state makes the split, and therefore every metric computed from
# it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

### LASSO Regression

> Lasso regression is a type of linear regression that uses shrinkage. Shrinkage is where data values are shrunk towards a central point, like the mean. The lasso procedure encourages simple, sparse models (i.e. models with fewer parameters)

In [None]:
# LassoCV performs 5-fold cross-validation (cv=5) while fitting, so the
# regularization strength alpha is chosen as part of training.

from sklearn.linear_model import LassoCV

lasso = LassoCV(cv = 5)
lasso.fit(X_train,y_train)

In [None]:
# Lasso predictions for the held-out rows, shown next to the true salaries.

y_pred = lasso.predict(X_test)

pred_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
pred_df.head()

In [None]:
# Evaluating the lasso model on the held-out split

from sklearn import metrics

# Compute the MSE once and derive the RMSE from it instead of scoring twice.
lasso_mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', lasso_mse)
print('Root Mean Squared Error:', np.sqrt(lasso_mse))
print('R2 Value:', metrics.r2_score(y_test, y_pred))

> The R2 score of the model is 0.54, which is not very good, but since the dataset is quite small, this level of accuracy is acceptable.


In [None]:
# Predict Salary for the unlabeled rows (df_test, already scaled) with the fitted lasso model

df_pred = lasso.predict(df_test)

In [None]:
# Put the lasso predictions next to the scaled features of the unlabeled rows.
Predicted_df = pd.concat(
    [df_test, pd.DataFrame({"Predicted Salary": df_pred})],
    axis=1,
    sort=False,
)
Predicted_df.head()

### RIDGE Regression

> Ridge Regression is a technique for analyzing multiple regression data that suffer from multicollinearity. When multicollinearity occurs, least squares estimates are unbiased, but their variances are large so they may be far from the true value.

In [None]:
# RidgeCV performs 5-fold cross-validation (cv=5) while fitting, so the
# regularization strength alpha is chosen from its candidate values as part of training.

from sklearn.linear_model import RidgeCV

ridge = RidgeCV(cv = 5)
ridge.fit(X_train,y_train)

In [None]:
# Ridge predictions for the held-out rows, shown next to the true salaries.
# This reuses the name pred_df, replacing the lasso comparison frame.

y_pred_ = ridge.predict(X_test)

pred_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred_)
pred_df.head()

In [None]:
# Evaluating the ridge model on the held-out split

from sklearn import metrics

# Compute the MSE once and derive the RMSE from it instead of scoring twice.
ridge_mse = metrics.mean_squared_error(y_test, y_pred_)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_))
print('Mean Squared Error:', ridge_mse)
print('Root Mean Squared Error:', np.sqrt(ridge_mse))
print('R2 Value:', metrics.r2_score(y_test, y_pred_))

> The R2 score of the model is 0.53, which is not very good, but since the dataset is quite small, this level of accuracy is acceptable.


In [None]:
# Predict Salary for the unlabeled rows (df_test, already scaled) with the fitted ridge model

df_pred_ = ridge.predict(df_test)

In [None]:
# Put the ridge predictions next to the scaled features of the unlabeled rows.
Predicted_df_ = pd.concat(
    [df_test, pd.DataFrame({"Predicted Salary": df_pred_})],
    axis=1,
    sort=False,
)
Predicted_df_.head()