In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import optimize

# Load the wine-quality CSV from the Kaggle input directory and preview its first rows.
df=pd.read_csv('/kaggle/input/wine-quality/winequalityN.csv')
df.head()

In [None]:
# Encode the wine-type labels as integers (white -> 1, red -> 0) in every object-dtype column.
# The result is assigned back to the frame on purpose: `df[i].replace(..., inplace=True)`
# operates on a selection of `df` (chained assignment), which is not guaranteed to update
# `df` itself and never does under pandas Copy-on-Write.
for i in df.columns:
    if df[i].dtype == 'O':
        # infer_objects() gives the now all-integer column a numeric dtype so that
        # numeric steps such as df.corr() can use it.
        df[i] = df[i].replace({'white': 1, 'red': 0}).infer_objects()

In [None]:
# Flag, per column, whether it contains at least one missing value.
df.isna().any()


Now I will impute the missing values using linear regression. Each column that contains nulls is treated in turn as the target column, and the features used to predict it are the other columns whose correlation with the target satisfies (corr > 0.25 | corr < -0.25).

In [None]:
# Pairwise correlation matrix of the columns, displayed so that the |corr| > 0.25
# relationships used for the imputation can be inspected.
corr_mat=df.corr()
corr_mat

In [None]:
# Work on a copy and collect the columns that hold at least one missing value.
df1 = df.copy()
has_missing = df1.isna().any()
df_null = df1.loc[:, has_missing]
null_cols = df_null.columns
null_cols

**All Functions used for imputation**

In [None]:

def normalization(X):
    """Standardize every column of X to zero mean and unit standard deviation.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix, one row per sample and one column per feature.

    Returns
    -------
    X_norm : ndarray of shape (m, n)
        Column-wise standardized copy of X.
    mu : ndarray of shape (n,)
        Column means of X.
    sigma : ndarray of shape (n,)
        Column (population) standard deviations of X. A column with zero spread
        reports 1.0 so that dividing by `sigma` is always safe.

    Raises
    ------
    ValueError
        If X is not two-dimensional.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("normalization expects a 2-D array of shape (m, n)")

    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # A constant column has zero standard deviation; dividing by it would turn the whole
    # column into NaN. Using 1.0 instead leaves such a column at exactly zero after centering,
    # and callers that reuse `sigma` on other data sets get the same protection.
    sigma = np.where(sigma == 0, 1.0, sigma)

    X_norm = (X - mu) / sigma
    return X_norm, mu, sigma

# LINEAR REGRESSION (FROM SCRATCH)

def lin_reg_CF(theta,X,y,lambda_):
    """Cost and gradient of L2-regularized (ridge) linear regression.

    Parameters
    ----------
    theta : ndarray of shape (n + 1,)
        Model parameters; theta[0] is the intercept.
    X : array of shape (m, n)
        Feature matrix without the intercept column (it is added here).
    y : ndarray of shape (m,)
        Target values.
    lambda_ : float
        Regularization strength. The intercept theta[0] is not penalized.

    Returns
    -------
    J : float
        sum((X.theta - y)^2) / (2m) + lambda_ * sum(theta[1:]^2) / (2m)
    grad : ndarray of shape (n + 1,)
        Gradient of J with respect to theta.
    """
    n_samples = X.shape[0]
    bias_column = np.ones((n_samples, 1))
    design = np.concatenate([bias_column, X], axis=1)

    residual = design.dot(theta) - y
    weights = theta[1:]  # everything except the intercept

    fit_term = (1 / (2 * n_samples)) * np.sum(np.square(residual))
    penalty = (lambda_ / (2 * n_samples)) * np.sum(np.square(weights))
    J = fit_term + penalty

    grad = np.dot(residual, design) * (1 / n_samples)
    grad[1:] = grad[1:] + (lambda_ / n_samples) * weights

    return J, grad

def prediction(X,theta):
    """Evaluate the linear model on X.

    An intercept column of ones is put in front of X, so `theta` must hold one more
    entry than X has columns (theta[0] being the intercept).

    Returns the vector of predicted values, one per row of X.
    """
    n_rows = X.shape[0]
    with_bias = np.concatenate([np.ones((n_rows, 1)), X], axis=1)
    return with_bias.dot(theta)

def preprocessing(cols,df1,target_col):
    """Build the normalized training matrix and target vector for one regression model.

    Parameters
    ----------
    cols : sequence of column labels
        Columns taking part in the model; must include `target_col`.
    df1 : pandas.DataFrame
        Source data. It is not modified.
    target_col : column label
        Column to be predicted.

    Returns
    -------
    X_norm : ndarray
        Standardized feature matrix (the columns of `cols` other than `target_col`).
    y : ndarray
        Values of `target_col` for the same rows.
    mu, sigma : ndarray
        Per-feature mean and standard deviation returned by `normalization`, to be
        reused when scaling the rows that are predicted later.
    """
    # Select the model columns first and drop incomplete rows afterwards: a row is only
    # discarded when one of the columns actually used is missing, and the caller's frame
    # is left untouched (no in-place dropna on the argument).
    new_df = df1.loc[:, cols].dropna(axis=0)

    y = new_df.loc[:, target_col].to_numpy()
    X = new_df.drop(columns=[target_col]).to_numpy()

    X_norm, mu, sigma = normalization(X)
    return X_norm, y, mu, sigma

# Imputes every missing value of the given columns with per-column linear-regression models.
def Imputation_LR(null_cols,df,corr=None,threshold=0.25):
    """Fill the missing values of each column in `null_cols` with linear-regression predictions.

    For every target column a separate model is fitted on the complete rows, using as
    features the columns whose correlation with the target is above `threshold` or below
    `-threshold`. The fitted model then predicts the target for the rows where it is missing.

    Parameters
    ----------
    null_cols : iterable of column labels
        Columns whose missing values should be imputed.
    df : pandas.DataFrame
        Data set. It is modified in place: predictions are written into it.
    corr : pandas.DataFrame, optional
        Correlation matrix used for feature selection. When omitted, `df.corr()` is
        computed once, before any value is imputed.
    threshold : float, default 0.25
        Absolute correlation a column must exceed to be used as a feature.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in, with the missing values filled.
    """
    if corr is None:
        corr = df.corr()

    for i in null_cols:
        # Rows whose value in the target column has to be predicted.
        null_rows_index = df.index[df[i].isna()]
        if len(null_rows_index) == 0:
            continue

        # Feature columns: sufficiently correlated with the target, target itself excluded.
        # Reindexing aligns the mask with df's columns by label rather than by position.
        s = ((corr[i] > threshold) | (corr[i] < -threshold)).reindex(df.columns, fill_value=False)
        feature_cols = [c for c in df.columns if s[c] and c != i]

        # preprocessing receives a copy, so the working frame is never altered by it.
        X, y, mu, sigma = preprocessing(feature_cols + [i], df.copy(), i)
        initial_theta = np.zeros(X.shape[1] + 1)

        # lin_reg_CF returns both cost and gradient (jac=True); lambda is 0, i.e. the fit
        # is plain least squares, minimized with SciPy's truncated Newton (TNC) solver.
        res = optimize.minimize(lin_reg_CF, initial_theta, (X, y, 0), jac=True, method='TNC')

        # Features of the rows to predict. Gaps in those features are filled with the
        # per-column mean; np.mean(df) must not be used here because on pandas >= 2.0 it
        # reduces over both axes and yields one grand mean for every column.
        null_rows_df = df.loc[null_rows_index, feature_cols]
        null_rows_df = null_rows_df.fillna(df[feature_cols].mean())

        # Scale with the statistics of the training rows so the inputs match the model.
        null_rows_df = (null_rows_df - mu) / sigma

        # res.x holds the fitted coefficients of the model for column i.
        null_values_prediction = prediction(null_rows_df, res.x)

        # Write the predictions into the main dataframe.
        df.loc[null_rows_index, i] = null_values_prediction
    return df

In [None]:
# Missing-value count per column before the imputation is run.
df.isnull().sum()

In [None]:
# Imputation_LR writes its predictions into `df` and returns that same frame,
# so `df_final` and `df` refer to the same (now imputed) object.
df_final=Imputation_LR(null_cols,df)

In [None]:
# Missing-value count per column after the imputation call above.
df.isnull().sum()

**Linear regression and Ridge Regression**

I will use ridge regression (see the lin_reg_CF() function for the formula), which will help us examine the model's bias and variance.

In [None]:
# Independent copy of the imputed frame for the modelling steps that follow.
df2=df.copy()

In [None]:
# Value range (max - min) of every column, to compare the scales of the features.
df2.max()-df2.min()

This is why we need normalization: the ranges of the features differ widely, and rescaling prevents features with comparatively large values from dominating features with small values.

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the regression target.
features = df2.iloc[:, :-1]
target = df2.iloc[:, -1]
X, y = features.to_numpy(), target.to_numpy()

# 60 % of the rows go to training; the remaining 40 % is halved into CV and test sets.
X_train, x_cv_test, y_train, y_cv_test = train_test_split(
    X, y, train_size=0.6, random_state=43
)
X_cv, X_test, y_cv, y_test = train_test_split(
    x_cv_test, y_cv_test, train_size=0.5, random_state=43
)

X_train.shape, X_cv.shape, X_test.shape

In [None]:
# Standardize the training features and keep their per-column mean (mu) and standard
# deviation (sigma); the CV and test sets are scaled with these training statistics.
# NOTE(review): X_cv and X_test are rescaled in place (-=, /=), so re-running this cell
# without re-running the split cell applies the scaling a second time.
m,n=X_train.shape
X_train_norm,mu,sigma=normalization(X_train)

# Scale the cross-validation features with the training mean and standard deviation
X_cv -= mu
X_cv /= sigma

# Scale the test features with the training mean and standard deviation
X_test -= mu
X_test /= sigma

In [None]:
# Train the unregularized model (lambda = 0) and report the cost on the train and CV sets.
# The solver budget is passed as 'maxfun' (maximum number of function evaluations), which is
# the option TNC understands; 'maxiters' is not a TNC option and was ignored by SciPy with
# an "Unknown solver options" warning.
m,n= X_train.shape
initial_theta=np.zeros(n+1)
res=optimize.minimize(lin_reg_CF, initial_theta, (X_train_norm,y_train,0), jac=True, method='TNC', options={'maxfun':100})

# The errors are evaluated with lambda = 0 so that only the fit term is reported.
# lin_reg_CF divides the summed squared error by 2*m, so the printed value is half the MSE.
j_train,_=lin_reg_CF(res.x, X_train_norm, y_train, 0)
j_cv,_=lin_reg_CF(res.x, X_cv, y_cv, 0)

print('MSE on train set:',j_train)
print('MSE on CV set:',j_cv)
print('Optimal parameters or cofficients of our LR model:',res.x)

As we can see, the MSE on the train set is very close to the MSE on the CV set; the two are nearly equal. This can indicate high bias, i.e. a very simple model (too little complexity in our hypothesis), in other words underfitting. We will check this by varying the regularization parameter lambda and try to improve the model's performance to reach a better fit.

NOTE: I will be using ridge linear regression; you can check the lin_reg_CF() function for the formula.

Since this might be a case of underfitting, we will look at how j_cv and j_train vary, and we expect the accuracy to improve as lambda decreases. If you are familiar with Andrew Ng's course, you will know what this means.

In [None]:
def lambda_tuning(X,y,X_cv,y_cv,lambdas=None):
    """Fit one ridge model per regularization strength and record train and CV errors.

    Parameters
    ----------
    X, y : ndarray
        Training features (already normalized) and targets.
    X_cv, y_cv : ndarray
        Cross-validation features (scaled like X) and targets.
    lambdas : array-like of float, optional
        Regularization strengths to try. Defaults to 500 evenly spaced values from
        100 to 10e5 (= 1,000,000). A small-lambda sweep can be requested with e.g.
        `np.linspace(10e-5, 1, num=500)`.

    Returns
    -------
    lambda_ : ndarray
        The regularization strengths that were tried.
    err_train, err_cv : ndarray
        Cost of each fitted model on the training and CV sets, evaluated with
        lambda = 0 so that only the fit term is measured.
    """
    n = X.shape[1]
    if lambdas is None:
        lambdas = np.linspace(100, 10e5, num=500)
    lambda_ = np.asarray(lambdas, dtype=float)

    # Collect in lists and convert once; np.append would copy the array on every step.
    train_errors = []
    cv_errors = []

    for i in lambda_:
        initial_theta = np.zeros(n + 1)
        # 'maxfun' is TNC's evaluation budget; 'maxiters' is not a TNC option and was ignored.
        res = optimize.minimize(lin_reg_CF, initial_theta, (X, y, i), jac=True, method="TNC", options={'maxfun': 100})

        j_train, _ = lin_reg_CF(res.x, X, y, lambda_=0)
        train_errors.append(j_train)

        j_cv, _ = lin_reg_CF(res.x, X_cv, y_cv, lambda_=0)
        cv_errors.append(j_cv)

    err_train = np.array(train_errors, dtype=float)
    err_cv = np.array(cv_errors, dtype=float)
    return lambda_, err_train, err_cv

In [None]:
# Sweep the regularization strengths and plot training vs. cross-validation error.
# NOTE(review): lambda_tuning is called with identical arguments in both plotting cells
# of this notebook, so both plots cover the same lambda grid.
lambda_, error_train, error_val = lambda_tuning(X_train_norm, y_train, X_cv, y_cv)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(lambda_, error_train, '-o', lambda_, error_val, '-o', lw=2)
ax.legend(['Train', 'Cross Validation'])
ax.set_xlabel('lambda')
ax.set_ylabel('Error (MSE)')
ax.grid(True)

This is the plot of MSE vs lambda. It is clear that there is no variation in j_cv and j_train when lambda is decreased down to 10e-5, so we may conclude that the lambda parameter has no effect on the errors. Usually the errors would be expected to decrease further, but instead they stay constant. I am not sure what the reason could be, but do look (in the last cell) at my model's generalization error on the test set. I think this could be because I normalized all the features at the start.

For the sake of completeness, we will now also look at how the errors vary when lambda is increased, which simply means decreasing the variance (and increasing the bias) of our model. In this case we expect to see a rise in j_cv and j_train as lambda increases.

In [None]:
# Repeat the lambda sweep and draw both error curves on a single set of axes.
lambda_, error_train, error_val = lambda_tuning(X_train_norm, y_train, X_cv, y_cv)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(lambda_, error_train, '-o', lambda_, error_val, '-o', lw=2)
ax.legend(['Train', 'Cross Validation'])
ax.set_xlabel('lambda')
ax.set_ylabel('Error (MSE)')
ax.grid(True)



Hence we may conclude that our model is working fine. We will now check the MSE of our model on the test set to estimate how well it generalizes.

In [None]:
# Cost of the trained model on the held-out test set, evaluated with lambda = 0.
# `res` is the fit from the training cell (lambda_tuning only uses a local `res`).
# lin_reg_CF divides the summed squared error by 2*m, so the printed value is half the MSE.
j_test,_=lin_reg_CF(res.x, X_test, y_test, 0)
print('MSE on test set:',j_test)

We can clearly see that the test error is nearly equal to the train error; hence we can conclude that the model generalizes well to the test set.