In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Import Libraries
Let us import the required libraries and functions

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 'Scikit-learn' (sklearn) emphasizes various regression, classification and clustering algorithms
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

# 'Statsmodels' is used to build and analyze various statistical models
import statsmodels
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.tools.eval_measures import rmse
from statsmodels.compat import lzip
from statsmodels.graphics.gofplots import ProbPlot

# 'SciPy' is used to perform scientific computations
from scipy.stats import f_oneway
from scipy.stats import jarque_bera
from scipy import stats

# Read Data

In [None]:
# Load the red-wine quality CSV from the Kaggle input folder.
df = pd.read_csv(filepath_or_buffer="../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")
# Preview the first five rows.
df.head(n=5)

# Data Analysis and Preparation

### Data preparation is the process of cleaning and transforming raw data prior to building predictive models.

##### Here we will analyze and prepare data to perform regression analysis:
1. Check dimensions of the dataframe in terms of rows and columns
2. Check the data types. Refer to the data definition to ensure your data types are correct
3. If data types are not as per business context, change the data types as per requirement
4. Study summary statistics
5. Check for missing values
6. Study correlation
7. Perform feature engineering
8. Detect outliers
9. Recheck the correlation

##### Note: It is an art to explore data and one will need more and more practice to gain expertise in this area.

### Understand the Dataset

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Data type of every column, to compare against the data definition.
df.dtypes

### Summary Statistics


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

### Missing Value

In [None]:
# Boolean mask of missing cells, computed once and reused below.
null_mask = df.isnull()

# Number of missing values per column, largest first.
Total = null_mask.sum().sort_values(ascending=False)

# Share of missing values per column in percent, largest first.
Percent = (null_mask.sum() * 100 / null_mask.count()).sort_values(ascending=False)

# Put both series side by side in one table.
missing_data = pd.concat(
    [Total, Percent],
    axis=1,
    keys=['Total', 'Percentage of Missing Values'],
)

# Show the missing-value table.
missing_data

#### Visualize the Null Values


In [None]:
# Draw the missing-value mask as a heatmap on a 15x8 inch canvas.
plt.figure(figsize=(15, 8))

# isnull() marks every missing cell as True; the colour bar is switched off.
null_cells = df.isnull()
sns.heatmap(data=null_cells, cbar=False)

# Render the figure.
plt.show()

## Checking the correlation


In [None]:
# Keep only the numeric columns ('number' is the string alias of np.number).
num_col = df.select_dtypes(include='number')
num_col.columns

In [None]:
# Pairwise Pearson correlation (the pandas default) between the numeric columns.
corr = num_col.corr(method='pearson')

# Show the correlation matrix.
corr

In [None]:
# Shared styling: fixed colour range [-1, 1] and annotated cells.
heatmap_options = dict(
    cmap='YlGnBu',
    vmin=-1.0,
    vmax=1.0,
    annot=True,
    annot_kws={"size": 15},
)

plt.figure(figsize=(15, 8))
sns.heatmap(corr, **heatmap_options)

# Title of the plot.
plt.title('Correlation between numeric features')

# Render the figure.
plt.show()

# Base Linear Model

In [None]:
from sklearn.linear_model import LinearRegression
# Target variable: alcohol content.
y = df['alcohol']

# Predictors: every column except the target.
X = df.drop(columns='alcohol')

# Split into train and test subsets; random_state fixes the shuffle seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Baseline: ordinary least squares on all predictors of the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predictions of the baseline model on both splits.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Coefficient of determination on train and test.
r2_train, r2_test = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# Root mean squared error = sqrt(MSE) on train and test.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_train, rmse_test = np.sqrt(mse_train), np.sqrt(mse_test)

print(r2_train, r2_test)
print(rmse_train, rmse_test)

## Discover Outliers


###### Importance of detecting an outlier
An outlier is an observation that appears to deviate distinctly from other observations in the data. If the outliers are not removed, the model accuracy may decrease.

###### Recollect that one of the assumptions of Linear Regression is there should be no outliers present in the data

In [None]:
# Pairwise scatter plots of all columns, to eyeball points far from the bulk of the data.
sns.pairplot(df)

In [None]:
# Default figure size for this and later plots.
plt.rcParams.update({'figure.figsize': (18, 8)})

# One box per numeric column; points beyond the whiskers are candidate outliers.
df.boxplot()

# Render the figure.
plt.show()

### Using IQR Method

In [None]:
# First quartile (25th percentile) of every predictor;
# the target 'alcohol' is dropped from the columns first.
Q1 = df.drop(columns=['alcohol']).quantile(q=0.25)

# Third quartile (75th percentile) of every predictor.
Q3 = df.drop(columns=['alcohol']).quantile(q=0.75)

# Interquartile range per predictor.
IQR = Q3.sub(Q1)

# Show the IQR of each predictor.
print(IQR)

In [None]:
# Drop every row that has at least one value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# (df < lower) | (df > upper): element-wise outlier mask, combined with bitwise OR
# .any(axis=1): reduces across the columns, giving one True/False per row
# ~ : keeps only the rows for which that per-row flag is False
# NOTE(review): Q1/Q3/IQR were computed without 'alcohol', so the comparison relies on
# label alignment and the target column presumably never flags a row - confirm.
# NOTE(review): this rebinds `df`; every later cell sees the filtered frame.
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

In [None]:
# (rows, columns) after the outlier filter, to see how many rows were removed.
df.shape

### Rechecking after Outlier Removal


In [None]:
# Default figure size for the re-check plot.
plt.rcParams.update({'figure.figsize': (15, 8)})

# Box plots of the filtered frame, to re-check for outliers.
df.boxplot()

# Render the figure.
plt.show()

### Recheck the Correlation
##### Recheck the correlation after treating outliers. An outlier might either decrease or increase a correlation coefficient, depending on where it is in relation to the other points

In [None]:
# Pairwise Pearson correlation (the pandas default) on the filtered frame.
corr = df.corr(method='pearson')

# Show the correlation matrix.
corr

In [None]:
# Same styling as the earlier heatmap: colour range [-1, 1], annotated cells.
heatmap_options = dict(
    cmap='YlGnBu',
    vmin=-1.0,
    vmax=1.0,
    annot=True,
    annot_kws={"size": 15},
)
sns.heatmap(corr, **heatmap_options)

# Title of the plot.
plt.title('Correlation between numeric features')

# Render the figure.
plt.show()

## Linear Regression (OLS)

In [None]:
# Rebuild predictors and target from the current `df`. The earlier X / y were
# created before the outlier filter rebound `df`, so reusing them would fit
# this model (and the cells that follow) on the unfiltered rows.
y = df['alcohol']
X = df.drop('alcohol', axis=1)

# statsmodels does not add an intercept on its own: prepend a constant column.
Xc = sm.add_constant(X)
ol = sm.OLS(y, Xc).fit()

# print the summary output
print(ol.summary())

### VIF 

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# Variance inflation factor of every column of the design matrix
# (including the constant column), computed column by column.
design = Xc.values
vf = []
for position in range(design.shape[1]):
    vf.append(vif(design, position))

pd.DataFrame({'vif': vf}, index=Xc.columns)

# Feature Engineering


### RFE -Recursive Feature Elimination

In [None]:
# Snapshot the column names first so the loop does not see the columns it adds.
cols = X.columns.tolist()

# Add a squared copy of every predictor, named '<column>_2'.
for base_name in cols:
    X[f'{base_name}_2'] = X[base_name].pow(2)
X.head()

In [None]:
# Shape of the expanded feature matrix.
# NOTE(review): this is not the last expression of the cell, so Jupyter will not display it.
X.shape

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

# Recursive feature elimination around a plain linear model, keeping 10 features.
lir = LinearRegression()
rfe = RFE(estimator=lir, n_features_to_select=10)
rfe.fit(X, y)

# Table of RFE ranks per feature, sorted ascending by rank.
ranking_table = pd.DataFrame({'select': rfe.ranking_}, index=X.columns)
ranking_table.sort_values(by='select')

In [None]:
# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit RFE for 1..no_of_cols retained features and record the score on both splits.
no_of_cols = 12
train_score, test_score = [], []

for n_keep in range(1, no_of_cols + 1):
    lir = LinearRegression()
    rfe = RFE(lir, n_features_to_select=n_keep)
    rfe.fit(X_train, y_train)

    train_score.append(rfe.score(X_train, y_train))
    test_score.append(rfe.score(X_test, y_test))


In [None]:
# Train scores in green, test scores in red, one point per RFE run.
plt.plot(train_score, color='g')
plt.plot(test_score, color='r')
plt.show()

In [None]:
# Label every test score with the number of features RFE kept for that run.
# The original computed `idx` but never used it, so the table kept a 0-based
# index and each row label was one less than the feature count.
idx = np.arange(1, len(test_score) + 1)
rf = pd.DataFrame(
    {'r-sq': test_score},
    index=pd.Index(idx, name='n_features'),
).sort_values(by='r-sq', ascending=False)
rf

### RFECV - Recursive Feature Elimination with Cross-Validation

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
# RFE with 3-fold cross-validation; subsets are scored by R^2.
lir = LinearRegression()
rfecv = RFECV(estimator=lir, cv=3, scoring='r2')
rfecv.fit(X, y)

In [None]:
# Mean CV score per feature-subset size.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; newer
# versions expose the same numbers as cv_results_['mean_test_score'].
rfecv_scores = (
    rfecv.cv_results_['mean_test_score']
    if hasattr(rfecv, 'cv_results_')
    else rfecv.grid_scores_
)
rfecv_scores

In [None]:
# Mean CV score against the number of features kept.
# `grid_scores_` was removed in scikit-learn 1.2, so prefer `cv_results_` when present.
# The x-axis is derived from the score count instead of a hard-coded range(1, 23).
rfecv_scores = (
    rfecv.cv_results_['mean_test_score']
    if hasattr(rfecv, 'cv_results_')
    else rfecv.grid_scores_
)
plt.plot(range(1, len(rfecv_scores) + 1), rfecv_scores)

In [None]:
# Table of mean CV scores indexed by the number of features kept, best first.
# `grid_scores_` was removed in scikit-learn 1.2, so prefer `cv_results_` when present.
# The index is derived from the score count instead of a hard-coded range(1, 23).
rfecv_scores = (
    rfecv.cv_results_['mean_test_score']
    if hasattr(rfecv, 'cv_results_')
    else rfecv.grid_scores_
)
rf = pd.DataFrame(rfecv_scores, index=range(1, len(rfecv_scores) + 1), columns=['scores'])
rf.sort_values(by='scores', ascending=False)

In [None]:
# RFECV rank of every feature, sorted ascending by rank.
rfecv_ranks = pd.DataFrame({'select': rfecv.ranking_}, index=X.columns)
rfecv_ranks.sort_values(by='select')


In [None]:
# Target and predictors; 'total sulfur dioxide' is left out of this model.
y = df['alcohol']
X1 = df.drop(columns=['alcohol', 'total sulfur dioxide'])

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.3, random_state=42)

# Linear model with intercept, fitted on the training split.
lir = LinearRegression(fit_intercept=True)
lir.fit(X_train, y_train)

# Training-set fit quality.
y_train_pred = lir.predict(X_train)
r2_Train = r2_score(y_train, y_train_pred)
mse_Train = mean_squared_error(y_train, y_train_pred)
rmse_Train = np.sqrt(mse_Train)
print('r2-Train: ', r2_Train, 'rmse_Train: ', rmse_Train)

# Held-out fit quality.
y_test_pred = lir.predict(X_test)
r2_Test = r2_score(y_test, y_test_pred)
mse_Test = mean_squared_error(y_test, y_test_pred)
rmse_Test = np.sqrt(mse_Test)
print('r2-Test: ', r2_Test, 'rmse_Test: ', rmse_Test)


### Forward Selection Approaches 


In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Rebuild target and predictors from the current frame.
X = df.drop(columns='alcohol')
y = df['alcohol']

# Snapshot the column names first so the loop does not see the columns it adds.
cols = X.columns.tolist()

# Add a squared copy of every predictor, named '<column>_2'.
for base_name in cols:
    X[f'{base_name}_2'] = X[base_name].pow(2)
X.head()

In [None]:
# Forward selection up to 22 features, 3-fold CV scored by R^2, with progress output.
lir = LinearRegression()
sfs1 = sfs(
    lir,
    k_features=22,
    forward=True,
    scoring='r2',
    cv=3,
    verbose=2,
).fit(X, y)

In [None]:
# Turn the selector's `subsets_` mapping into a table and transpose it so each
# row is one selection step; later cells read its 'avg_score' and 'feature_names' columns.
# NOTE(review): presumably the row index is the number of selected features - confirm against mlxtend docs.
sf=pd.DataFrame(sfs1.subsets_).T
sf

In [None]:
# Average CV score of each forward-selection step against the table index.
step_index = sf.index
step_scores = sf['avg_score']

plt.figure(figsize=(10, 5))
plt.plot(step_index, step_scores)
plt.xlabel('number of features')
plt.ylabel('r-square')
plt.show()

In [None]:
# Feature names of the step(s) whose average CV score is the maximum.
sf.loc[sf['avg_score'] == sf['avg_score'].max(), 'feature_names']

In [None]:
# Re-run forward selection, stopping at 13 features (3-fold CV, scored by R^2).
sfs1 = sfs(
    lir,
    k_features=13,
    forward=True,
    scoring='r2',
    cv=3,
).fit(X, y)

In [None]:
# Names of the features kept by the 13-feature forward selection, as a plain list
# so it can be used to index the columns of X.
selected_features=list(sfs1.k_feature_names_)
selected_features

In [None]:
# Target, and only the predictors picked by forward selection.
y = df['alcohol']
X1 = X[selected_features]

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.3, random_state=42)

# Linear model with intercept, fitted on the training split.
lir = LinearRegression(fit_intercept=True)
lir.fit(X_train, y_train)

# Training-set fit quality.
y_train_pred = lir.predict(X_train)
r2_Train = r2_score(y_train, y_train_pred)
mse_Train = mean_squared_error(y_train, y_train_pred)
rmse_Train = np.sqrt(mse_Train)
print('r2-Train: ', r2_Train, 'rmse_Train: ', rmse_Train)

# Held-out fit quality.
y_test_pred = lir.predict(X_test)
r2_Test = r2_score(y_test, y_test_pred)
mse_Test = mean_squared_error(y_test, y_test_pred)
rmse_Test = np.sqrt(mse_Test)
print('r2-Test: ', r2_Test, 'rmse_Test: ', rmse_Test)


# Regularization


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split,cross_val_score,KFold,LeaveOneOut
# `load_boston` was removed in scikit-learn 1.2 and importing it raises ImportError
# there, which would abort this cell before Lasso/LassoCV/... below are imported.
# No visible cell in this notebook uses the Boston data, so degrade gracefully
# instead of failing.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    boston = None
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.linear_model import Lasso,LassoCV,Ridge,RidgeCV,ElasticNet,ElasticNetCV

### Lasso regression


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the predictors so the L1 penalty treats every coefficient on the same scale.
ss = StandardScaler()
Xs = ss.fit_transform(X)
Xs = pd.DataFrame(Xs, columns=X.columns)

# Fit on the standardized matrix. The original fitted on the raw X although the
# next cell predicts on Xs, so its coefficients did not match the data they were
# applied to.
lasso = Lasso(alpha=0.1, max_iter=10000)
lasso.fit(Xs, y)

# Coefficients per (standardized) feature; exact zeros are features Lasso dropped.
pd.DataFrame(lasso.coef_, index=Xs.columns, columns=['coef'])

In [None]:
# In-sample R^2 of the Lasso model. Predictions are made on the standardized
# matrix Xs, so the score is only meaningful for a model fitted on Xs as well.
y_pred=lasso.predict(Xs)
r2_score(y,y_pred)

#### Simulation to understand the impact of alpha on the coefficients

In [None]:
# Scratch comparison of a linear grid and a log grid; neither value is stored or displayed.
np.linspace(1, 10, 10)
np.logspace(-4, 1, 10)

# Ten penalties spaced logarithmically between 1e-3 and 1e-1.
alphas = np.logspace(-3, -1, num=10)

# Refit Lasso on the standardized predictors for each penalty and keep the coefficients.
coefs = []
for penalty in alphas:
    lasso = Lasso(alpha=penalty, max_iter=10000)
    coefs.append(lasso.fit(Xs, y).coef_)

# One line per feature: coefficient value as a function of alpha.
plt.figure(figsize=(10, 5))
plt.plot(alphas, coefs)
plt.xlabel('alphas')
plt.ylabel('coeffecients')

### Tuning to find the best alpha

In [None]:
# Fifty candidate penalties spaced logarithmically between 1e-3 and 1.
alphas = np.logspace(-3, 0, num=50)

# 3-fold cross-validated Lasso over the candidate grid, on the standardized predictors.
lassocv_options = dict(alphas=alphas, cv=3, max_iter=10000, random_state=5)
lassocv = LassoCV(**lassocv_options)
lassocv.fit(Xs, y)

In [None]:
# Penalty chosen by the cross-validated search above.
lassocv.alpha_

In [None]:
# Refit Lasso with the cross-validated penalty on the standardized predictors.
# alpha_ was tuned on Xs and the next cell predicts on Xs; the original fitted
# on the raw X, so the penalty and the predictions did not match the fitted model.
lasso = Lasso(alpha=lassocv.alpha_, max_iter=10000)
lasso.fit(Xs, y)

# Coefficients per (standardized) feature; exact zeros are features Lasso dropped.
pd.DataFrame(lasso.coef_, index=Xs.columns, columns=['coef'])

In [None]:
# In-sample R^2 of the tuned Lasso model. Predictions are made on the standardized
# matrix Xs, so the score is only meaningful for a model fitted on Xs as well.
y_pred=lasso.predict(Xs)
r2_score(y,y_pred)

## Rechecking the Linear OLS Model

In [None]:
# statsmodels does not add an intercept on its own: prepend a constant column.
Xc = sm.add_constant(X)

# Ordinary least squares of the target on the current feature matrix.
ol = sm.OLS(endog=y, exog=Xc).fit()

# Show the regression summary.
print(ol.summary())

### Finally, I got a better model compared to the base model I created.
### The R2 value is now 76% with the above linear regression process, which indicates a good fit.