In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import necessary libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

### Data Preprocessing
#### 1 - Reading Input data for White Wine

In [None]:
# Load the semicolon-delimited white wine CSV and preview its first rows.
white_wine_csv = "/kaggle/input/white-wine-quality/winequality-white.csv"
df = pd.read_csv(white_wine_csv, delimiter=";")
df.head(n=5)

In [None]:
# Report the dataset dimensions.
row_count, column_count = df.shape
print("Our white wine dataset has: {0} rows and {1} columns".format(row_count, column_count))

#### 2 - Identify NULL Values (if any)

In [None]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

#### 3 - There are no columns which are unique identifiers. Hence, no column will be removed

In [None]:
# Summary statistics for every column.
summary_statistics = df.describe()
summary_statistics

#### 4 - Measure skewness in our dataset

In [None]:
# Palette cycled through by the box plots below.
colors = ['#78C850', '#F08030', '#6890F0','#F8D030', '#F85888', '#705898', '#98D8D8']

# Skewness of every column. Printed explicitly because a bare expression that
# is not the last statement of a cell produces no output.
print(df.skew())

# One box plot per column, laid out on a two-column grid.
l = df.columns.values
number_of_columns = 2
# Integer ceiling division: matplotlib requires integer grid dimensions, and
# rounding up guarantees a slot for every column.
number_of_rows = -(-len(l) // number_of_columns)

# Set the style before the axes are created so that it applies to them.
sns.set_style('whitegrid')
fig, axes = plt.subplots(
    number_of_rows,
    number_of_columns,
    figsize=(10 * number_of_columns, 5 * number_of_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
axes_flat = axes.ravel()

for i, (ax, column) in enumerate(zip(axes_flat, l)):
    # Data is passed by keyword (`y=` gives a vertical box) and colours are
    # cycled deterministically so the figure is identical on every run.
    sns.boxplot(y=df[column], color=colors[i % len(colors)], ax=ax)
    ax.set_title(column)

# Hide any grid cells left over when the column count is odd.
for unused_ax in axes_flat[len(l):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

#### 5 - Understanding correlation between variables
#### <u>Observations</u>:
##### 1) Acidity in wine decreases, as we increase the pH value
##### 2) Wine density/thickness increases rapidly as the content of residual sugar increases
##### 3) Alcohol content is negatively correlated with residual sugar. Hence, wines with a higher sugar content tend to have a lower alcohol content
##### 4) Wine quality increases, if Alcohol content increases

Hence, the right amount of acids, residual sugar and alcohol, is what will improve our wine quality

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title("Correlation between Variables")
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
plt.show()

#### Understanding correlation between alcohol and wine quality

In [None]:
# Scatter plot of alcohol content against wine quality.
# The data is passed by keyword: recent seaborn releases no longer accept
# x and y as positional arguments.
ax = sns.scatterplot(data=df, x="quality", y="alcohol")
ax.set_title("Alcohol content vs. wine quality")
plt.show()

#### 6 - Model Development

In [None]:
from sklearn.model_selection import train_test_split

####################################
# Step 4: Data Sampling

# Take all columns in X except our target variable.
# Selecting by name (rather than by position) keeps this correct if the column
# order changes, and `drop` returns a new frame instead of a slice of `df`.
df_x = df.drop(columns="quality")

# Take target variable in Y
df_y = df["quality"]

####################################
# Step 5: Train-Test split

# Split 20% of data in test data and rest 80% in train. i.e. test_size = .2
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across "Restart & Run All".
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(
    df_x, df_y, test_size=.2, random_state=42
)

In [None]:
# Preview the first rows of the feature matrix.
df_x.head(n=5)

In [None]:
# Preview the first values of the target variable.
df_y.head(n=5)

### Linear Regression, since this is a regression problem
#### We will compute the MSE for each model and use whichever model gives the MSE closest to 0. This ensures that the prediction error of our model is the lowest

In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

# Create an object using sklearn's LinearRegression() method
model = linear_model.LinearRegression()

# Use Cross-Validation technique for 5-folds to estimate the MSE.
# The "neg_mean_squared_error" scorer returns the MSE with its sign flipped
# (so that larger is better); negate it to get the actual, non-negative MSE.
mse = -cross_val_score(model, df_x, df_y, scoring="neg_mean_squared_error", cv=5)
mean_mse = np.mean(mse)
print(mean_mse)

#### This MSE is very near to 0. However, let us check whether Lasso/Ridge regularization can reduce it further

#### 1 - Ridge

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Grid-search the Ridge regularization strength with 5-fold cross-validation.
ridge = Ridge()
# alpha must be non-negative for Ridge, so the invalid candidate -1 is not
# part of the grid (alpha=0 is plain least squares).
parameters={"alpha": [0, 0.5, 1]}
ridge_regressor = GridSearchCV(ridge, parameters, scoring="neg_mean_squared_error", cv=5)
ridge_regressor.fit(df_x, df_y)

print(ridge_regressor.best_params_)
# best_score_ is the negated MSE (the scorer is "neg_mean_squared_error");
# flip the sign to report the actual MSE for ridge.
print(-ridge_regressor.best_score_)

#### 2- Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Grid-search the Lasso regularization strength with 5-fold cross-validation.
lasso = Lasso()
# Only strictly positive alphas: negative values are invalid, and alpha=0 is
# discouraged for Lasso (it degenerates to unregularized least squares, which
# the coordinate-descent solver handles poorly).
parameters={"alpha": [0.01, 0.1, 0.5, 1]}
lasso_regressor = GridSearchCV(lasso, parameters, scoring="neg_mean_squared_error", cv=5)
lasso_regressor.fit(df_x, df_y)

print(lasso_regressor.best_params_)
# best_score_ is the negated MSE (the scorer is "neg_mean_squared_error");
# flip the sign to report the actual MSE for lasso.
print(-lasso_regressor.best_score_)

### The lowest MSE is observed with Ridge Regression, so we will use Ridge as our final regression model

In [None]:
# Re-run the Ridge grid search on the training split only, so the test split
# stays unseen by the model.
ridge_regressor.fit(df_x_train, df_y_train)

# Predict our test data
pred_test = ridge_regressor.predict(df_x_test)

# Predict our train data
pred_train = ridge_regressor.predict(df_x_train)


####################################
# Step 6: Accuracy and Evaluation Metrics

# Evaluation Metrics
# The grid search was built with scoring="neg_mean_squared_error", so its own
# score() would return a negated MSE rather than R^2. The refit best estimator
# is a plain regressor whose score() is the coefficient of determination.
best_ridge = ridge_regressor.best_estimator_
Rsquare = best_ridge.score(df_x_train, df_y_train)
print("RSquare: " + str(Rsquare))

# R^2 on the held-out split shows how well the model generalizes.
Rsquare_test = best_ridge.score(df_x_test, df_y_test)
print("Test RSquare: " + str(Rsquare_test))


K = df_x_train.shape[1]  # Total no. of Columns in Train data
N = df_x_train.shape[0]  # Total no. of Rows/Values/Observations

# Adjusted R^2 penalizes R^2 for the number of predictors K given N observations.
Adj_Rsquare = 1 - (1 - Rsquare) * (N - 1) / (N - K - 1)
print("Adjusted RSquare: " + str(Adj_Rsquare))

### 7 - Estimate prediction error rate for our chosen Model
#### <i>It must be between -3 to +3</i>

In [None]:
# Prediction error = Actual - Predicted values
# Find error in train data prediction
error_pred = df_y_train - pred_train

# Plot the error distribution with a density curve and mark the (-3, 3) range
# so it is easy to see whether the errors fall within it.
# histplot(kde=True) replaces the deprecated distplot.
ax = sns.histplot(error_pred, kde=True, stat="density")
ax.axvline(x=-3, color="orange")
ax.axvline(x=3, color="g")
ax.set_title("Distribution of training prediction errors")
ax.set_xlabel("Prediction error (actual - predicted)")
plt.show()

In [None]:
# Plot model prediction accuracy
# Plot each training prediction error against its row index.
plt.figure(figsize=(8,8))
plt.plot(error_pred, "*")

# Mean prediction error
plt.axhline(y = np.mean(error_pred), color="r")

# Mark 3 and -3 limits in diff colors
plt.axhline(y = 3, color = "g")
plt.axhline(y = -3, color = "orange")

plt.title("Training prediction errors")
plt.xlabel("Row index")
plt.ylabel("Prediction error (actual - predicted)")
plt.show()

Plot Actual vs Predicted values to observe the difference

In [None]:
# Build a separate comparison table of test features, actual quality and
# predicted quality. `assign` returns a new frame, so `df_x_test` keeps exactly
# the feature columns the model was trained on and earlier cells can be re-run.
temp = df_x_test.assign(actual=df_y_test, prediction=pred_test)

temp.head(10)