# Multiple Linear Regression Example

This example is focussed on predicting the quality of red wine based on different attributes of the wine.

In [None]:
# import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Visualization
import seaborn as sns #Visualization

# Enabling function autocomplete
%config Completer.use_jedi = False

In [None]:
# Step 1: Load Data
# NOTE(review): the relative "../input/..." path presumably matches a Kaggle
# notebook layout -- adjust it when running elsewhere.
df = pd.read_csv("../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (rows, columns) of the data as loaded.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Step 2: Filtering data
# dropna() and drop_duplicates() return new DataFrames rather than modifying
# df in place, so the result must be assigned back for the cleaning to apply.
df = df.dropna().drop_duplicates()

In [None]:
# Step 3: Data Visualization
# Suppress warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the plotting/ML libraries.
import warnings
warnings.filterwarnings('ignore')

# Outlier Analysis: one horizontal box plot per feature, stacked vertically.
outlier_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'pH',
    'sulphates',
    'alcohol',
]
fig, axs = plt.subplots(len(outlier_columns), figsize = (8,20))
for ax, column in zip(axs, outlier_columns):
    # Pass the Series as `x=` explicitly: recent seaborn releases interpret a
    # positional first argument as `data`, which changes the plot orientation.
    sns.boxplot(x=df[column], ax=ax).set_title(column)
plt.tight_layout()

There are outliers in different fields here in the data.

In [None]:
# Keep only the rows that sit inside the hand-picked range of every feature.
# All comparisons are strict, so rows exactly on a limit are dropped.
upper_limits = {
    'fixed acidity': 14,
    'volatile acidity': 1,
    'citric acid': 0.8,
    'residual sugar': 4,
    'chlorides': 0.2,
    'free sulfur dioxide': 40,
    'pH': 3.8,
    'sulphates': 1.25,
    'alcohol': 13,
}

# pH is the only feature with a lower limit as well as an upper one.
keep_rows = df['pH'] > 3
for column_name, limit in upper_limits.items():
    keep_rows = keep_rows & (df[column_name] < limit)

df = df[keep_rows]

In [None]:
# (rows, columns) remaining after the range filter applied in the previous cell.
df.shape

In [None]:
# Summary statistics again, now on the range-filtered data.
df.describe()

In [None]:
# Let's review the outliers again
# Same stacked box plots as before, drawn from the filtered df.
outlier_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'pH',
    'sulphates',
    'alcohol',
]
fig, axs = plt.subplots(len(outlier_columns), figsize = (8,20))
for ax, column in zip(axs, outlier_columns):
    # Pass the Series as `x=` explicitly: recent seaborn releases interpret a
    # positional first argument as `data`, which changes the plot orientation.
    sns.boxplot(x=df[column], ax=ax).set_title(column)
plt.tight_layout()

Now we've removed the outliers to a certain extent. After removing the outliers, the shape of the data has also changed. The box plots above indicate this.

In [None]:
# Let's review correlations
# Scatter every feature column against the target, quality, in a single row of plots.
feature_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
sns.pairplot(df, x_vars=feature_columns, y_vars='quality', kind='scatter')
plt.tight_layout()

No conclusion on correlation can be drawn from the above analysis.

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlation_matrix, annot=True, ax=ax)
plt.tight_layout()

There are correlations between
1) fixed acidity and citric acid

2) fixed acidity and density

3) free sulfur dioxide and total sulfur dioxide

We can ignore these correlations as the correlation coefficients are not more than 0.7.

In [None]:
from collections import Counter
# Number of rows per distinct quality value.
Counter(df['quality'])

In [None]:
# Bar chart of how many rows fall into each quality value.
sns.countplot(x='quality', data=df)

In [None]:
# Step 4: ML Model
from sklearn.model_selection import train_test_split

# Predictors: the eleven feature columns; target: the quality score.
X = df[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']]
y = df['quality']
# Hold out 30% for testing. The fixed seed makes the split, and therefore
# every metric reported below, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Peek at the first rows of the training features.
X_train.head()

In [None]:
# Peek at the matching training targets.
y_train.head()

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression of quality on the training features.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Predicted quality for the held-out test rows.
model_predictions = model.predict(X_test)

In [None]:
# LinearRegression.score reports R^2 (coefficient of determination) on the test set.
print(model.score(X_test, y_test))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score

# Error metrics for the linear model on the held-out test set.
print(mean_absolute_error(y_test, model_predictions))
mse = mean_squared_error(y_test, model_predictions)
print(mse)
print(explained_variance_score(y_test, model_predictions))
print(r2_score(y_test, model_predictions))
# RMSE is the square root of the MSE. It is computed directly because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
print(rmse)

# Using PolynomialFeatures
https://machinelearningmastery.com/polynomial-features-transforms-for-machine-learning/

In [None]:
from sklearn.preprocessing import PolynomialFeatures

degree = 2 # Start with 2
# include_bias=False leaves out the constant (all-ones) column from the
# generated polynomial features.
poly = PolynomialFeatures(degree, include_bias=False)

In [None]:
X_poly = poly.fit_transform(X) # No longer a pandas dataframe
y_poly = y # Still a pandas Series

# Use the same 30% test share as the plain linear model so the two scores are
# measured on equally sized hold-out sets, and fix the seed so the recorded
# score can be reproduced.
X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(
    X_poly, y_poly, test_size=0.3, random_state=42)

# Linear regression on the expanded (polynomial) feature matrix.
lr_poly = LinearRegression()
lr_poly.fit(X_poly_train, y_poly_train)

In [None]:
# Predicted quality for the held-out rows of the polynomial feature matrix.
poly_predictions = lr_poly.predict(X_poly_test)

In [None]:
# R^2 of the polynomial-feature model on its test set.
print(lr_poly.score(X_poly_test, y_poly_test))

When degree = 2, score = 0.35251269625913284

When degree = 3, score = -0.3196054000476425

In [None]:
# Error metrics for the polynomial-feature model on its held-out test set.
print(mean_absolute_error(y_poly_test, poly_predictions))
poly_mse = mean_squared_error(y_poly_test, poly_predictions)
print(poly_mse)
print(explained_variance_score(y_poly_test, poly_predictions))
print(r2_score(y_poly_test, poly_predictions))
# RMSE is the square root of the MSE. It is computed directly because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(poly_mse)
print(rmse)

# Conclusion

As per the predictions, the RMSE value is not very low. One thing to note in this Multiple Regression example is that 'quality' is a discrete variable with only 6 distinct values, which is why the model did not achieve a better fit even after applying PolynomialFeatures.