# Index
1. Import Data & Packages


2. Analysis of Wine Quality


3. Data Quality & Missing Values


4. Exploratory Data Analysis


5. Correlation

    5.1 Red Wine
    
    5.2 White Wine
    
    5.3 Differences between White and Red Correlations
    
    
6. Predicting Quality: Linear Regression

    6.1 Split of Training and Hold-Out Data(80-20)
      
    6.2 LinReg on Data for both Wines
        
        6.2.1 White Wine
        
        6.2.2 Red Wine
        
    6.3 Combine Result for Testing.

# Introduction
The following notebook contains the steps enumerated below for analyzing characteristics of red and white variants of the Portuguese "Vinho Verde" wine. Quality is based on sensory scores (median of at least 3 evaluations made by wine experts). Each expert graded the wine quality between 0 (very bad) and 10 (very excellent).

# 1. Import Data & Packages

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import preprocessing
import matplotlib.pyplot as plt 

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error

%matplotlib inline

import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

# 2. Analysis of Wine Quality

In [None]:
# Read the red and white wine-quality CSV files into one DataFrame each.
df_wine_red, df_wine_white = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/winedata/winequality_red.csv",
        "/kaggle/input/winedata/winequality_white.csv",
    )
)

In [None]:
# Preview only the first rows; displaying the entire frame floods the cell output.
df_wine_red.head()

In [None]:
# Tag every row with its wine colour so the two sources stay distinguishable
# after they are combined into a single frame.
df_wine_red = df_wine_red.assign(Color='Red')
df_wine_white = df_wine_white.assign(Color='White')

In [None]:
# Stack the red and white rows into one frame. ignore_index=True gives every row
# a unique label; without it the labels of both inputs are kept and repeat.
df_wine = pd.concat([df_wine_red, df_wine_white], axis=0, ignore_index=True)

In [None]:
# Quick look at the first three rows of the combined frame.
df_wine.head(n=3)

In [None]:
# Column names as loaded (before the spaces are replaced with underscores).
df_wine.columns.values

In [None]:
# Replace the spaces in the multi-word column names with underscores so the
# columns can be written as plain identifiers (e.g. inside formula strings).
spaced_names = ['fixed acidity', 'volatile acidity', 'citric acid',
                'residual sugar', 'free sulfur dioxide', 'total sulfur dioxide']
df_wine = df_wine.rename(
    columns={old: old.replace(' ', '_') for old in spaced_names}
)

In [None]:
# Column names again, to confirm the renaming took effect.
df_wine.columns.values

# 3. Data Quality & Missing Values

In [None]:
# Count missing values per column. A row-by-row boolean frame is truncated on
# display and cannot show whether anything is missing; per-column totals can.
df_wine.isnull().sum()

There are no missing values in this dataset.

In [None]:
# Column dtypes, non-null counts and memory usage of the combined frame.
df_wine.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_wine.describe()

# 4. Exploratory Data Analysis

In [None]:
# Number of wines (red and white together) at each quality score.
sns.countplot(data=df_wine, x='quality')

###  Red vs. White wines

In [None]:
# Mean quality score per colour, collected into a small two-row summary table.
red_mean = df_wine_red['quality'].mean()
white_mean = df_wine_white['quality'].mean()
mean_dic = {'Color': ['Red', 'White'], 'Mean': [red_mean, white_mean]}
df_mean_qua = pd.DataFrame(mean_dic)
# Displayed indexed by colour; the result is not assigned, so df_mean_qua
# itself keeps 'Color' as an ordinary column.
df_mean_qua.set_index('Color')

In [None]:
# Bar chart comparing the mean quality of red and white wines.
df_mean_qua.plot(kind='bar', x='Color', figsize=(10, 7), legend=False)
plt.title('Average Quality by Wine Color')
# The x axis carries the colour categories; the mean quality is on the y axis.
plt.xlabel('Wine Color')
plt.ylabel('Average Quality')

In [None]:
# Plotting frame: fixed acidity, pH, a red-wine flag (True for red, False for
# white) and the quality score stored under the name 'size'.
df_Ph = pd.DataFrame({
    'f_ac': df_wine['fixed_acidity'],
    'pH': df_wine['pH'],
    'color': df_wine['Color'].eq('Red'),
    'size': df_wine['quality'],
})


In [None]:
# Fixed acidity against pH; point colour follows the boolean 'color' column
# (True marks red wine).
plt.scatter(x='f_ac', y='pH', c='color', alpha=0.65, data=df_Ph)
plt.xlabel("fixed_acidity", fontsize=16)
plt.ylabel("pH", fontsize=16)


# 5. Correlation 

###            Red Wine

In [None]:
# Heatmap of the pairwise correlations between the numeric red-wine columns.
# Use the Axes returned by plt.subplots directly, so only one Axes is on the figure.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Red Wine Characteristic Correlation Heatmap")
# Restrict to numeric columns: the text 'Color' column cannot be correlated and
# makes DataFrame.corr() raise on recent pandas versions.
corr = df_wine_red.select_dtypes(include='number').corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="Reds",
            ax=ax)
plt.show()

###  White Wine

In [None]:
# Heatmap of the pairwise correlations between the numeric white-wine columns.
# Use the Axes returned by plt.subplots directly, so only one Axes is on the figure.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("White Wine Characteristic Correlation Heatmap")
# Restrict to numeric columns: the text 'Color' column cannot be correlated and
# makes DataFrame.corr() raise on recent pandas versions.
corr = df_wine_white.select_dtypes(include='number').corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="Purples",
            ax=ax)
plt.show()

### Differences between White and Red Correlations 

In [None]:
# Correlation matrix of the numeric red-wine columns. The text 'Color' column is
# excluded first because it cannot be correlated.
df_red_corr = df_wine_red.select_dtypes(include='number').corr()
df_red_corr

In [None]:
# Correlation matrix of the numeric white-wine columns. The text 'Color' column
# is excluded first because it cannot be correlated.
df_white_corr = df_wine_white.select_dtypes(include='number').corr()
df_white_corr

In [None]:
# Element-wise difference of the two correlation matrices (red minus white).
diff_corr = df_red_corr.sub(df_white_corr)
diff_corr

In [None]:
# Heatmap of the red-minus-white correlation differences.
# Use the Axes returned by plt.subplots directly, so only one Axes is on the figure.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Correlation Differences between Red and White Wines")
corr = diff_corr
# center=0 pins the neutral colour of the diverging map to "no difference", so
# the two hues separate positive from negative differences.
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="coolwarm",
            center=0,
            ax=ax)
plt.show()

### As you can see, the features that correlate with the quality of red wine are:

* fixed_acidity
* volatile_acidity
* citric_acid
* chlorides
* total_sulfur_dioxide
* density
* sulphates
* alcohol

# 6. Predicting Quality: Linear Regression

###    Split of Training and Hold-Out Data(80-20)

In [None]:
# Apply the space-to-underscore column renaming to both per-colour frames,
# using one shared mapping instead of repeating it for each frame.
underscore_names = {
    'fixed acidity': 'fixed_acidity',
    'citric acid': 'citric_acid',
    'volatile acidity': 'volatile_acidity',
    'residual sugar': 'residual_sugar',
    'free sulfur dioxide': 'free_sulfur_dioxide',
    'total sulfur dioxide': 'total_sulfur_dioxide',
}
for per_color_frame in (df_wine_white, df_wine_red):
    per_color_frame.rename(columns=underscore_names, inplace=True)

In [None]:
# One-hot encode the colour into Color_Red / Color_White indicator columns.
# dtype=int keeps the indicators numeric (0/1) on every pandas version; newer
# versions default to booleans, which formula-based models treat as categorical.
df = pd.get_dummies(df_wine, columns=["Color"], dtype=int)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split into training and hold-out rows. The fixed random_state (4) makes
# the split, and therefore every result derived from it, reproducible.
train, test = train_test_split(df, random_state=4, test_size=0.2)

In [None]:
# OLS regression of quality on the chemical features plus the red-wine indicator.
# Fit on the training split only, so the hold-out rows stay unseen and this
# summary describes the same data the scikit-learn model below is trained on.
ols_formula = (
    'quality ~ total_sulfur_dioxide + free_sulfur_dioxide + residual_sugar'
    ' + fixed_acidity + volatile_acidity + alcohol + sulphates + pH + density'
    ' + Color_Red'
)
results1 = smf.ols(ols_formula, data=train).fit()
print(results1.summary())

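Note the warning printed at the bottom of the regression summary above (statsmodels typically flags a large condition number here, which can indicate multicollinearity).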

We'll see how our out-of-sample test results perform (if there's a lot of multicollinearity present, we'd expect to see decreased performance)

In [None]:
# Feature matrix X and target vector y taken from the training split.
cols = [
    "total_sulfur_dioxide", "free_sulfur_dioxide", "residual_sugar",
    "fixed_acidity", "volatile_acidity", "alcohol", "sulphates", "pH",
    "density", "Color_Red",
]
X = train.loc[:, cols]
y = train.loc[:, "quality"]

In [None]:
# Linear regression fitted on the training features; fit() returns the
# estimator itself, so construction and fitting can be chained.
regr = linear_model.LinearRegression().fit(X, y)
regr

In [None]:
# Mean squared error on the same rows the model was fitted on.
ytrain_pred = regr.predict(X)
train_mse = mean_squared_error(y, ytrain_pred)
print("In-sample Mean squared error: %.2f" % train_mse)

In [None]:
# Feature matrix and target vector taken from the hold-out split, using the
# same feature list as for training.
cols = [
    "total_sulfur_dioxide", "free_sulfur_dioxide", "residual_sugar",
    "fixed_acidity", "volatile_acidity", "alcohol", "sulphates", "pH",
    "density", "Color_Red",
]
Xtest = test.loc[:, cols]
ytest = test.loc[:, "quality"]

In [None]:
# Mean squared error on the hold-out rows.
ypred = regr.predict(Xtest)
test_mse = mean_squared_error(ytest, ypred)
print("Out-of-sample Mean squared error: %.2f" % test_mse)

The out-of-sample MSE isn't too much higher than the train sample, which is a good indication that there isn't too much overfitting in our model.

# 6.2 LinReg on Data for both Wines

## White Wine Model 

In [None]:
# 80/20 split of the white-wine rows. A fixed random_state makes the split
# reproducible, matching the seed used for the combined-data split.
w_train, w_test = train_test_split(df_wine_white, test_size=0.2, random_state=4)

In [None]:
# OLS regression of white-wine quality on the chemical features.
# Fit on the training split only, so the hold-out rows stay unseen.
white_formula = (
    'quality ~ free_sulfur_dioxide + residual_sugar + fixed_acidity'
    ' + volatile_acidity + alcohol + sulphates + pH + density'
)
results_w = smf.ols(white_formula, data=w_train).fit()
print(results_w.summary())

In [None]:
# White-wine training features and target.
cols_w = [
    "free_sulfur_dioxide", "residual_sugar", "fixed_acidity",
    "volatile_acidity", "alcohol", "sulphates", "pH", "density",
]
X_w = w_train.loc[:, cols_w]
y_w = w_train.loc[:, "quality"]
# The shared `regr` estimator is fitted again, now on the white-wine rows only.
regr.fit(X_w, y_w)

In [None]:
# Mean squared error of the white-wine model on its own training rows.
ytrain_predw = regr.predict(X_w)
white_train_mse = mean_squared_error(y_w, ytrain_predw)
print("In-sample Mean squared error: %.2f" % white_train_mse)

In [None]:
# Mean squared error of the white-wine model on the white hold-out rows.
Xtestw = w_test.loc[:, cols_w]
ytestw = w_test.loc[:, "quality"]
ypredw = regr.predict(Xtestw)
white_test_mse = mean_squared_error(ytestw, ypredw)
print("Out-of-sample Mean squared error: %.2f" % white_test_mse)

## Red Wine Model 

In [None]:
# 80/20 split of the red-wine rows. A fixed random_state makes the split
# reproducible, matching the seed used for the combined-data split.
r_train, r_test = train_test_split(df_wine_red, test_size=0.2, random_state=4)

In [None]:
# OLS regression of red-wine quality on the chemical features.
# Fit on the training split only, so the hold-out rows stay unseen.
red_formula = (
    'quality ~ free_sulfur_dioxide + residual_sugar + fixed_acidity'
    ' + volatile_acidity + alcohol + sulphates + pH + density'
)
results_r = smf.ols(red_formula, data=r_train).fit()
print(results_r.summary())

In [None]:
# Red-wine training features and target.
cols_r = [
    "free_sulfur_dioxide", "residual_sugar", "fixed_acidity",
    "volatile_acidity", "alcohol", "sulphates", "pH", "density",
]
X_r = r_train.loc[:, cols_r]
y_r = r_train.loc[:, "quality"]
# The shared `regr` estimator is fitted again, now on the red-wine rows only.
regr.fit(X_r, y_r)

In [None]:
# Mean squared error of the red-wine model on its own training rows.
ytrain_predr = regr.predict(X_r)
red_train_mse = mean_squared_error(y_r, ytrain_predr)
print("In-sample Mean squared error: %.2f" % red_train_mse)

In [None]:
# Mean squared error of the red-wine model on the red hold-out rows.
Xtestr = r_test.loc[:, cols_r]
ytestr = r_test.loc[:, "quality"]
ypredr = regr.predict(Xtestr)
red_test_mse = mean_squared_error(ytestr, ypredr)
print("Out-of-sample Mean squared error: %.2f" % red_test_mse)

### Combine result for testing

In [None]:
# Training targets of both colours in one series: white rows first, then red.
y_both = pd.concat([y_w, y_r], axis=0)

In [None]:
# Wrap each model's training predictions in a DataFrame that carries the row
# labels of the matching targets, so predictions and targets stay aligned by
# label as well as by position.
ytrain_predW = pd.DataFrame(ytrain_predw, index=y_w.index)
ytrain_predR = pd.DataFrame(ytrain_predr, index=y_r.index)

# Same order as the combined targets: white rows first, then red.
y_train_predboth = pd.concat([ytrain_predW, ytrain_predR])

In [None]:
# Training error of the two per-colour models taken together.
both_train_mse = mean_squared_error(y_both, y_train_predboth)
print("In-sample Mean squared error: %.2f" % both_train_mse)

In [None]:
# Hold-out targets and features of both colours: white rows first, then red.
ytestboth = pd.concat([ytestw, ytestr])
Xtestboth = pd.concat([Xtestw, Xtestr])

# Give the predictions the row labels of the matching targets, so they stay
# aligned by label as well as by position.
ypredboth = pd.concat([
    pd.DataFrame(ypredw, index=ytestw.index),
    pd.DataFrame(ypredr, index=ytestr.index),
])
# Hold-out error of the two per-colour models taken together.
print("Out-of-sample Mean squared error: %.2f"
      % mean_squared_error(ytestboth, ypredboth))
