In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw wine-quality dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/wine-quality/winequalityN.csv')

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

# Exploratory Data Analysis

In [None]:
# (number of rows, number of columns) of the raw dataset.
df.shape

In [None]:
# Class balance check: how many rows does each wine type have?
sns.countplot(data=df, x='type')

In [None]:
# The exact per-type row counts make the class imbalance explicit.
df.type.value_counts()

In [None]:
# Balance the classes by drawing the same number of rows from each wine type.
# 1599 rows are sampled per type (presumably the size of the smaller class --
# confirm against the value_counts output); random_state=0 makes the draw
# reproducible.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Like append, it keeps the original row index.
df_train = pd.concat([
    df[df.type == 'white'].sample(1599, random_state=0),
    df[df.type == 'red'].sample(1599, random_state=0),
])

In [None]:
# Shuffle the rows so the two wine types are interleaved.
# A fixed random_state keeps the row order reproducible across
# Restart Kernel -> Run All, matching the seeded sampling used to build df_train.
df_train = df_train.sample(frac=1, random_state=0)

In [None]:
# Replace the shuffled index with a fresh 0..n-1 range, discarding the old one.
df_train = df_train.reset_index(drop=True)

In [None]:
# Preview the first five rows of the balanced, shuffled sample.
df_train.head(n=5)

In [None]:
# Row count per wine type in the resampled training frame.
sns.countplot(data=df_train, x='type')

In [None]:
# Numerical feature columns: every non-object column except 'quality'.
# Use != rather than `not in 'quality'`: applied to a string, `in` is a
# substring test, so it would also exclude any column whose name happens to be
# a substring of 'quality'.
numeric_data = [
    feature
    for feature in df_train.columns
    if df_train[feature].dtypes != 'O' and feature != 'quality'
]

In [None]:
# Count of wines at each quality score, grouped by wine type.
plt.figure(figsize=(10, 8))
sns.countplot(data=df_train, x='type', hue='quality')

In [None]:
# 5, 6 contribute to majority of distribution 
# Quality ranges from 0-10.
# We can classify them into 3 major classes i.e. 0-4 --> Bad, 5-6 --> Average, 7-10 --> Good

In [None]:
# Let's see the distribution of data with Quality ( on red and white wine )
# Also, we can develop some sense of 'Suspected outliers'

In [None]:
# For every numeric feature, plot its values against quality with one panel
# per wine type; each feature gets its own figure.
for feature in numeric_data:
    sns.catplot(data=df_train, x='quality', y=feature, col='type')
    plt.show()

#### INFERENCES b/w Red-White:
1. Fixed Acidity : Red (4 - 16) > White (4 - 10)
2. Volatile acidity: Red (0.2 - 1.4) > White (0.1 - 1.0)
3. Residual sugar : Red < White 
4. Free sulphur dioxide : Red < White
5. Total sulphur dioxide : Red < White
6. Sulphates : Red > White

#### Visually confirmed outliers:
1. White wine (Q-6) high fixed acidity ( could be red )
2. White wine (Q-6) residual sugar ( neither red nor white has similar value )

In [None]:
# For every numeric feature, draw a box plot of its distribution at each
# quality score (both wine types pooled); each feature gets its own figure.
for feature in numeric_data:
    sns.boxplot(data=df_train, x='quality', y=feature)
    plt.show()

#### INFERENCES wrt quality:
Comparison with the median is a better measure, as it isn't affected by outliers.
1. Fixed Acidity :     similar for all qualities
2. Volatile acidity :  lower the volatile acidity better the wine quality
3. Citric acid :       slightly on the higher side for good quality wine
4. Residual sugar
5. Chlorides :         lower the chlorides better the wine quality
6. Free/Total sulphur dioxide : Higher the sulphur dioxide content better the quality
7. pH/Density : similar values for all
8. Alcohol : higher alcohol content results in better quality

In [None]:
# Median of each numeric column per quality score.
# numeric_only=True skips the string 'type' column; without it pandas >= 2.0
# raises a TypeError instead of silently dropping non-numeric columns.
df_train.groupby('quality').median(numeric_only=True)

In [None]:
# Correlation b/w features (Spearman), shown as a heatmap.
# Restrict to numeric columns first: the string 'type' column cannot be
# correlated, and pandas >= 2.0 raises on it instead of silently dropping it.
# NOTE(review): this uses the full `df`, not the balanced `df_train` used in
# the other plots -- confirm that is intended.
plt.figure(figsize=(6, 6))
corrmat = df.select_dtypes(include='number').corr(method='spearman')
sns.heatmap(corrmat)

In [None]:
# There isn't any strong correlation between the features and the target variable.
# Alcohol shows a moderate correlation.
# Density, chlorides and volatile acidity show a negative correlation, which is consistent with the inferences made from the boxplots.