# Exploratory Data Analysis (EDA) :
EDA is an approach for summarizing, visualizing, and becoming intimately familiar with the important characteristics of a data set. It is performed in order to define and refine the selection of feature variables that will be used for machine learning.

Exploratory Data Analysis is mainly performed using the following methods:

   ## Univariate analysis:
    provides summary statistics for each field in the raw data set, i.e. a summary of one variable at a time. Ex: CDF, PDF, box plot, violin plot (don't worry, we will see below what each of them is).
    
   ## Bivariate analysis:
    is performed to find the relationship between each variable in the dataset and the target variable of interest, i.e. it takes two variables and looks for the relationship between them. Ex: box plot, violin plot.
    
   ## Multivariate analysis:
    is performed to understand interactions between different fields in the dataset, i.e. interactions among more than two variables. Ex: pair plot and 3D scatter plot.



In [None]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
%matplotlib inline

In [None]:
# Load the wine-quality CSV into a DataFrame.
data = pd.read_csv("../input/wine-quality/winequalityN.csv")
# If the CSV carries an unnamed index column, give it a short name
# (a no-op when no such column exists).
data = data.rename(columns={"Unnamed: 0": "a"})
#data.drop(["a"], axis=1, inplace=True)
# Preview the first rows.
data.head()

# Univariate Analysis

In [None]:
# Overview of the dataset: summary statistics (count, mean, std, min,
# quartiles, max) as reported by DataFrame.describe().
data.describe()

In [None]:
# Dataset size as a (number of rows, number of columns) tuple.
data.shape

In [None]:
# Number of rows for each distinct value of the "quality" column.
data["quality"].value_counts()

In [None]:
# Distributions (histogram with a KDE overlay) of the three acidity-related
# features, drawn side by side.
# `histplot` replaces `distplot`, which seaborn has deprecated and scheduled
# for removal. stat="density" keeps the y-axis as a density, which is how
# distplot drew the histogram whenever a KDE was overlaid.
fig, axs = plt.subplots(ncols=3, figsize=(13, 4))
for ax, feature in zip(axs, ["fixed acidity", "volatile acidity", "citric acid"]):
    sns.histplot(data[feature], ax=ax, kde=True, stat="density")

In [None]:
# Distributions (histogram with a KDE overlay) of free sulfur dioxide and
# chlorides, drawn side by side.
# `histplot` replaces `distplot`, which seaborn has deprecated and scheduled
# for removal. stat="density" keeps the y-axis as a density, which is how
# distplot drew the histogram whenever a KDE was overlaid.
fig, axs = plt.subplots(ncols=2, figsize=(13, 4))
for ax, feature in zip(axs, ["free sulfur dioxide", "chlorides"]):
    sns.histplot(data[feature], ax=ax, kde=True, stat="density")

In [None]:
# Columns analysed throughout the notebook (eleven measurements plus the
# quality score).
cols = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]

# One 25-bin histogram per column, each shown as its own figure.
for column_name in cols:
    hist_ax = data[column_name].hist(bins=25)
    hist_ax.set_xlabel(column_name)
    hist_ax.set_ylabel("Count")
    hist_ax.set_title(column_name)
    plt.show()

# Bivariate Analysis

In [None]:
# pH against quality: a box plot for the quartiles, with the individual
# observations overlaid as a swarm.
plt.figure(figsize=(20, 10))
sns.boxplot(orient='v', data=data, y="pH", x="quality")
sns.swarmplot(data=data, y="pH", x="quality", color=".25")
plt.show()

# From here on `cols` holds only the predictor columns. The list is rebuilt
# instead of calling cols.remove('quality') so that this cell can be re-run:
# remove() raises ValueError on the second run because 'quality' is already gone.
cols = [name for name in cols if name != 'quality']

# One box plot per predictor, grouped by quality score.
for col in cols:
    plt.figure(figsize=(20, 10))
    sns.boxplot(orient='v', data=data, y=col, x="quality")
    plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(20, 20))
# Select numeric columns explicitly: since pandas 2.0, DataFrame.corr() raises
# on a non-numeric column (e.g. a text label, if the CSV has one) instead of
# silently skipping it.
numeric_data = data.select_dtypes(include="number")
sns.heatmap(numeric_data.corr(), annot=True, fmt='.1g', vmin=-1, vmax=1, center=0, cmap='coolwarm')

From the above heatmap it is observed that fixed acidity and density are closely correlated. Similarly, free sulfur dioxide and total sulfur dioxide are also closely correlated.

## My observations :

* 'citric.acid', 'density' and 'pH' are strongly correlated with 'fixed.acidity'.

* 'free.sulfur.dioxide' and 'total.sulfur.dioxide' also seem to be twins. In this case let us keep 'total.sulfur.dioxide'

* 'citric.acid' appears to be slightly correlated with 'volatile.acidity'. This requires more analysis to arrive at a conclusion.

In [None]:
# Pairwise scatter plots of the variables, with a KDE of each variable on the
# diagonal.
sns.set()
# `height` is the current name of the per-facet size argument. The old `size`
# keyword was renamed in seaborn 0.9 and is rejected by current releases.
sns.pairplot(data, height=2, kind='scatter', diag_kind='kde')
plt.show()

In [None]:
# Scatter plots with a fitted regression line for selected variable pairs,
# laid out on a 3x2 grid (filled row by row).
regression_pairs = [
    ("fixed acidity", "density"),
    ("fixed acidity", "pH"),
    ("fixed acidity", "citric acid"),
    ("volatile acidity", "citric acid"),
    ("free sulfur dioxide", "total sulfur dioxide"),
    ("alcohol", "quality"),
]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 10))
for axis, (x_name, y_name) in zip(axes.flat, regression_pairs):
    # Small markers (s=2) keep the dense point cloud readable.
    sns.regplot(x=x_name, y=y_name, data=data, ax=axis, scatter_kws={'s': 2})

# Multivariate Analysis

In [None]:
# Standardise the selected columns (zero mean, unit standard deviation) so that
# variables measured on different scales can share one y-axis.
selected = data[cols]
normalised_df = (selected - selected.mean()) / selected.std()
# Quality stays on its original scale: it is the x-axis grouping variable.
normalised_df['quality'] = data['quality']

# Each figure overlays the per-quality point plots of a few variables.
# An entry is (column, colour, legend label).
figure_specs = [
    [
        ('fixed acidity', 'blue', 'fixed acidity'),
        ('citric acid', 'red', 'citric acid'),
        ('volatile acidity', 'green', 'volatile acidity'),
    ],
    [
        ('density', 'blue', 'density'),
        ('alcohol', 'red', 'alcohol'),
    ],
    [
        ('free sulfur dioxide', 'blue', 'free.sulfur.dioxide'),
        ('total sulfur dioxide', 'red', 'total.sulfur.dioxide'),
    ],
    [
        ('fixed acidity', 'blue', 'fixed acidity'),
        ('pH', 'red', 'pH'),
        ('density', 'green', 'density'),
    ],
]

for spec in figure_specs:
    plt.figure(figsize=(15, 9))
    legend_handles = []
    for column, colour, legend_label in spec:
        sns.pointplot(x=normalised_df['quality'], y=normalised_df[column], color=colour, label=column)
        # The legend is built from explicit colour patches, one per variable.
        legend_handles.append(mpatches.Patch(color=colour, label=legend_label))
    plt.legend(handles=legend_handles, loc='upper left', fontsize=16)
    plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

# scale() and PCA need a purely numeric matrix without missing values, so keep
# only the numeric columns and drop any row that contains a NaN. Passing the
# whole frame fails as soon as it holds a text column or a missing value.
numeric_data = data.select_dtypes(include="number").dropna()

# The frame is transposed so that each variable becomes one row, i.e. the PCA
# positions the variables (not the individual rows of `data`) in component space.
# NOTE(review): scale() therefore standardises per original row, across
# variables measured in different units - confirm this is the intended scaling.
scaled_data = preprocessing.scale(numeric_data.T)
pca = PCA()
# Fit the components and project the scaled data onto them in one step.
pca_data = pca.fit_transform(scaled_data)


In [None]:
# Scree plot: percentage of the total variance explained by each principal
# component, rounded to one decimal place.
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
component_numbers = range(1, len(per_var) + 1)
labels = [f"PC{number}" for number in component_numbers]

plt.bar(x=component_numbers, height=per_var, tick_label=labels)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.show()

In [None]:
# Row labels for the PCA result, one per transformed row.
# NOTE(review): this rebinds `cols` to dotted names, so earlier cells that index
# `data` with `cols` will not work if re-run afterwards.
cols = [
    'fixed.acidity',
    'volatile.acidity',
    'citric.acid',
    'residual.sugar',
    'chlorides',
    'free.sulfur.dioxide',
    'total.sulfur.dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]

# Projected coordinates: one row per label, one column per principal component.
pca_df = pd.DataFrame(pca_data, index=cols, columns=labels)
#plt.figure(figsize=(10, 30))

# Scatter of the first two components; the axis labels carry their share of
# the explained variance.
plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title('My PCA Graph')
plt.xlabel(f'PC1 - {per_var[0]}%')
plt.ylabel(f'PC2 - {per_var[1]}%')

# Write each row's label next to its point.
for name, pc1, pc2 in zip(pca_df.index, pca_df.PC1, pca_df.PC2):
    plt.annotate(name, (pc1, pc2))

plt.show()