In [None]:
import matplotlib.pyplot as plt
import seaborn as sb
sb.set_style('darkgrid')

import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Load the red-wine quality CSV and preview its first rows.
csv_path = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

# **Checking null values**

In [None]:
# Per-column count of missing values.
print(data.isnull().sum())
# Visual check of the same null mask. Annotation is left off: writing a label into
# every cell of a rows x columns mask is slow and produces an unreadable figure.
sb.heatmap(data.isnull())

No null values are present in the data set.

# **Exploratory data Analysis**

In [None]:
# Summary statistics for every column of the frame.
data.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(20, 8))
correlations = data.corr()
sb.heatmap(correlations, annot=True, ax=ax)

*  Fixed acidity, citric acid, sulphates and alcohol have a strong positive correlation with wine quality
*  Volatile acidity, chlorides, total sulfur dioxide and density have a strong negative correlation with wine quality
*  Residual sugar, free sulfur dioxide and pH do not have a strong correlation with wine quality

**Quality Counts**

In [None]:
# Horizontal count plot of how many wines fall in each quality score.
sb.set_palette('pastel')
fig, ax = plt.subplots(figsize=(25, 7))
sb.countplot(y='quality', data=data, ax=ax)

* Wine of quality 5 has the maximum count of close to 670
* Wine of quality 6 has the second most count approximately 640
* Wine of quality 7 comes third with approximate count of 199
* Wine of quality 4 comes fourth with approximate count of 53
* Wine of quality 8 comes fifth with approximate count of 20
* Wine of quality 3 comes last with approximate count of 10

We can conclude from the above graph that as wine quality increases from 5 to 6, 7 and 8, the number of wines decreases, which shows that good-quality wine is hard to find.

In [None]:
# Pairwise scatter/density grid of all columns, coloured by quality score.
sb.pairplot(data=data, hue='quality')

**How does fixed acidity affect quality of wine?**

In [None]:
# Distribution of fixed acidity across all wines.
fig, ax = plt.subplots(figsize=(20, 7))
ax.hist(data['fixed acidity'], color='purple', edgecolor='black')
tick_positions = [4.6, 5.73, 6.86, 7.99, 9.12, 10.25, 11.38, 12.51, 13.64, 14.77, 15.9]
plt.xticks(tick_positions)

**The majority of wines have a fixed acidity between 5.73 and 10.25.
Let's see how it affects wine quality!**

In [None]:
# Fixed acidity histogram for the wines rated quality 3.
fig, ax = plt.subplots(figsize=(18, 9))
subset = data.loc[data['quality'] == 3, 'fixed acidity']
ax.hist([subset], color='tomato', stacked=True, edgecolor='black')
ax.set_yticks([0, 1, 2, 3])
ax.set_xticks([7.1, 7.5, 8.3, 10.4, 11.6])
ax.set_xlabel('fixed acidity')
ax.set_ylabel('Count')
ax.set_title('Fixed Acidity for wine of quality = 3 ')

**Average fixed acidity is 8.36 for wine of quality 3**

In [None]:
# Fixed acidity histogram for the wines rated quality 4.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 4, 'fixed acidity']
ax.hist([subset], color='firebrick', edgecolor='black')
ax.set_yticks([2, 4, 6, 8, 10, 12, 14])
ax.set_xticks([4.6, 5.39, 6.18, 6.97, 7.76, 8.55, 9.34, 10.13, 10.92, 11.71, 12.5])
ax.set_xlabel('fixed acidity')
ax.set_ylabel('Count')
ax.set_title("Fixed Acidity for wine of quality = 4")

**Average Fixed acidity concentration in wine of quality 4 is 7.77**

In [None]:
# Fixed acidity histogram for the wines rated quality 5.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 5, 'fixed acidity']
ax.hist([subset], color='darkcyan', edgecolor='black')
ax.set_xticks([5.0, 6.09, 7.18, 8.27, 9.36, 10.45, 11.54, 12.63, 13.72, 14.81, 15.9])
ax.set_xlabel('fixed acidity')
ax.set_ylabel('Count')
ax.set_title("Fixed Acidity for wine of quality = 5")

**Average Fixed acidity concentration in wine of quality 5 is 8.16**

In [None]:
# Fixed acidity histogram for the wines rated quality 6.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 6, 'fixed acidity']
ax.hist([subset], color='crimson', edgecolor='black')
ax.set_xticks([4.7, 5.66, 6.62, 7.58, 8.54, 9.5, 10.46, 11.42, 12.38, 13.34, 14.3])
ax.set_xlabel('fixed acidity')
ax.set_ylabel('Count')
ax.set_title("Fixed Acidity for wine of quality = 6")

**Average Fixed acidity concentration in wine of quality 6 is 8.34**

In [None]:
# Fixed acidity histogram for the wines rated quality 7.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 7, 'fixed acidity']
ax.hist([subset], color='goldenrod', edgecolor='black')
ax.set_xticks([4.9, 5.97, 7.04, 8.11, 9.18, 10.25, 11.32, 12.39, 13.46, 14.53, 15.6])
ax.set_xlabel('fixed acidity')
ax.set_ylabel('Count')
ax.set_title("Fixed Acidity for wine of quality = 7")

**Average fixed acidity concentration in wine of quality 7 is 8.87, the maximum across quality levels**

In [None]:
# Fixed acidity histogram for the wines rated quality 8.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 8, 'fixed acidity']
ax.hist([subset], color='skyblue', edgecolor='black')
ax.set_xticks([5.0, 5.76, 6.52, 7.28, 8.04, 8.8, 9.56, 10.32, 11.08, 11.84, 12.6])
ax.set_xlabel('fixed acidity')
ax.set_ylabel('Count')
ax.set_title("Fixed Acidity for wine of quality = 8")

**Average fixed acidity for wine of quality 8 is 8.566**

In [None]:
# Mean fixed acidity for each quality score, drawn as a line.
quality_levels = [3, 4, 5, 6, 7, 8]
mean_fixed_acidity = [
    data.loc[data['quality'] == level, 'fixed acidity'].mean()
    for level in quality_levels
]
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(quality_levels, mean_fixed_acidity, c='red', label='fixed acidity')
ax.legend()
ax.set_yticks([7.4, 7.6, 7.8, 8.0, 8.2, 8.4, 8.6, 8.8])
ax.set_xlabel('Wine Quality')
ax.set_ylabel('Fixed Acidity')
ax.set_title('Fixed acidity in wine various wine qualities')

**For wine of quality 3 the average fixed acidity is 8.36; it falls to 7.8 for wine of quality 4. It then increases up to quality 7, which has the maximum average fixed acidity of 8.87, and falls to 8.56 for wine of quality 8.**

In [None]:
# Regression of quality on fixed acidity. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='fixed acidity', y='quality', data=data, color='green')

**We can see that in the above regression plot as fixed acidity increases the wine quality usually increases.**

**Volatile Acidity and Wine Quality**

In [None]:
# Distribution of volatile acidity across all wines.
fig, ax = plt.subplots(figsize=(20, 8))
ax.hist(data['volatile acidity'], color='purple', edgecolor='black')
ax.set_xticks([0.12, 0.266, 0.412, 0.558, 0.704, 0.85, 0.996, 1.142, 1.288, 1.434, 1.58])
ax.set_xlabel('volatile acidity')
ax.set_ylabel('Count')
ax.set_title("Volatile Acidity")

In [None]:
# Volatile acidity histogram for the wines rated quality 3.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 3, 'volatile acidity']
ax.hist([subset], color='rebeccapurple', edgecolor='black')
ax.set_xticks([0.44, 0.554, 0.668, 0.782, 0.896, 1.01, 1.124, 1.238, 1.352, 1.466, 1.58])
ax.set_yticks([0, 1, 2])
ax.set_xlabel('volatile acidity')
ax.set_ylabel('Count')
ax.set_title("Volatile Acidity for wine of quality = 3")

**On average volatile acidity of wine of quality 3 is 0.88**

In [None]:
# Volatile acidity histogram for the wines rated quality 4.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 4, 'volatile acidity']
ax.hist([subset], color='seagreen', edgecolor='black')
ax.set_xticks([0.23, 0.32, 0.41, 0.5, 0.59, 0.68, 0.77, 0.86, 0.95, 1.04, 1.13])
ax.set_yticks([2, 4, 6, 8, 10, 12])
ax.set_xlabel('volatile acidity')
ax.set_ylabel('Count')
ax.set_title("Volatile Acidity for wine of quality = 4")

**On average volatile acidity of wine of quality 4 is 0.69**










In [None]:
# Volatile acidity histogram for the wines rated quality 5.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 5, 'volatile acidity']
ax.hist([subset], color='mediumvioletred', edgecolor='black')
ax.set_xticks([0.18, 0.295, 0.41, 0.525, 0.64, 0.755, 0.87, 0.985, 1.1, 1.215, 1.33])
ax.set_xlabel('volatile acidity')
ax.set_ylabel('Count')
ax.set_title("Volatile Acidity for wine of quality = 5")

**On average volatile acidity of wine of quality 5 is 0.577**

In [None]:
# Volatile acidity histogram for the wines rated quality 6.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 6, 'volatile acidity']
ax.hist([subset], color='steelblue', edgecolor='black')
ax.set_xticks([0.16, 0.248, 0.336, 0.424, 0.512, 0.6, 0.688, 0.776, 0.864, 0.952, 1.04])
ax.set_xlabel('volatile acidity')
ax.set_ylabel('Count')
ax.set_title("Volatile Acidity for wine of quality = 6")

**On average volatile acidity of wine of quality 6 is 0.49**

In [None]:
# Volatile acidity histogram for the wines rated quality 7.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 7, 'volatile acidity']
ax.hist([subset], color='royalblue', edgecolor='black')
ax.set_xticks([0.12, 0.1995, 0.279, 0.3585, 0.438, 0.5175, 0.597, 0.6765, 0.756, 0.8355, 0.915])
ax.set_xlabel('volatile acidity')
ax.set_ylabel('Count')
ax.set_title("Volatile Acidity for wine of quality = 7")

**On average volatile acidity of wine of quality 7 is 0.4033**

In [None]:
# Volatile acidity histogram for the wines rated quality 8.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 8, 'volatile acidity']
ax.hist([subset], color='darkorange', edgecolor='black')
ax.set_xticks([0.26, 0.319, 0.378, 0.437, 0.496, 0.555, 0.614, 0.673, 0.732, 0.791, 0.85])
ax.set_xlabel('volatile acidity')
ax.set_ylabel('Count')
ax.set_title("Volatile Acidity for wine of quality = 8")

**On average volatile acidity of wine of quality 8 is 0.4233**

In [None]:
# Mean volatile acidity for each quality score, drawn as a line.
quality_levels = [3, 4, 5, 6, 7, 8]
mean_volatile_acidity = [
    data.loc[data['quality'] == level, 'volatile acidity'].mean()
    for level in quality_levels
]
fig, ax = plt.subplots(figsize=(25, 8))
ax.plot(quality_levels, mean_volatile_acidity, c='darkblue', label='volatile acidity')
ax.set_xlabel('Wine Quality')
ax.set_ylabel('Volatile acidity')
ax.set_title('Volatile Acidity  wrt Quality')


In [None]:
# Regression of quality on volatile acidity. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='volatile acidity', y='quality', data=data, color='blueviolet')

In [None]:
plt.figure(figsize=(22, 9))
# Regression of fixed acidity on volatile acidity. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='volatile acidity', y='fixed acidity', data=data, color='magenta')

**From the graphs and regression plots above we can clearly see that as volatile acidity increases, wine quality decreases. Also, as fixed acidity increases, volatile acidity decreases, and vice versa. So as volatile acidity decreases, fixed acidity increases and in turn wine quality increases.**

**How does citric acid affect wine quality?**

In [None]:
# Distribution of citric acid across all wines.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(data['citric acid'], edgecolor='black', color='deepskyblue')

In [None]:
# Kernel density estimate of citric acid, with ticks every 0.1.
fig, ax = plt.subplots(figsize=(20, 10))
sb.kdeplot(data['citric acid'], color='orangered', ax=ax)
ax.set_xlabel('citric acid')
ax.set_ylabel('density')
plt.xticks([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2])

In [None]:
# Citric acid histogram for the wines rated quality 3.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 3, 'citric acid']
ax.hist([subset], color='forestgreen', edgecolor='black')
ax.set_xticks([0.0, 0.066, 0.132, 0.198, 0.264, 0.33, 0.396, 0.462, 0.528, 0.594, 0.66])
ax.set_xlabel('citric acid')
ax.set_ylabel('Count')
ax.set_title("Citric for wine of quality = 3")

* 70% of wines of quality 3 have a citric acid concentration between 0.000 and 0.066.
The average concentration is **0.1799**, but this is due to the presence of outliers.


In [None]:
# Citric acid histogram for the wines rated quality 4.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 4, 'citric acid']
ax.hist([subset], color='deeppink', edgecolor='black')
ax.set_xticks([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
ax.set_xlabel('citric acid')
ax.set_ylabel('Count')
ax.set_title("Citric for wine of quality = 4")

**Average citric acid concentration for wine of quality 4 is 0.1741; here outliers don't have a noticeable effect**

In [None]:
# Citric acid histogram for the wines rated quality 5.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 5, 'citric acid']
ax.hist([subset], color='slateblue', edgecolor='white')
ax.set_xticks([0.0, 0.079, 0.158, 0.237, 0.316, 0.395, 0.474, 0.553, 0.632, 0.711, 0.79])
ax.set_xlabel('citric acid')
ax.set_ylabel('Count')
ax.set_title("Citric for wine of quality = 5")

**Average Citric acid concentration for wine of quality 5 is 0.2436**

In [None]:
# Citric acid histogram for the wines rated quality 6.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 6, 'citric acid']
ax.hist([subset], color='navy', edgecolor='white')
ax.set_xticks([0.0, 0.078, 0.156, 0.234, 0.312, 0.39, 0.468, 0.546, 0.624, 0.702, 0.78])
ax.set_xlabel('citric acid')
ax.set_ylabel('Count')
ax.set_title("Citric for wine of quality = 6")

**Average Citric acid concentration for wine of quality 6 is 0.2738**

In [None]:
# Citric acid histogram for the wines rated quality 7.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 7, 'citric acid']
ax.hist([subset], color='darkred', edgecolor='white')
ax.set_xticks([0.0, 0.076, 0.152, 0.228, 0.304, 0.38, 0.456, 0.532, 0.608, 0.684, 0.76])
ax.set_xlabel('citric acid')
ax.set_ylabel('Count')
ax.set_title("Citric for wine of quality = 7")

**Average Citric acid concentration for wine of quality 7 is 0.375**

In [None]:
# Citric acid histogram for the wines rated quality 8.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 8, 'citric acid']
ax.hist([subset], color='darkred', edgecolor='white')
ax.set_xticks([0.03, 0.099, 0.168, 0.237, 0.306, 0.375, 0.444, 0.513, 0.582, 0.651, 0.72])
ax.set_xlabel('citric acid')
ax.set_ylabel('Count')
ax.set_title("Citric for wine of quality = 8")


**Average Citric acid concentration for wine of quality 8 is 0.3911**

In [None]:
# Mean citric acid for each quality score, drawn as a line.
quality_levels = [3, 4, 5, 6, 7, 8]
mean_citric_acid = [
    data.loc[data['quality'] == level, 'citric acid'].mean()
    for level in quality_levels
]
fig, ax = plt.subplots(figsize=(25, 8))
ax.plot(quality_levels, mean_citric_acid, c='darkmagenta', label='citric acid')
ax.set_xlabel('wine quality')
ax.set_ylabel('citric acid average')
ax.legend()

**As Citric acid quantity increases wine quality also increases**

In [None]:
plt.figure(figsize=(25, 8))
# Regression of volatile acidity on citric acid. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='citric acid', y='volatile acidity', data=data, color='salmon')

**With increase in citric acid concentration volatile acidity decreases thus wine quality increases**

In [None]:
plt.figure(figsize=(25, 8))
# Regression of fixed acidity on citric acid. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='citric acid', y='fixed acidity', data=data, color='yellow')

**As citric acid concentration increases fixed acid concentration increases thus wine quality also increases**

**How does chlorides affect wine quality?**

In [None]:
# Bar plot of chlorides per quality score, using the 'dark' palette.
sb.set_palette('dark')
fig, ax = plt.subplots(figsize=(25, 9))
sb.barplot(data=data, x='quality', y='chlorides', ax=ax)

In [None]:
plt.figure(figsize=(20, 8))
# Regression of chlorides on quality. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='quality', y='chlorides', data=data)

**As chloride concentration increases quality decreases**

**How does free and total sulphur oxide affect total wine quality?**

Free sulfur dioxide has a very small correlation with wine quality but a high correlation with total sulfur dioxide, which has a very high correlation with wine quality.

In [None]:
plt.figure(figsize=(20, 10))
# Regression of total on free sulfur dioxide. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='free sulfur dioxide', y='total sulfur dioxide', data=data)


**As free sulfur dioxide levels increase total sulfur dioxide also increases**

In [None]:
# Kernel density estimate of total sulfur dioxide.
fig, ax = plt.subplots(figsize=(20, 9))
sb.kdeplot(data['total sulfur dioxide'], color='greenyellow', ax=ax)
ax.set_title('kernel density estimate for total sulfur dioxide')

In [None]:
# Total sulfur dioxide histogram for the wines rated quality 3.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 3, 'total sulfur dioxide']
ax.hist([subset], color='mediumaquamarine', edgecolor='black')
ax.set_xticks([9.0, 13.0, 17.0, 21.0, 25.0, 29.0, 33.0, 37.0, 41.0, 45.0, 49.0])
ax.set_xlabel('total sulfur dioxide')
ax.set_ylabel('Count')
ax.set_title("total sufur dioxide for wine of qualitiy 3")

**average sulfur dioxide concentration is 24.9 for wine of quality 3**

In [None]:
# Total sulfur dioxide histogram for the wines rated quality 4.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 4, 'total sulfur dioxide']
ax.hist([subset], color='mediumaquamarine', edgecolor='black')
ax.set_xticks([7.0, 18.2, 29.4, 40.6, 51.8, 63.0, 74.2, 85.4, 96.6, 107.8, 119.0])
ax.set_xlabel('total sulfur dioxide')
ax.set_ylabel('Count')
ax.set_title("total sufur dioxide for wine of qualitiy 4")

**average sulfur dioxide concentration is 36.24 for wine of quality 4**

In [None]:
# Total sulfur dioxide histogram for the wines rated quality 5.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 5, 'total sulfur dioxide']
ax.hist([subset], color='lavenderblush', edgecolor='red')
ax.set_xticks([6.0, 20.9, 35.8, 50.7, 65.6, 80.5, 95.4, 110.3, 125.2, 140.1, 155.0])
ax.set_xlabel('total sulfur dioxide')
ax.set_ylabel('Count')
ax.set_title("total sufur dioxide for wine of qualitiy 5")

**average sulfur dioxide concentration is 56.51 for wine of quality 5**

In [None]:
# Total sulfur dioxide histogram for the wines rated quality 6.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 6, 'total sulfur dioxide']
ax.hist([subset], color='ghostwhite', edgecolor='skyblue')
ax.set_xticks([6.0, 21.9, 37.8, 53.7, 69.6, 85.5, 101.4, 117.3, 133.2, 149.1, 165.0])
ax.set_xlabel('total sulfur dioxide')
ax.set_ylabel('Count')
ax.set_title("total sufur dioxide for wine of qualitiy 6")

**average sulfur dioxide concentration is 40.86 for wine of quality 6**

In [None]:
# Total sulfur dioxide histogram for the wines rated quality 7.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 7, 'total sulfur dioxide']
ax.hist([subset], color='honeydew', edgecolor='lawngreen')
ax.set_xticks([7.0, 35.2, 63.4, 91.6, 119.8, 148.0, 176.2, 204.4, 232.6, 260.8, 289.0])
ax.set_xlabel('total sulfur dioxide')
ax.set_ylabel('Count')
ax.set_title("total sufur dioxide for wine of qualitiy 7")

**average sulfur dioxide concentration is 35.02 for wine of quality 7**

In [None]:
# Total sulfur dioxide histogram for the wines rated quality 8.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 8, 'total sulfur dioxide']
ax.hist([subset], color='mintcream', edgecolor='springgreen')
ax.set_xticks([12.0, 19.6, 27.2, 34.8, 42.4, 50.0, 57.6, 65.2, 72.8, 80.4, 88.0])
ax.set_xlabel('total sulfur dioxide')
ax.set_ylabel('Count')
ax.set_title("total sufur dioxide for wine of qualitiy 8")

**average sulfur dioxide concentration is 33.44 for wine of quality 8**

In [None]:
# Mean total sulfur dioxide for each quality score, drawn as a line.
quality_levels = [3, 4, 5, 6, 7, 8]
mean_total_so2 = [
    data.loc[data['quality'] == level, 'total sulfur dioxide'].mean()
    for level in quality_levels
]
fig, ax = plt.subplots(figsize=(25, 9))
ax.plot(quality_levels, mean_total_so2, c='crimson', label='average total sulfur oxide concentration')
ax.set_xlabel('wine quality')
ax.set_ylabel('average of total sulfur dioxide')
ax.legend()
ax.set_title('Total sulfur dioxide and wine quality')

**From the above graph, average total sulfur dioxide rises from quality 3 to quality 5, but beyond quality 5 it decreases as wine quality increases**

In [None]:
plt.figure(figsize=(25, 9))
# Regression of fixed acidity on total sulfur dioxide. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='total sulfur dioxide', y='fixed acidity', data=data, color='indigo')

**As total sulfur dioxide level decreases fixed acidity increases thus wine quality increases**

**How Density affects wine quality!**

In [None]:
# Kernel density estimate of wine density.
fig, ax = plt.subplots(figsize=(25, 9))
sb.kdeplot(data['density'], color='yellow', ax=ax)

In [None]:
# Density histogram for the wines rated quality 3.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 3, 'density']
ax.hist([subset], color='lightgrey', edgecolor='darkgrey')
ax.set_xticks([0.99471, 0.995319, 0.995928, 0.996537, 0.997146, 0.997755, 0.998364, 0.998973, 0.999582, 1.000191, 1.0008])
ax.set_xlabel('density')
ax.set_ylabel('Count')
ax.set_title("density for wine of qualitiy 3")

**Average Density of wine of quality 3 is 0.997**

In [None]:
# Density histogram for the wines rated quality 4.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 4, 'density']
ax.hist([subset], color='orange', edgecolor='darkgrey')
ax.set_xticks([0.9934, 0.99416, 0.99492, 0.99568, 0.99644, 0.9972, 0.99796, 0.99872, 0.99948, 1.00024, 1.001])
ax.set_xlabel('density')
ax.set_ylabel('Count')
ax.set_title("density for wine of qualitiy 4")

**average density of wine of quality 4 is 0.996**

In [None]:
# Density histogram for the wines rated quality 5.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 5, 'density']
ax.hist([subset], color='orange', edgecolor='darkgrey')
ax.set_xticks([0.99256, 0.993619, 0.994678, 0.995737, 0.996796, 0.997855, 0.998914, 0.999973, 1.001032, 1.002091, 1.00315])
ax.set_xlabel('density')
ax.set_ylabel('Count')
ax.set_title("density for wine of qualitiy 5")

**average density of wine of quality 5 is 0.997**

In [None]:
# Density histogram for the wines rated quality 6.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 6, 'density']
ax.hist([subset], color='azure', edgecolor='red')
ax.set_xticks([0.99007, 0.991432, 0.992794, 0.994156, 0.995518, 0.99688, 0.998242, 0.999604, 1.000966, 1.002328, 1.00369])
ax.set_xlabel('density')
ax.set_ylabel('Count')
ax.set_title("density for wine of qualitiy 6")

**The average density of wine of quality 6 is 0.996**

In [None]:
# Density histogram for the wines rated quality 7.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 7, 'density']
ax.hist([subset], color='thistle', edgecolor='violet')
ax.set_xticks([0.99064, 0.991896, 0.993152, 0.994408, 0.995664, 0.99692, 0.998176, 0.999432, 1.000688, 1.001944, 1.0032])
ax.set_xlabel('density')
ax.set_ylabel('Count')
ax.set_title("density for wine of qualitiy 7")

**Average density of wine of quality 7 is 0.996**

In [None]:
# Density histogram for the wines rated quality 8.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 8, 'density']
ax.hist([subset], color='thistle', edgecolor='violet')
ax.set_xticks([0.9908, 0.9916, 0.9924, 0.9932, 0.994, 0.9948, 0.9956, 0.9964, 0.9972, 0.998, 0.9988])
ax.set_xlabel('density')
ax.set_ylabel('Count')
ax.set_title("density for wine of qualitiy 8")

**Average density of wine of quality 8 is 0.995**

In [None]:
# Mean density for each quality score, drawn as a line.
quality_levels = [3, 4, 5, 6, 7, 8]
mean_density = [
    data.loc[data['quality'] == level, 'density'].mean()
    for level in quality_levels
]
fig, ax = plt.subplots(figsize=(25, 9))
ax.plot(quality_levels, mean_density, c='dodgerblue')
ax.set_xlabel('quality')
ax.set_ylabel('density')
ax.set_title('Density vs Quality')

**From the above figure, density decreases from wine of quality 3 to quality 4 but increases at quality 5; this might be because quality 5 has more samples. After that, density keeps decreasing as quality increases.**

In [None]:
plt.figure(figsize=(22, 10))
# Regression of quality on density. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='density', y='quality', data=data, color='skyblue')

In [None]:
plt.figure(figsize=(25, 8))
# Regression of chlorides on density. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='density', y='chlorides', data=data, color='mediumslateblue')

**As chlorides increase, density increases**

In [None]:
plt.figure(figsize=(25, 9))
# Regression of citric acid on density. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='density', y='citric acid', data=data, color='lightcoral')

**As density increases citric acid concentration increases thus wine quality increases**

In [None]:
plt.figure(figsize=(25, 9))
# Regression of fixed acidity on density. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='density', y='fixed acidity', data=data, color='hotpink')

**As density increases fixed acidity increases thus wine quality increases**

**Residual sugar doesn't have a significant correlation with wine quality, but it does with some other parameters, which should be analyzed**

In [None]:
# Kernel density estimate of residual sugar.
fig, ax = plt.subplots(figsize=(25, 9))
sb.kdeplot(data['residual sugar'], color='rebeccapurple', ax=ax)

In [None]:
plt.figure(figsize=(25, 9))
# Regression of density on residual sugar. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='residual sugar', y='density', data=data, color='cornflowerblue')

**As residual sugar increases, density increases, thus wine quality decreases**

In [None]:
plt.figure(figsize=(25, 9))
# Regression of total sulfur dioxide on residual sugar. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='residual sugar', y='total sulfur dioxide', data=data, color='yellow')

**As residual sugar increases, total sulfur dioxide concentration increases, thus quality decreases**

In [None]:
plt.figure(figsize=(25, 9))
# Regression of fixed acidity on residual sugar. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='residual sugar', y='fixed acidity', data=data, color='cornflowerblue')

**as residual sugar increases fixed acidity increases thus wine quality increases**

**Residual sugar raises some parameters that lower wine quality and others that raise it; these effects balance out, so residual sugar doesn't have a significant impact on wine quality**

**What does the pH of wine indicate about its quality?**

In [None]:
# Distribution of pH across all wines.
fig, ax = plt.subplots(figsize=(25, 9))
ax.hist(data['pH'], edgecolor='blue', color='palevioletred')
ax.set_xlabel('pH')
ax.set_ylabel('count')

In [None]:
# Bar plot of pH per quality score.
fig, ax = plt.subplots(figsize=(25, 9))
sb.barplot(data=data, x='quality', y='pH', ax=ax)

**pH doesn't have a direct effect on wine quality**

In [None]:
plt.figure(figsize=(25, 9))
# Regression of fixed acidity on pH. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='pH', y='fixed acidity', data=data, color='lime')
plt.title('pH and fixed acidity')

**As pH increases, fixed acidity decreases (the wine becomes less acidic), thus wine quality decreases.**

In [None]:
plt.figure(figsize=(25, 9))
# Regression of volatile acidity on pH. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='pH', y='volatile acidity', data=data, color='tomato')
plt.title('pH and volatile acidity')

As pH increases, volatile acidity increases, thus wine quality decreases

**Sulphate and wine quality**

In [None]:
# Distribution of sulphates across all wines.
fig, ax = plt.subplots(figsize=(25, 9))
ax.hist(data['sulphates'], edgecolor='midnightblue', color='lavender')

In [None]:
# Bar plot of sulphates per quality score.
fig, ax = plt.subplots(figsize=(20, 10))
sb.barplot(data=data, x='quality', y='sulphates', ax=ax)

In [None]:
plt.figure(figsize=(25, 10))
# Regression of quality on sulphates. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='sulphates', y='quality', data=data, color='lime')

**With increase in sulphates wine quality also increases**

**How does alcohol concentration affect wine quality?**

In [None]:
# Distribution of alcohol across all wines.
fig, ax = plt.subplots(figsize=(25, 9))
ax.hist(data['alcohol'], color='skyblue')

In [None]:
# Kernel density estimate of alcohol.
fig, ax = plt.subplots(figsize=(25, 10))
sb.kdeplot(data['alcohol'], color='skyblue', ax=ax)

In [None]:
# Alcohol histogram for the wines rated quality 3.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 3, 'alcohol']
ax.hist([subset], color='skyblue', edgecolor='azure')
ax.set_xticks([8.4, 8.66, 8.92, 9.18, 9.44, 9.7, 9.96, 10.22, 10.48, 10.74, 11.0])
ax.set_xlabel('alcohol')
ax.set_ylabel('Count')
ax.set_title("alcohol in wine of qualitiy 3")

**Average alcohol concentration in wine of quality 3 is 9.995**

In [None]:
# Alcohol histogram for the wines rated quality 4.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 4, 'alcohol']
ax.hist([subset], color='snow', edgecolor='azure')
ax.set_xticks([9.0, 9.41, 9.82, 10.23, 10.64, 11.05, 11.46, 11.87, 12.28, 12.69, 13.1])
ax.set_xlabel('alcohol')
ax.set_ylabel('Count')
ax.set_title("alcohol in wine of qualitiy 4")

**Average alcohol concentration in wine of quality 4 is 10.265**

In [None]:
# Alcohol histogram for the wines rated quality 5.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 5, 'alcohol']
ax.hist([subset], color='hotpink', edgecolor='lavenderblush')
ax.set_xticks([8.5, 9.14, 9.78, 10.42, 11.06, 11.7, 12.34, 12.98, 13.62, 14.26, 14.9])
ax.set_xlabel('alcohol')
ax.set_ylabel('Count')
ax.set_title("alcohol in wine of qualitiy 5")

**Average alcohol concentration in wine of quality 5 is 9.8997**

In [None]:
# Alcohol histogram for the wines rated quality 6.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 6, 'alcohol']
ax.hist([subset], color='orangered', edgecolor='springgreen')
ax.set_xticks([8.4, 8.96, 9.52, 10.08, 10.64, 11.2, 11.76, 12.32, 12.88, 13.44, 14.0])
ax.set_xlabel('alcohol')
ax.set_ylabel('Count')
ax.set_title("alcohol in wine of qualitiy 6")

**Average alcohol concentration in wine of quality 6 is 10.6295**

In [None]:
# Alcohol histogram for the wines rated quality 7.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 7, 'alcohol']
ax.hist([subset], color='gainsboro', edgecolor='springgreen')
ax.set_xticks([9.2, 9.68, 10.16, 10.64, 11.12, 11.6, 12.08, 12.56, 13.04, 13.52, 14.0])
ax.set_xlabel('alcohol')
ax.set_ylabel('Count')
ax.set_title("alcohol in wine of qualitiy 7")

**Average alcohol concentration in wine of quality 7 is 11.46**

In [None]:
# Alcohol histogram for the wines rated quality 8.
fig, ax = plt.subplots(figsize=(20, 8))
subset = data.loc[data['quality'] == 8, 'alcohol']
ax.hist([subset], color='magenta', edgecolor='dodgerblue')
ax.set_xticks([9.8, 10.22, 10.64, 11.06, 11.48, 11.9, 12.32, 12.74, 13.16, 13.58, 14.0])
ax.set_xlabel('alcohol')
ax.set_ylabel('Count')
ax.set_title("alcohol in wine of qualitiy 8")

**Average alcohol concentration in wine of quality 8 is 12.09**

In [None]:
# Mean alcohol for each quality score, drawn as a line.
quality_levels = [3, 4, 5, 6, 7, 8]
mean_alcohol = [
    data.loc[data['quality'] == level, 'alcohol'].mean()
    for level in quality_levels
]
fig, ax = plt.subplots(figsize=(25, 10))
ax.plot(quality_levels, mean_alcohol, color='mediumspringgreen', label='average alcohol')
ax.set_xlabel('wine quality')
ax.set_ylabel('average alcohol concentration')
ax.legend()
ax.set_title('average alcohol in wine')

In [None]:
plt.figure(figsize=(25, 10))
# Regression of quality on alcohol. x/y are named explicitly because
# seaborn >= 0.12 accepts only `data` positionally and rejects the positional form.
sb.regplot(x='alcohol', y='quality', data=data, color='blueviolet')

**With increase in alcohol level wine quality increases**

# Data Pre-Processing

In [None]:
# Collapse the quality score into two classes: (2, 5] -> 'bad', (5, 8] -> 'good'.
bins = (2, 5, 8)
labels = ['bad', 'good']
# Bin only while the column still holds the raw numeric scores. Re-running this cell
# on the already-binned column would otherwise fail, and on the 0/1-encoded column it
# would turn every label into NaN.
if pd.api.types.is_numeric_dtype(data['quality']) and data['quality'].max() > bins[0]:
    data['quality'] = pd.cut(data['quality'], bins=bins, labels=labels)
data.head()

In [None]:
from sklearn.preprocessing import LabelBinarizer
# Encode the quality labels numerically and write them back into the column.
# NOTE(review): presumably 'bad' -> 0 and 'good' -> 1; the final prediction cell
# treats 1 as good quality - confirm against lb.classes_.
lb = LabelBinarizer()
data['quality'] = lb.fit_transform(data['quality'])

In [None]:
plt.figure(figsize=(20, 10))
# Class balance of the encoded target. The column is selected with x=/data= because
# seaborn >= 0.12 treats a lone positional argument as `data`, not as the x variable.
sb.countplot(x='quality', data=data)

In [None]:
# Preview the encoded frame; head() keeps the cell output short.
data.head()

# Model Selection

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,cross_val_predict
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Preview the frame that feeds the models; head() keeps the cell output short.
data.head()


In [None]:
# Split the frame into the eleven physico-chemical predictors and the target.
feature_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
x = data[feature_columns]
y = data['quality']

In [None]:
# Hold out a quarter of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.25,
)

Logistic Regression

In [None]:
# Logistic regression (C=90, up to 2000 iterations) fitted on the training split.
clf1 = LogisticRegression(max_iter=2000, C=90)
clf1.fit(X_train, y_train)

In [None]:
# 4-fold cross-validated predictions and metrics on the training split.
y_train_pred = cross_val_predict(clf1, X_train, y_train, cv=4)
print("Confusion Matrix: \n", confusion_matrix(y_train, y_train_pred))
print('Precision Score:', precision_score(y_train, y_train_pred))
print("Recall Score:", recall_score(y_train, y_train_pred))
print("Accuracy Score:", accuracy_score(y_train, y_train_pred))
print("Cross Val Score Insample", cross_val_score(clf1, X_train, y_train, cv=4, scoring='accuracy').mean())
# Out-of-sample score: the model fitted on the training split is scored on the held-out
# test split. Cross-validating on X_test would train fresh models on test data instead.
print("Cross Val Score Outsample", accuracy_score(y_test, clf1.predict(X_test)))

KNN

In [None]:
# Tune the neighbour count with 5-fold CV. The search sees only the training split so
# that the held-out test rows do not influence the chosen hyper-parameter.
param_grid = {'n_neighbors': np.arange(1, 10)}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the KNN grid search.
grid.best_params_

In [None]:
# Keep the best KNN estimator found by the grid search.
clf2 = grid.best_estimator_

In [None]:
# Fit the selected KNN model on the training split.
clf2.fit(X_train,y_train)

In [None]:
# 4-fold cross-validated predictions and metrics on the training split.
y_train_pred = cross_val_predict(clf2, X_train, y_train, cv=4)
print("Confusion Matrix: \n", confusion_matrix(y_train, y_train_pred))
print('Precision Score:', precision_score(y_train, y_train_pred))
print("Recall Score:", recall_score(y_train, y_train_pred))
print("Accuracy Score:", accuracy_score(y_train, y_train_pred))
print("Cross Val Score Insample", cross_val_score(clf2, X_train, y_train, cv=4, scoring='accuracy').mean())
# Out-of-sample score: the model fitted on the training split is scored on the held-out
# test split. Cross-validating on X_test would train fresh models on test data instead.
print("Cross Val Score Outsample", accuracy_score(y_test, clf2.predict(X_test)))

**Gaussian Naive Bayes**

In [None]:
# Gaussian naive Bayes with default settings, fitted on the training split.
clf3 = GaussianNB()
clf3.fit(X=X_train, y=y_train)

In [None]:
# 4-fold cross-validated predictions and metrics on the training split.
y_train_pred = cross_val_predict(clf3, X_train, y_train, cv=4)
print("Confusion Matrix: \n", confusion_matrix(y_train, y_train_pred))
print('Precision Score:', precision_score(y_train, y_train_pred))
print("Recall Score:", recall_score(y_train, y_train_pred))
print("Accuracy Score:", accuracy_score(y_train, y_train_pred))
print("Cross Val Score Insample", cross_val_score(clf3, X_train, y_train, cv=4, scoring='accuracy').mean())
# Out-of-sample score: the model fitted on the training split is scored on the held-out
# test split. Cross-validating on X_test would train fresh models on test data instead.
print("Cross Val Score Outsample", accuracy_score(y_test, clf3.predict(X_test)))

**DecisionTreeClassifier**

In [None]:
# Tune tree depth and leaf size with 3-fold CV on the training split only, so the
# held-out test rows do not influence the chosen hyper-parameters.
# The seed is fixed on the estimator instead of being searched: picking the "best"
# random_state only fits noise and multiplies the number of fits by the seed count.
param_grid = {'max_depth': np.arange(1, 10), 'min_samples_leaf': np.arange(1, 5)}
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=3)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the decision-tree grid search.
grid.best_params_

In [None]:
# Take the best tree found by the grid search and fit it on the training split.
clf4 = grid.best_estimator_
clf4.fit(X_train,y_train)

In [None]:
# 4-fold cross-validated predictions and metrics on the training split.
y_train_pred = cross_val_predict(clf4, X_train, y_train, cv=4)
print("Confusion Matrix: \n", confusion_matrix(y_train, y_train_pred))
print('Precision Score:', precision_score(y_train, y_train_pred))
print("Recall Score:", recall_score(y_train, y_train_pred))
print("Accuracy Score:", accuracy_score(y_train, y_train_pred))
print("Cross Val Score Insample", cross_val_score(clf4, X_train, y_train, cv=4, scoring='accuracy').mean())
# Out-of-sample score: the model fitted on the training split is scored on the held-out
# test split. Cross-validating on X_test would train fresh models on test data instead.
print("Cross Val Score Outsample", accuracy_score(y_test, clf4.predict(X_test)))

**RandomForestClassifier**

In [None]:
# Tune forest depth and leaf size with 3-fold CV on the training split only, so the
# held-out test rows do not influence the chosen hyper-parameters.
# A fixed seed makes the comparison between parameter combinations repeatable.
param_grid = {'max_depth': np.arange(1, 10), 'min_samples_leaf': np.arange(1, 5)}
grid = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=42), param_grid, cv=3)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the random-forest grid search.
grid.best_params_

In [None]:
# Random forest with hand-set depth and leaf/split sizes. The seed is fixed so the
# bootstrap samples, and therefore the reported scores, are the same on every run.
clf5 = RandomForestClassifier(
    max_depth=7,
    min_samples_leaf=3,
    min_samples_split=3,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit the random forest on the training split.
clf5.fit(X_train,y_train)

In [None]:
# 4-fold cross-validated predictions and metrics on the training split.
y_train_pred = cross_val_predict(clf5, X_train, y_train, cv=4)
print("Confusion Matrix: \n", confusion_matrix(y_train, y_train_pred))
print('Precision Score:', precision_score(y_train, y_train_pred))
print("Recall Score:", recall_score(y_train, y_train_pred))
print("Accuracy Score:", accuracy_score(y_train, y_train_pred))
print("Cross Val Score Insample", cross_val_score(clf5, X_train, y_train, cv=4, scoring='accuracy').mean())
# Out-of-sample score: the model fitted on the training split is scored on the held-out
# test split. Cross-validating on X_test would train fresh models on test data instead.
print("Cross Val Score Outsample", accuracy_score(y_test, clf5.predict(X_test)))

**BaggingClassifier**

In [None]:
# Bagging over depth-4 decision trees, fitted on the training split.
# The base tree is passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2, and the positional form works on both sides of
# that rename.
base_tree = DecisionTreeClassifier(max_depth=4, random_state=1, min_samples_leaf=1)
clf6 = BaggingClassifier(base_tree, bootstrap=True)
clf6.fit(X_train, y_train)

In [None]:
# 4-fold cross-validated predictions and metrics on the training split.
y_train_pred = cross_val_predict(clf6, X_train, y_train, cv=4)
print("Confusion Matrix: \n", confusion_matrix(y_train, y_train_pred))
print('Precision Score:', precision_score(y_train, y_train_pred))
print("Recall Score:", recall_score(y_train, y_train_pred))
print("Accuracy Score:", accuracy_score(y_train, y_train_pred))
print("Cross Val Score Insample", cross_val_score(clf6, X_train, y_train, cv=4, scoring='accuracy').mean())
# Out-of-sample score: the model fitted on the training split is scored on the held-out
# test split. Cross-validating on X_test would train fresh models on test data instead.
print("Cross Val Score Outsample", accuracy_score(y_test, clf6.predict(X_test)))

**AdaBoostClassifier**

In [None]:
# AdaBoost over depth-4 decision trees, fitted on the training split.
weak_learner = DecisionTreeClassifier(max_depth=4, random_state=1)
clf7 = AdaBoostClassifier(weak_learner)
clf7.fit(X_train, y_train)

In [None]:
# 4-fold cross-validated predictions and metrics on the training split.
y_train_pred = cross_val_predict(clf7, X_train, y_train, cv=4)
print("Confusion Matrix: \n", confusion_matrix(y_train, y_train_pred))
print('Precision Score:', precision_score(y_train, y_train_pred))
print("Recall Score:", recall_score(y_train, y_train_pred))
print("Accuracy Score:", accuracy_score(y_train, y_train_pred))
print("Cross Val Score Insample", cross_val_score(clf7, X_train, y_train, cv=4, scoring='accuracy').mean())
# Out-of-sample score: the model fitted on the training split is scored on the held-out
# test split. Cross-validating on X_test would train fresh models on test data instead.
print("Cross Val Score Outsample", accuracy_score(y_test, clf7.predict(X_test)))

# **Model Development and Deployment**

In [None]:
# Final model, trained on every labelled row.
# The feature frame is selected before fitting so the cell does not depend on whichever
# `x` an earlier cell happened to leave behind.
x = data[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']]

# The seed is fixed so that retraining reproduces the same forest.
classifier = RandomForestClassifier(
    max_depth=7,
    min_samples_leaf=3,
    min_samples_split=3,
    n_jobs=-1,
    random_state=42,
)
classifier.fit(x, y)

In [None]:
# Preview the frame the final model was trained on; head() keeps the cell output short.
data.head()

In [None]:
# One hand-entered sample, labelled with the training feature columns. The classifier
# was fitted on a DataFrame, so passing a bare list would make scikit-learn warn about
# missing feature names and silently rely on positional order.
wine = pd.DataFrame(
    [[7.62, 0.885, 0.43, 2.3, 0.0032, 23, 56, 0.9867, 3.56, 0.77, 10.99]],
    columns=x.columns,
)
# predict returns one label per row; take the single label rather than testing the
# truth value of the whole array.
if classifier.predict(wine)[0] == 1:
    print("Good quality wine")
else:
    print("bad quality wine")