In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline


In [None]:
# Read the red-wine quality CSV into a DataFrame and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer='../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)
df.head()

## Exploratory Data Analysis and Visualization

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Histogram of the quality column to see how the scores are distributed.
df['quality'].plot.hist()

In [None]:
# Box plot of pH grouped by quality score.
sns.boxplot(data=df, x='quality', y='pH')

In [None]:
# Pairwise correlation of the DataFrame's columns, drawn as an annotated heatmap
# on a wide (30 x 10) figure.
plt.figure(figsize=(30, 10))
correlation = df.corr()
sns.heatmap(data=correlation, annot=True)

In [None]:
# Single-column heatmap: every column's correlation with quality, sorted in
# descending order so the strongest positive correlations come first.
plt.figure(figsize=(5, 20))
quality_corr = correlation[['quality']].sort_values(by=['quality'], ascending=False)
sns.heatmap(data=quality_corr, annot=True)

In [None]:
# Scatter plot with a fitted regression line: volatile acidity versus quality.
sns.regplot(data=df, x='quality', y='volatile acidity')

In [None]:
# Scatter plot with a fitted regression line: alcohol versus quality.
sns.regplot(data=df, x='quality', y='alcohol')

In [None]:
# Scatter plot with a fitted regression line: pH versus quality.
sns.regplot(data=df, x='quality', y='pH')

In [None]:
# Scatter plot with a fitted regression line: sulphates versus quality.
sns.regplot(data=df, x='quality', y='sulphates')

In [None]:
# Scatter plot with a fitted regression line: total sulfur dioxide versus quality.
sns.regplot(data=df, x='quality', y='total sulfur dioxide')

In [None]:
# Scatter plot with a fitted regression line: citric acid versus quality.
sns.regplot(data=df, x='quality', y='citric acid')

__These visualizations and the correlation maps suggest that 'alcohol', 'volatile acidity', 'sulphates' and 'total sulfur dioxide' are the features most strongly related to wine quality.__

Dividing quality into 2 categories, good and bad:
1. Good: quality > 6.5
2. Bad: quality ≤ 6.5

In [None]:
# Collapse the quality score into two labelled ranges:
#   (2, 6.5] -> 'bad'   and   (6.5, 8] -> 'good'.
# NOTE: the numeric 'quality' column is overwritten in place, so this cell
# should not be re-run without reloading `df` first.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
df['quality'] = pd.cut(x=df['quality'], bins=bins, labels=group_names)
df.head()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'bad'/'good' categories with integer codes. LabelEncoder orders
# its classes by sorted value, so 'bad' becomes 0 and 'good' becomes 1.
# The fitted encoder is kept in `label`.
label = LabelEncoder()
label.fit(df['quality'])
df['quality'] = label.transform(df['quality'])

In [None]:
# Bar chart of how many wines fall into each quality class.
# The column and frame are passed by keyword: in recent seaborn releases the
# first positional argument is `data`, so passing the Series positionally no
# longer plots per-class counts.
sns.countplot(x='quality', data=df)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Target vector: the quality column.
y = df.loc[:, 'quality']
y.head()

In [None]:
# Feature matrix: the eleven predictor columns (everything except the quality target).
feature_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]
x = df[feature_columns]

# Only the last expression of a cell is rendered, so the preview is shown
# explicitly with display() (provided by IPython in notebook sessions) instead
# of being silently discarded; the shape remains the cell's output.
display(x.head())
x.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    random_state=2,
    test_size=0.2,
)

## Decision Tree Classifier

In [None]:
# Decision tree that picks splits using the entropy criterion.
# random_state pins the estimator's internal randomness (feature permutation
# and tie-breaking between equally good splits) so the fitted tree, and the
# accuracy reported for it, is identical on every Restart & Run All.
# The value 2 matches the seed already used for the train/test split.
Ctree = DecisionTreeClassifier(criterion='entropy', random_state=2)
Ctree.fit(train_x, train_y)

In [None]:
# Predict the held-out rows, then print the first five predictions followed by
# the first five true labels for a quick visual comparison.
predtree = Ctree.predict(test_x)
print(predtree[0:5])
print(test_y.iloc[0:5])

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows the decision tree classified correctly.
tree_accuracy = accuracy_score(y_true=test_y, y_pred=predtree)
print("Decision tree accuracy: {}".format(tree_accuracy))

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 bootstrapped trees; each split considers sqrt(n_features)
# candidate features.
# random_state fixes the bootstrap samples and per-split feature draws so the
# fitted forest, and the accuracy reported for it, is reproducible across runs.
# The value 2 matches the seed already used for the train/test split.
model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    max_features='sqrt',
    random_state=2,
)
model.fit(train_x, train_y)

In [None]:
# Hard class predictions for the held-out rows, plus the predicted probability
# of the second class (column 1 of predict_proba).
# NOTE(review): column 1 is presumably the 'good' class given the 0/1 label
# encoding; confirm against model.classes_.
pred_forest = model.predict(test_x)
forest_proba = model.predict_proba(test_x)
pred_prob = forest_proba[:, 1]

In [None]:
# Fraction of held-out rows the random forest classified correctly.
forest_accuracy = accuracy_score(y_true=test_y, y_pred=pred_forest)
print("Random Forest accuracy: {}".format(forest_accuracy))

## Support Vector Classifier

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Support vector machines are not scale invariant, and the feature columns are
# passed in on their raw scales. Standardising inside a pipeline fits the
# scaler on the training rows only and re-applies it automatically whenever
# `svc.predict` is called, so cells that use `svc` need no changes.
svc = make_pipeline(StandardScaler(), SVC())
svc.fit(train_x, train_y)

In [None]:
# Predict the held-out rows with the support vector classifier and report the
# fraction classified correctly.
pred_svc = svc.predict(test_x)
svc_accuracy = accuracy_score(y_true=test_y, y_pred=pred_svc)
print("SVM accuracy: {}".format(svc_accuracy))