In [None]:
from IPython.display import Image
import pandas as pd
from pandas import Series,DataFrame


In [None]:
# Load the red-wine quality CSV into a DataFrame used by every later cell.
wineDF = pd.read_csv('../input/winequality-red.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
wineDF.head()

### Checking for missing data :

In [None]:
# Column dtypes and non-null counts; used here to check for missing values.
wineDF.info()

### There are no missing values and we can start with EDA.

# Part-I : Exploratory Data Analysis :

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# (rows, columns) of the dataset.
wineDF.shape

In [None]:
# Bar chart of how many wines received each quality score.
# `factorplot` was renamed to `catplot` and later removed from seaborn, and
# plot variables must be passed by keyword, so use the supported spelling.
sns.catplot(x='quality', data=wineDF, kind='count')

## Most of the wines in this dataset seem to have quality scores of 5 or 6 .

### Let's classify the wines based on quality even further to make classification easier:

#### So we add a new feature called "Reviews" which divides the wines into "Good" or "Bad" giving them a value of "0" if wine quality is <= 6 or "1" if wine quality is >6.

In [None]:
# Binary target: 1 ("Good") when quality > 6, otherwise 0 ("Bad").
# A vectorized comparison replaces the row-by-row Python loop.
reviews = (wineDF["quality"] > 6).astype(int)

wineDF["Reviews"] = reviews
        

In [None]:
# Count of "Bad" (0) vs "Good" (1) wines.
# Pass the column by keyword: in current seaborn the first positional
# argument of countplot is `data`, not `x`.
sns.countplot(x='Reviews', data=wineDF)

### Almost 1400 of the total number of wines seem to be "Bad" and the remaining 200 wines "Good"

In [None]:
# Preview again to confirm the new "Reviews" column was added.
wineDF.head()

In [None]:
# One histogram per column, drawn on a large shared figure.
wineDF.hist(color='red', figsize=(20, 20))
plt.show()

## Correlation Plot :

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
mycor = wineDF.corr()
fig, ax = plt.subplots(figsize=(12, 12))  # enlarged so the annotations stay readable
sns.heatmap(mycor, annot=True, ax=ax)

## Alcohol has the maximum correlation with quality followed by sulphates and citric acid and then fixed acidity


In [None]:
# Scatter of alcohol against quality, with marginal distributions.
sns.jointplot(data=wineDF, x='quality', y='alcohol', kind='scatter')

In [None]:
# Alcohol level per quality score, drawn as a point plot.
# `factorplot` was renamed to `catplot`; its old default was a point plot,
# so request kind='point' explicitly and pass the variables by keyword.
sns.catplot(x='quality', y='alcohol', data=wineDF, kind='point')

### The joint plot above shows the correlation between alcohol and quality, with a Pearson correlation coefficient of 0.48.

In [None]:
# Scatter of sulphates against quality, with marginal distributions.
sns.jointplot(data=wineDF, x='quality', y='sulphates', kind='scatter')

In [None]:
# Scatter of citric acid against quality, with marginal distributions.
sns.jointplot(data=wineDF, x='quality', y='citric acid', kind='scatter')

In [None]:
# Scatter of fixed acidity against quality, with marginal distributions.
sns.jointplot(data=wineDF, x='quality', y='fixed acidity', kind='scatter')

# Part-II : Machine Learning

# I have selected the following classification algorithms to classify the wines:
### - Random Forest Classifier
### - Support Vector Classifier

In [None]:
# Image("images/random forest.png")

In [None]:
# Image("images/svm.jpg")

# Applying Random Forest Classifier (using Scikit Learn) :- 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC

In [None]:
# Features are every column except the two targets; the first model
# predicts the raw multi-class "quality" score.
X = wineDF.drop(columns=["quality", "Reviews"])
y = wineDF["quality"]
X.head()

In [None]:
# Preview the target values (raw quality scores at this point).
y.head()

### Splitting the dataset into Test Data and Train Data :

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the training features before scaling.
X_train.head()

### Using Standard scaling on X :

In [None]:
# Scaler instance shared by the scaling cells that follow.
sc = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split. Refitting on the test data would
# scale the two splits differently and leak test-set information.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Random forest on the multi-class quality target. The seed makes the
# reported scores reproducible across Restart & Run All.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Predict quality scores for the held-out test split.
pred_rfc = rfc.predict(X_test)

In [None]:
# Display the predicted labels.
pred_rfc

In [None]:
# Per-class metrics for the random forest's test-set predictions.
print(classification_report(y_test, pred_rfc))

### We get around 60% accuracy when training the model with "quality" .

In [None]:
# 10-fold cross-validation of the forest on the training split; show the mean score.
rfc_eval = cross_val_score(rfc, X_train, y_train, cv=10)
rfc_eval.mean()

### Training the model again , but this time with "Reviews" instead of "Quality"

In [None]:
# Switch the target to the binary "Reviews" label (1 when quality > 6, else 0).
y = wineDF["Reviews"]

In [None]:
# Preview the binary target values.
y.head()

In [None]:
# Re-split with the binary target; same 80/20 ratio and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the training features before scaling.
X_train.head()

In [None]:
# Fit the scaler on the training split only and reuse those statistics for
# the test split. Calling fit_transform on the test data would refit the
# scaler on it, which is a form of data leakage.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Display the scaled training features.
X_train

In [None]:
# Random forest on the binary "Reviews" target. The seed makes the
# reported scores reproducible across Restart & Run All.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, y_train)


In [None]:
# Predict the binary label for the held-out test split.
pred_rfc = rfc.predict(X_test)

In [None]:
# Display the predicted labels.
pred_rfc

In [None]:
# Per-class metrics for the random forest on the binary target.
print(classification_report(y_test, pred_rfc))

### This time, we get 87-88% accuracy because of the binary nature of classification.

# Applying Support Vector Classifier :-

In [None]:
# Baseline support-vector classifier with default hyper-parameters.
# fit() returns the estimator itself, so construction and fitting can be chained.
svc = SVC().fit(X_train, y_train)
pred_svc = svc.predict(X_test)

In [None]:
# Per-class metrics for the baseline SVC.
print(classification_report(y_test, pred_svc))

### Support Vector Classifier gives 86% accuracy.


## So, Random Forest Algorithm gives slightly better results than SVC.

## Optimization of the model :

### Using Grid Search CV 


#### Step 1 : Finding the best parameters for our SVC model.

In [None]:

# Hyper-parameter grid for the SVC search.
# The linear kernel ignores `gamma`, so it gets its own sub-grid; crossing it
# with every gamma value would repeat identical fits eight times over.
c_values = [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4]
gamma_values = [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4]
param = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
]
grid_svc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10)

In [None]:
# Run the cross-validated grid search on the training split.
grid_svc.fit(X_train, y_train)


#### Step 2: Printing the best parameters

In [None]:
# Parameter combination with the best cross-validated accuracy.
grid_svc.best_params_

#### Now that we have the best parameters, let's run our SVC model using those parameters.

#### Step 3: Using the best parameters to run the SVC model

In [None]:


# Train an SVC with the parameters the grid search actually selected.
# Reading them from `best_params_` keeps this cell correct if the grid,
# data, or split changes, instead of relying on values copied by hand.
svc2 = SVC(**grid_svc.best_params_)
svc2.fit(X_train, y_train)
pred_svc2 = svc2.predict(X_test)
print(classification_report(y_test, pred_svc2))

### Using Grid Search CV, we have increased the accuracy of SVC from 86% to 90%.

## Using Grid Search CV for Random Forest :


In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Base forest whose hyper-parameters are overridden by the grid below.
# A fixed seed keeps the search result reproducible.
rfc = RandomForestClassifier(
    n_jobs=-1,
    max_features='sqrt',
    n_estimators=50,
    oob_score=True,
    random_state=42,
)

# 'auto' is omitted: for classifiers it was an alias of 'sqrt', and recent
# scikit-learn releases reject it.
param_grid = {
    'n_estimators': [200, 500, 700],
    'max_features': ['sqrt', 'log2'],
}

grid_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
# Search on the training split only, so the held-out test rows never
# influence the chosen hyper-parameters.
grid_rfc.fit(X_train, y_train)



In [None]:
# Parameter combination with the best cross-validated score.
grid_rfc.best_params_

#### Let's use the above best parameters to train our Random Forest Classifier again

In [None]:
# Rebuild the features and binary target, then re-split with the same seed.
X = wineDF.drop(["quality", "Reviews"], axis=1)
y = wineDF['Reviews']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# No scaling is applied here: the forest is trained on the raw feature values.

# Use the parameters chosen by the grid search rather than hand-copied values
# (the previous max_features='auto' is rejected by recent scikit-learn).
# The seed makes the reported score reproducible.
rfc = RandomForestClassifier(**grid_rfc.best_params_, random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Predict the binary label for the test split with the tuned forest.
pred_rfc = rfc.predict(X_test)

In [None]:
# Display the predicted labels.
pred_rfc

In [None]:
# Per-class metrics for the tuned random forest.
print(classification_report(y_test, pred_rfc))

### Applying Grid Search on Random Forest Algorithm improves the accuracy up to 90% .

In [None]:
# Image("images/Table.png")

## In Conclusion, RFC gives slightly better results when Un-optimized, but gives same results as SVC when both are optimized

# Thank You.