# Objectives: 

#### The objectives of this project are as follows:

#### 1. To experiment with different classification methods to see which yields the highest accuracy
#### 2. To determine which features are the most indicative for predicting the quality of  wine

#   

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

sns.set_style('darkgrid')

#  

# Data Collection

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory and preview its first rows.
raw_data = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
raw_data.head()

#  

# Data description

In [None]:
# Work on a copy so raw_data keeps the values exactly as loaded.
data = raw_data.copy()
# Print column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the columns of the working frame.
data.describe()

In [None]:
# Column labels of the working frame as an array.
data.columns.values

#   

# Data Cleaning

### 1. Checking missing values

In [None]:
# Number of missing values in each column.
data.isnull().sum()

So there are no missing values

### 2.  Outlier Detection

In [None]:
# Reshape to long form (one row per column/value pair) so that a single
# boxplot call draws every column side by side.
long_form = pd.melt(data)
plt.figure(figsize=(25, 15))
sns.boxplot(x='variable', y='value', data=long_form)
plt.show()

#### So there are some outliers in the 'total sulfur dioxide' column

In [None]:
# Distribution of total sulfur dioxide before any rows are filtered out.
total_so2 = data['total sulfur dioxide']
plt.figure(figsize=(8, 6))
sns.distplot(total_so2)
plt.show()

#  

### Removing outliers from data :

In [None]:
# Keep only the rows whose total sulfur dioxide is strictly below 180,
# then re-plot the distribution of the filtered column.
below_cutoff = data['total sulfur dioxide'] < 180
data = data[below_cutoff]

plt.figure(figsize=(8, 6))
sns.distplot(data['total sulfur dioxide'])
plt.show()

#### Checking the data again with a boxplot

In [None]:
# Same long-form boxplot as before, now drawn from the filtered frame.
long_form = pd.melt(data)
plt.figure(figsize=(15, 15))
sns.boxplot(x='variable', y='value', data=long_form)
plt.show()

### Checking the distribution of quality

In [None]:
# Distribution of the target column.
quality_scores = data['quality']
plt.figure(figsize=(8, 6))
sns.distplot(quality_scores)
plt.show()

#  

# Data Exploration

#### Let's first check which features are correlated with each other using a correlation heatmap

In [None]:
# Pairwise correlation of all columns, annotated with the coefficient values.
corr_matrix = data.corr()
plt.figure(figsize=(12, 12))
sns.heatmap(corr_matrix, annot=True, cmap='Blues')
plt.show()

#### Density and citric acid are highly correlated with fixed acidity. Likewise, total sulfur dioxide and free sulfur dioxide are highly correlated with each other.

This is multicollinearity. It makes the parameter estimates of a regression unstable, which makes it very difficult to assess the effect of each independent variable on the dependent variable.

We will use the variance inflation factor (VIF) to identify which of these variables is highly collinear with the others.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for the four features singled out from the heatmap.
# NOTE(review): no intercept column is added to the design matrix before
# calling variance_inflation_factor - confirm that this is intended.
variables = data[['density', 'citric acid', 'total sulfur dioxide', 'free sulfur dioxide']]
design = variables.values
vif = pd.DataFrame({
    'VIF': [variance_inflation_factor(design, col) for col in range(variables.shape[1])],
    'Features': variables.columns,
})
vif

'free sulfur dioxide' has a VIF above 5 and needs to be dropped

In [None]:
# Remove the 'free sulfur dioxide' column and display the reduced frame.
data = data.drop(columns=['free sulfur dioxide'])
data

##  

Let's check the Variance inflation factor for the remaining 3 variables after dropping the 'free sulfur dioxide' column.

In [None]:
# Recompute the variance inflation factor for the three remaining features.
variables = data[['density', 'citric acid', 'total sulfur dioxide']]
design = variables.values
vif = pd.DataFrame({
    'VIF': [variance_inflation_factor(design, col) for col in range(variables.shape[1])],
    'Features': variables.columns,
})
vif

#### As the VIFs of these variables are all below 5, there is no multicollinearity left among them.

#  


Now, let's check which features are most important for our quality predictions. For this we will use Extra Tree classifier,

In [None]:
from sklearn.ensemble import ExtraTreesClassifier 

# Rank every remaining feature by the importance an extra-trees ensemble
# assigns to it when predicting the raw quality score.
X = data.drop('quality', axis = 1)
Y = data['quality']

# Seed the ensemble: the importance ranking drives the hard-coded column drop
# in the next cell, so it must not change from one run to the next.
model = ExtraTreesClassifier(random_state = 42)
model.fit(X,Y)

features = pd.DataFrame()
features['Features'] = X.columns
features['Importance'] = model.feature_importances_

# Bars are ordered from the most to the least important feature.
plt.figure(figsize =(15,6))
sns.barplot(y='Importance', x='Features', data=features,  order=features.sort_values('Importance',ascending = False).Features)
plt.xlabel("Features", size=15)
plt.ylabel("Importance", size=15)
plt.title("Features Importance(Descending order)", size=18)
plt.tight_layout()
plt.show()

#### We can observe that, apart from the pH column, the remaining features account for about 95% of the total importance for wine quality. So we are going to use these 9 features for our models.

In [None]:
# Build the modelling frame without the 'pH' column and show its dimensions.
data_new = data.drop(columns=['pH'])
data_new.shape

#### Since the features are measured in different units, we need to standardize the values. For that, we use StandardScaler.
##### Standard scaler scales the values with mean = 0 and standard deviation = 1.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every predictor column (everything except the target).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down - confirm that this is acceptable for the reported scores.
x = data_new.drop(columns=['quality'])
scaler = StandardScaler()
x_scaled = scaler.fit(x).transform(x)


In [None]:
# Dimensions of the standardized feature matrix.
x_scaled.shape

Let's observe the distribution of wine quality in the data 

In [None]:
# Two views of the quality score side by side: its distribution (left) and
# the number of rows per score (right).
plt.figure(figsize =(16,5))
plt.subplot(1,2,1)
sns.distplot(data['quality'])
plt.subplot(1,2,2)
# Pass the column through the `x` keyword: in current seaborn releases the
# first positional parameter of countplot is `data`, not `x`.
sns.countplot(x=data['quality'])#Showing the frequency of occurence of a particular quality rating
plt.show()

#### Now we will categorize the ratings into 3 categories , 'Bad', 'Normal' , 'Good'.

In [None]:
def quality_to_category(score):
    """Map a quality score to its category label.

    Scores 1-3 map to 'Bad', 4-6 to 'Normal' and 7-10 to 'Good'.

    Raises:
        ValueError: if the score falls in none of those ranges. Failing here
            names the offending value instead of silently producing a list
            that is shorter than the frame it is assigned to.
    """
    if 1 <= score <= 3:
        return 'Bad'
    if 4 <= score <= 6:
        return 'Normal'
    if 7 <= score <= 10:
        return 'Good'
    raise ValueError('quality score outside the expected ranges: {!r}'.format(score))


# One label per row, in row order. The loop variable is deliberately not
# called `x`, so the feature frame `x` built for scaling is left intact.
category = [quality_to_category(score) for score in data['quality']]

data_new['category'] = category #Assigning a new column
data_new.head()

In [None]:
# Final modelling frame: a separate copy of data_new without the numeric
# 'quality' column, so only the categorical label remains as the target.
data_final = data_new.copy().drop(columns=['quality'])
data_final.head()

In [None]:
# Number of rows in each of the category labels.
data_final['category'].value_counts() #Checking the number of ratings in each category

##  

# Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the standardized feature matrix; targets are the category labels.
inputs, targets = x_scaled, data_final['category']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Dimensions of the training features and training labels.
x_train.shape, y_train.shape

In [None]:
# Dimensions of the held-out features and held-out labels.
x_test.shape, y_test.shape

#  

# Models

#### Since this is a classification problem, we are mainly going to use:

#### 1. Logistic Regression

#### 2. Decision Tree Classifier

#### 3. Support Vector Classifier

#### 4. Random Forest Classifier

#### 5. K-Nearest Neighbours


#  

In [None]:
#Defining a method or function that will print the cross validation score and accuracy for each model

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

def model_report(cl):
    """Fit a classifier and print its cross-validation and hold-out accuracy.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``.

    Parameters
    ----------
    cl : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place on
        the training split.

    Returns
    -------
    None
        Both scores are printed as percentages rounded to two decimals.
    """
    cl.fit(x_train, y_train)

    # Mean accuracy over a 5-fold cross validation of the training split.
    cv_scores = cross_val_score(cl, x_train, y_train, cv=5)
    print('Cross Val Score: ', round(float(cv_scores.mean()) * 100, 2))

    y_pred = cl.predict(x_test)

    # accuracy_score is documented to return a float, and a plain Python float
    # has no .round() method, so use the built-in round() for both scores.
    test_accuracy = float(accuracy_score(y_test, y_pred))
    print('Accuracy Score: ', round(test_accuracy * 100, 2))

#  

## 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with the library's default settings.
lr = LogisticRegression()

model_report(lr)

#  

## 2. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: decision tree with default settings (no random_state is set here).
dt = DecisionTreeClassifier()

model_report(dt)

#  

## 3. Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# Baseline: support vector classifier with default settings.
svc = SVC()

model_report(svc)

#  

## 4. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: random forest with default settings (no random_state is set here).
rf = RandomForestClassifier()

model_report(rf)

##  

## 5. K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest neighbours, letting the library choose the search algorithm.
kn = KNeighborsClassifier(algorithm ='auto')

model_report(kn)

#  

## Hyper Parameter Tuning 

#### Lets try to tune our models and see if we can improve accuracy. For this we will use GridSearchCV 

Grid search is the process of performing hyper parameter tuning in order to determine the optimal values for a given model. This is significant as the performance of the entire model is based on the hyper parameter values specified.

In [None]:
from sklearn.model_selection import GridSearchCV

#Defining a function that will calculate the best parameters and accuracy of the model based on those parameters
#Using GridSearchCV

def grid_search(classifier,parameters):
    """Run a 5-fold, accuracy-scored grid search and print the winner.

    The search is fitted on the notebook-level ``x_train`` / ``y_train``.

    Parameters
    ----------
    classifier : estimator
        Estimator handed to GridSearchCV.
    parameters : dict or list of dict
        Parameter grid handed to GridSearchCV as ``param_grid``.

    Returns
    -------
    None
        The best parameter combination and its mean cross-validated accuracy
        (as a percentage rounded to two decimals) are printed.
    """
    search_options = {
        'estimator': classifier,
        'param_grid': parameters,
        'scoring': 'accuracy',
        'cv': 5,
        'n_jobs': -1,  # use every available core
    }
    grid = GridSearchCV(**search_options)
    grid.fit(x_train, y_train)

    # Best parameter combination found by the search.
    print('Best parameters: ', grid.best_params_)

    # Mean cross-validated accuracy of that combination, as a percentage.
    best_accuracy_pct = (grid.best_score_ * 100).round(2)
    print("Accuracy: ", best_accuracy_pct)

#  

## 1. Support Vector Classifier

In [None]:
# Search space for the support vector classifier, given as one sub-grid per
# kernel. gamma is a kernel coefficient that the linear kernel does not use,
# so it is only varied for 'rbf'; crossing it with 'linear' would just repeat
# identical fits.
param_svc = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': [0.0001, 0.001, 0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
    },
]
svc = SVC()

grid_search(svc,param_svc)

In [None]:
# Refit the SVC with a fixed parameter combination (typed in by hand, not read
# from the grid-search object) and report its scores.
svc = SVC(C = 10, gamma = 0.3, kernel='rbf')

model_report(svc)

##### So, the accuracy of our Support Vector Classifier model increased from 89.38% to 90.62% 

#  

## 2. Random Forest 

In [None]:
# Search space for the random forest: number of trees crossed with the
# minimum number of samples allowed in a leaf.
param_rf = dict(
    n_estimators=[10, 50, 100, 500, 1000],
    min_samples_leaf=[1, 10, 20, 50],
)
rf = RandomForestClassifier(random_state=0)
grid_search(rf, param_rf)

In [None]:
# Refit the random forest with a fixed parameter combination (typed in by
# hand, not read from the grid-search object) and report its scores.
rf = RandomForestClassifier(n_estimators = 1000, min_samples_leaf = 1,random_state = 0)
model_report(rf)

##### So, the accuracy of our Random Forest Classifier model increased from 91.25% to 92.19%

#  

## 3. K-Nearest Neighbors

In [None]:
# Search space for k-nearest neighbours: k from 5 to 9 crossed with the
# distance power parameter p in {1, 2}.
n_neighbors = [k for k in range(5, 10)]
param_knn = {'n_neighbors': n_neighbors, 'p': [1, 2]}

knn = KNeighborsClassifier(algorithm='auto', n_jobs=-1)
grid_search(knn, param_knn)

In [None]:
# Refit k-nearest neighbours with a fixed parameter combination (typed in by
# hand, not read from the grid-search object) and report its scores.
knn = KNeighborsClassifier(n_neighbors = 7, p = 2, algorithm ='auto', n_jobs = -1)
model_report(knn)

##### It looks like hyperparameter tuning didn't change the accuracy of K-Nearest Neighbors

#  

#### Lastly we will use some boosting algorithms mainly :
#### 1. AdaBoost
#### 2. Gradient Boost
#### 3. XGBoost

#  

## 1. AdaBoost

In [None]:
from sklearn.ensemble import  AdaBoostClassifier

# AdaBoost with default settings and a fixed seed.
ab = AdaBoostClassifier(random_state = 42)

model_report(ab)

##### So, AdaBoost isn't a good model for this dataset

#  

## 2. Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with a fixed seed and a learning rate of 0.2.
gb = GradientBoostingClassifier(random_state = 42, learning_rate = 0.2)

model_report(gb)

##### Gradient boosting gives considerably better accuracy than AdaBoost.

#  

## 3. XGBoost

In [None]:
from xgboost import XGBClassifier

# XGBoost with a fixed seed and a learning rate of 0.2.
# NOTE(review): y_train holds string labels ('Bad'/'Normal'/'Good'); some
# xgboost releases require integer-encoded classes - confirm against the
# installed version.
xg = XGBClassifier(random_state = 42, learning_rate = 0.2)

model_report(xg)

### So , comparing all the models , Random Forest(92.19% accuracy) and XGBoost(92.81%) seems to give the highest accuracy.


##    

Finally , let's see which features contributed most in each of these 2 models.

## Random Forest:

In [None]:
# Pair each predictor column with the importance the fitted random forest
# assigned to it, then plot the pairs from most to least important.
x_rf = data_final.drop(columns=['category'])
features_rf = pd.DataFrame({
    'Features': x_rf.columns,
    'Importance': rf.feature_importances_,
})
importance_order = features_rf.sort_values('Importance', ascending=False).Features

plt.figure(figsize=(15, 6))
sns.barplot(x='Features', y='Importance', data=features_rf, order=importance_order)
plt.xlabel("Features", size=15)
plt.ylabel("Importance", size=15)
plt.title("Features Importance(Descending order) for Random Forest", size=18)
plt.tight_layout()

##  

## XGBoost:

In [None]:
# Pair each predictor column with the importance the fitted XGBoost model
# assigned to it, then plot the pairs from most to least important.
x_xg = data_final.drop(columns=['category'])
features_xg = pd.DataFrame({
    'Features': x_xg.columns,
    'Importance': xg.feature_importances_,
})
importance_order = features_xg.sort_values('Importance', ascending=False).Features

plt.figure(figsize=(15, 6))
sns.barplot(x='Features', y='Importance', data=features_xg, order=importance_order)
plt.xlabel("Features", size=15)
plt.ylabel("Importance", size=15)
plt.title("Features Importance(Descending order) for XGBoost", size=18)
plt.tight_layout()

#  

# Conclusion:

##### 1. For determining the quality of wines, alcohol plays a significant role, followed by sulphates and volatile acidity.
##### 2. For predicting quality of wines, we can either use Random Forest or XGBoost model. However,  XGBoost has a slightly better accuracy(0.62% more accurate) over Random Forest.

#  

#### If you find this notebook useful, please upvote it! And let me know in the comments if i did anything wrong or if i could've done this better. Means a lot! :)