In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

First we will import the data and look at the first few rows and some information about the data.

In [None]:
# Read the red-wine CSV, report its (rows, columns) shape and preview the first rows.
DATA_PATH = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
data = pd.read_csv(DATA_PATH)
print(data.shape)
data.head()

In [None]:
# Column dtypes and non-null counts; used to check for missing values.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()

So, first we will try to gain some insights from the data. 
Here are a few of the insights I gained from this dataset:
1. There are 1599 data points across 12 different columns.
2. There are no missing values. 
3. The column we have to predict, 'quality' has minimum value 3 and maximum value 8. 
We will find out more information about the data with Exploratory Data Analysis

In [None]:
# List the column names available for the analysis below.
data.columns

# EXPLORATORY DATA ANALYSIS

Exploratory Data Analysis on this dataset becomes a bit difficult because all columns contain continuous data except our target column, which is quality.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

Plotting Every Variable against each other

In [None]:
# Pairwise plots of every column against every other column.
sns.pairplot(data)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
# Correlations are signed values in [-1, 1], so a diverging colormap centred
# on 0 is used: colour intensity then reflects strength and hue reflects sign.
# (A qualitative palette such as 'Dark2' maps nearby values to unrelated colours.)
plt.figure(figsize=(16, 10))
sns.heatmap(
    data.corr(),
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    linewidths=1.0,
    linecolor='black',
)

In [None]:
# Correlation of every column with 'quality', from most positive to most negative.
data.corr()['quality'].sort_values(ascending=False)

In [None]:
# Same ranking by magnitude only: strongest relationship with 'quality' first,
# regardless of sign.
data.corr()['quality'].abs().sort_values(ascending=False)

If we look at the heatmap of the correlations closely, we observe :-
1. alcohol has the highest correlation with quality.
2. volatile acidity has the highest negative correlation with quality.
3. sulphates, citric acid are the next highly correlated columns with quality.

First, we will look at the column we have to predict, which is quality.

In [None]:
# Number of wines per quality score. The column is passed by keyword because
# newer seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='quality', data=data)

As said before, the quality column consists of classes from 3 to 8. Most of the wine is of quality 5 and 6. 

Now, we will look at the fixed acidity column. 

In [None]:
# Distribution of the fixed acidity values.
plt.hist(data['fixed acidity'])

In [None]:
# Fixed acidity distribution for each quality score.
# x/y are passed by keyword: seaborn no longer accepts them positionally.
sns.boxplot(x='quality', y='fixed acidity', data=data)

In [None]:
# Scatter plot of quality against fixed acidity with a fitted regression line.
sns.regplot(x="fixed acidity", y="quality", data=data)

From the histogram, the boxplot and the difference between the mean and median, we can say that this data is slightly skewed. 
Now, looking at the above regression plot, we can infer that there is a slight trend, which isn't very clear but, as the fixed acidity increases, the quality slightly increases.

Now we will look at volatile acidity

In [None]:
# Distribution of volatile acidity, using 30 bins.
data['volatile acidity'].hist(bins = 30)

In [None]:
# Volatile acidity distribution for each quality score.
# x/y are passed by keyword: seaborn no longer accepts them positionally.
sns.boxplot(x='quality', y='volatile acidity', data=data)

In [None]:
# Summary statistics for volatile acidity (the 50% row is the median).
data["volatile acidity"].describe()

In [None]:
# Scatter plot of quality against volatile acidity with a fitted regression line.
sns.regplot(x="volatile acidity", y="quality", data=data)

The mean and the median are quite close. The boxplot shows the presence of some outliers, but the statistical summary suggests that those outliers can be left in the data. Here, the trend between quality and volatile acidity is much more pronounced, and this is backed up by the correlation between the two columns, which is -0.390558. The correlation is negative but clearly present: as the volatile acidity increases, quality tends to decrease.

In [None]:
# Distribution of the citric acid values.
plt.hist(data['citric acid'])

In [None]:
# Citric acid distribution for each quality score.
# x/y are passed by keyword: seaborn no longer accepts them positionally.
sns.boxplot(x='quality', y='citric acid', data=data)

In [None]:
# Summary statistics for citric acid (the 50% row is the median).
data['citric acid'].describe()

In [None]:
# Scatter plot of quality against citric acid with a fitted regression line.
sns.regplot(x="citric acid", y="quality", data=data)

The mean and the median are quite close, so there aren't really outliers which could affect the predictions. There is, again, an upward trend, but the trend isn't very clear.

Now, we will look at the alcohol column. It has the highest correlation

In [None]:
# Distribution of the alcohol values.
plt.hist(data['alcohol'])

In [None]:
# Alcohol distribution for each quality score.
# x/y are passed by keyword: seaborn no longer accepts them positionally.
sns.boxplot(x='quality', y='alcohol', data=data)

In [None]:
# Scatter plot of quality against alcohol with a fitted regression line.
sns.regplot(x="alcohol", y="quality", data=data)

In [None]:
# One panel per quality score, each showing a 20-bin histogram of alcohol.
g = sns.FacetGrid(data, col='quality')
g.map(plt.hist, 'alcohol', bins=20)

Looking at the above regression plot, we can infer that there is a clear trend: as the alcohol increases, the quality increases.

We will look at the sulphates column. 

In [None]:
# Distribution of the sulphates values.
plt.hist(data['sulphates'])

In [None]:
# Sulphates distribution for each quality score.
# x/y are passed by keyword: seaborn no longer accepts them positionally.
sns.boxplot(x='quality', y='sulphates', data=data)

In [None]:
# Scatter plot of quality against sulphates with a fitted regression line.
sns.regplot(x="sulphates", y="quality", data=data)

Again, there is a slight trend for sulphates and quality

In [None]:
# One panel per quality score, each showing a 20-bin histogram of sulphates.
g = sns.FacetGrid(data, col='quality')
g.map(plt.hist, 'sulphates', bins=20)

Looking at the correlations, we have now examined all the columns with the higher correlations with quality. We could do more Exploratory Data Analysis, but it would not be of much importance, so next we will move forward to modelling.

# MODEL CREATION

First we will import all the necessary libraries and functions

In [None]:
from sklearn.linear_model import LogisticRegression   
from sklearn.model_selection import KFold 
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB

The following code is a classification model creator: it prints the model's accuracy on the training data and its cross-validation score, and returns the trained model.

In [None]:
def classification_model(model, data, predictors, outcome, n_splits=5, random_state=None):
    """Cross-validate ``model``, fit it on all of ``data`` and print both scores.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``.
    data : pandas.DataFrame
        Frame holding both the predictor columns and the outcome column.
    predictors : list of str
        Column names used as features.
    outcome : str
        Name of the target column.
    n_splits : int, default 5
        Number of folds for the shuffled K-fold cross-validation.
    random_state : int or None, default None
        Seed for the fold shuffling; pass an int for reproducible folds.

    Returns
    -------
    The same ``model`` object, fitted on every row of ``data``.

    Notes
    -----
    "Accuracy" is measured on the data the model was trained on, so it is an
    optimistic figure; the cross-validation score is the held-out estimate.
    """
    features = data[predictors]
    target = data[outcome]

    # Cross-validate first. Each fold refits `model`, so this has to happen
    # before the final fit; otherwise the returned model would be the one
    # trained on the last fold's subset only.
    kf = KFold(n_splits, shuffle=True, random_state=random_state)
    fold_scores = []
    for train, test in kf.split(data):
        model.fit(features.iloc[train, :], target.iloc[train])
        # Accuracy on the rows held out from this fold.
        fold_scores.append(model.score(features.iloc[test, :], target.iloc[test]))

    # Final fit on all rows: this is the model handed back to the caller.
    model.fit(features, target)
    predictions = model.predict(features)
    accuracy = metrics.accuracy_score(target, predictions)

    # "{0:.3%}" renders the fraction as a percentage with three decimals.
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))
    print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(fold_scores)))
    return model

classification_model2 is a slightly tweaked version of the above classification_model function. It is trained on the training part of a split dataset (features and target passed separately), so that the resulting model can then be tested on the test set.

In [None]:
def classification_model2(model, x_train, p, y_train, n_splits=5, random_state=None):
    """Cross-validate ``model`` on a training split, then fit it on the whole split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``.
    x_train : pandas.DataFrame
        Training features; may contain more columns than are used.
    p : list of str
        Names of the columns of ``x_train`` used as predictors.
    y_train : pandas.Series
        Training target, positionally aligned with ``x_train``.
    n_splits : int, default 5
        Number of folds for the shuffled K-fold cross-validation.
    random_state : int or None, default None
        Seed for the fold shuffling; pass an int for reproducible folds.

    Returns
    -------
    The same ``model`` object, fitted on every row of ``x_train[p]``.

    Notes
    -----
    "Accuracy" is measured on the rows the model was trained on; the
    cross-validation score is the held-out estimate.
    """
    features = x_train[p]

    # Cross-validate first. Each fold refits `model`, so this has to happen
    # before the final fit; otherwise the returned model would be the one
    # trained on the last fold's subset only.
    kf = KFold(n_splits, shuffle=True, random_state=random_state)
    fold_scores = []
    for train, test in kf.split(x_train):
        model.fit(features.iloc[train, :], y_train.iloc[train])
        # Accuracy on the rows held out from this fold.
        fold_scores.append(model.score(features.iloc[test, :], y_train.iloc[test]))

    # Final fit on the full training split: this is the model handed back.
    model.fit(features, y_train)
    predictions = model.predict(features)
    accuracy = metrics.accuracy_score(y_train, predictions)

    # "{0:.3%}" renders the fraction as a percentage with three decimals.
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))
    print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(fold_scores)))
    return model

In [None]:
from sklearn.model_selection import train_test_split
# Features are the first 11 columns; the target is the 'quality' column.
X = data.iloc[:, :11]
y = data['quality']
# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)

First, we will train Logistic Regression models.

In [None]:
# Logistic regression on the full dataset, timed end to end.
s = time.time()
output = 'quality'
predict = ['alcohol', 'volatile acidity', 'sulphates', 'citric acid', 'residual sugar', 'pH']
print("Logistic Regression(1)")
lr = classification_model(
    LogisticRegression(max_iter=10000, fit_intercept=False, C=10000),
    data,
    predict,
    output,
)
print("Time = {}".format(time.time() - s))

In [None]:
# Logistic regression trained on the training split and scored on the test split.
s = time.time()
output = 'quality'
predict = ['alcohol', 'volatile acidity', 'sulphates', 'citric acid', 'residual sugar', 'pH']
lr2 = LogisticRegression(max_iter=10000, fit_intercept=False, C=10000)
print("Logistic Regression with Train Test Split")
# Keep the result under its own name: binding it to `lr` would silently
# replace the full-data model that later ensemble cells refer to as `lr`.
lr2 = classification_model2(lr2, X_train, predict, y_train)
# Evaluate on the held-out rows.
predictions = lr2.predict(X_test[predict])
accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy on test data = {}".format(accuracy))
print("Time = {}".format(time.time() - s))

The following is a K-Nearest Neighbors Model

In [None]:
# Distance-weighted KNN (200 neighbours) on the full dataset, timed end to end.
s = time.time()
output = 'quality'
predict = ['alcohol', 'volatile acidity']
print("K-Nearest Neighbors(1)")
knn = classification_model(
    KNeighborsClassifier(weights='distance', n_neighbors=200),
    data,
    predict,
    output,
)
print("Time = {}".format(time.time() - s))
# Author's note from an earlier run: 68.6 , 54.8
# (presumably accuracy % and cross-validation score % -- confirm by re-running)

In [None]:
# Distance-weighted KNN (150 neighbours) trained on the training split and
# scored on the test split.
s = time.time()
output = 'quality'
predict = ['alcohol', 'volatile acidity']
print("KNN with Train Test Split")
knn2 = classification_model2(
    KNeighborsClassifier(weights='distance', n_neighbors=150),
    X_train,
    predict,
    y_train,
)
# Evaluate on the held-out rows.
predictions = knn2.predict(X_test[predict])
accuracy = metrics.accuracy_score(predictions, y_test)
print("Accuracy on test data = {}".format(accuracy))
print("Time = {}".format(time.time() - s))

Next up, we will create a Decision Tree Classifier

In [None]:
# Size-limited decision tree on the full dataset, timed end to end.
s = time.time()
output = 'quality'
predict = ['volatile acidity', 'sulphates', 'alcohol']
print("DecisionTree")
dtree = classification_model(
    DecisionTreeClassifier(random_state=40, max_depth=20, max_leaf_nodes=100),
    data,
    predict,
    output,
)
print("Time = {}".format(time.time() - s))
# Author's note from an earlier run: 99,54
# (presumably accuracy % and cross-validation score % -- confirm by re-running)

In [None]:
# Unconstrained decision tree trained on the training split and scored on the
# test split.
s = time.time()
output = 'quality'
predict = ['alcohol', 'volatile acidity', 'sulphates']
print("Decision Tree 2.")
dtree2 = classification_model2(
    DecisionTreeClassifier(random_state=40),
    X_train,
    predict,
    y_train,
)
# Evaluate on the held-out rows.
predictions = dtree2.predict(X_test[predict])
accuracy = metrics.accuracy_score(predictions, y_test)
print("Accuracy on test data = {}".format(accuracy))
print("Time = {}".format(time.time() - s))

We will also train a random forest model

In [None]:
# Random forest on the full dataset, timed end to end.
s = time.time()
output = 'quality'
predict = ['volatile acidity', 'alcohol', 'sulphates']
rf = RandomForestClassifier(n_estimators=10000, max_depth=5, bootstrap=False)
print("Random Forest")
# Train and evaluate the forest itself. Passing `dtree` here would report the
# decision tree's scores under the "Random Forest" heading and leave `rf` unfitted.
rf = classification_model(rf, data, predict, output)
print("Time = {}".format(time.time() - s))

In [None]:
# Gaussian Naive Bayes on the full dataset, timed end to end.
s = time.time()
output = 'quality'
predict = ['alcohol', 'sulphates', 'citric acid', 'volatile acidity']
print("Naive Bayes")
nb = classification_model(GaussianNB(), data, predict, output)
print("Time = {}".format(time.time() - s))

Now, we will try some ensemble learning models.

In [None]:
# Soft-voting ensemble (averaged class probabilities) on the full dataset.
s = time.time()
# Set the target and predictors explicitly instead of relying on whatever the
# previously executed cell left behind. These are the values this cell receives
# when the notebook is run top to bottom.
output = 'quality'
predict = ['alcohol', 'sulphates', 'citric acid', 'volatile acidity']
estimators = [('lr', lr), ('knn', knn), ('tree', dtree)]
soft_vote = VotingClassifier(estimators=estimators, voting='soft')
print("soft voting ")
soft_vote = classification_model(soft_vote, data, predict, output)
print("Time = {}".format(time.time() - s))

In [None]:
# Soft-voting ensemble trained on the training split and scored on the test split.
s = time.time()
predict = ["alcohol", "sulphates", "volatile acidity"]
estimators = [('lr', lr), ('tree', dtree), ('knn', knn)]
print("Soft Vote using Train Test Split")
soft_vote2 = classification_model2(
    VotingClassifier(estimators=estimators, voting='soft'),
    X_train,
    predict,
    y_train,
)
# Evaluate on the held-out rows.
predictions = soft_vote2.predict(X_test[predict])
accuracy = metrics.accuracy_score(predictions, y_test)
print("Accuracy on test data = {}".format(accuracy))
print("Time = {}".format(time.time() - s))

In [None]:
# Hard-voting ensemble (majority vote on predicted labels) on the full dataset.
s = time.time()
print("Hard Voting")
# Set the inputs explicitly instead of relying on whatever earlier cells left
# behind. These are the values this cell receives when the notebook is run
# top to bottom.
output = 'quality'
predict = ["alcohol", "sulphates", "volatile acidity"]
estimators = [('lr', lr), ('tree', dtree), ('knn', knn)]
hard_vote = VotingClassifier(estimators=estimators, voting='hard')
hard_vote = classification_model(hard_vote, data, predict, output)
print("Time = {}".format(time.time() - s))

In [None]:
# Hard-voting ensemble trained on the training split and scored on the test split.
# `predict` is not set here; it keeps the value assigned in an earlier cell.
s = time.time()
estimators = [('lr', lr), ('tree', dtree), ('knn', knn)]
print("Hard Voting using Train Test Split")
hard_vote2 = classification_model2(
    VotingClassifier(estimators=estimators, voting='hard'),
    X_train,
    predict,
    y_train,
)
# Evaluate on the held-out rows.
predictions = hard_vote2.predict(X_test[predict])
accuracy = metrics.accuracy_score(predictions, y_test)
print("Accuracy on test data = {}".format(accuracy))
print("Time = {}".format(time.time() - s))

In [None]:
# Stacked ensemble with a KNN meta-learner, trained on the training split and
# scored on the test split.
# `predict` is not set here; it keeps the value assigned in an earlier cell.
s = time.time()
print("Stacking using KNN as Meta")
meta = KNeighborsClassifier(weights='distance', n_neighbors=100)
# Pass the meta-learner explicitly: without `final_estimator` the stack falls
# back to its default final estimator and `meta` is never used.
stack = StackingClassifier(
    estimators=[('lr2', lr2), ('knn', knn2), ('tree', dtree2), ('hard2', hard_vote2)],
    final_estimator=meta,
)
stack = classification_model2(stack, X_train, predict, y_train)
# Score the stack's own predictions. Reusing `predictions` left over from an
# earlier cell would report that other model's accuracy instead.
predictions = stack.predict(X_test[predict])
accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy on test data = {}".format(accuracy))
print("Time = {}".format(time.time() - s))

In [None]:
# Stacked ensemble with a logistic-regression meta-learner on the full dataset.
# `predict` and `output` are not set here; they keep the values assigned in
# earlier cells.
s = time.time()
meta = LogisticRegression()
stack = classification_model(
    StackingClassifier(
        estimators=[('knn', knn), ('tree', dtree), ('soft', soft_vote), ('rf', rf)],
        final_estimator=meta,
    ),
    data,
    predict,
    output,
)
print("Time = {}".format(time.time() - s))

In [None]:
# Stacked ensemble with a logistic-regression meta-learner, trained on the
# training split and scored on the test split.
# `predict` is not set here; it keeps the value assigned in an earlier cell.
s = time.time()
print("Using Logistic Regression as meta")
meta = LogisticRegression(max_iter=1000)
stack = StackingClassifier(
    estimators=[('knn2', knn2), ('tree', dtree), ('soft2', soft_vote2), ('hard2', hard_vote2)],
    final_estimator=meta,
)
stack = classification_model2(stack, X_train, predict, y_train)
# Score the stack's own predictions. Reusing `predictions` left over from an
# earlier cell would report that other model's accuracy instead.
predictions = stack.predict(X_test[predict])
accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy on Test Set = {}".format(accuracy))
print("Time = {}".format(time.time() - s))

# COMMENTS

In all the models we have trained and tested, we have seen that many of them tend to overfit the dataset: they have high accuracy on the training data, but their cross-validation score is low.

When I tried to reduce this difference, the accuracy decreased. The test set accuracy didn't reach 0.7 . 

Can anyone suggest any other way to increase the cross validation score and the accuracy on the test set?