In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import the libraries
import matplotlib.pyplot as plt
import seaborn as sb
# Apply seaborn's default styling to the figures drawn in the cells below.
sb.set()

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
from sklearn.linear_model import LogisticRegression


In [None]:
# Load the red-wine quality CSV and preview its first rows.
WINE_CSV_PATH = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
wine_raw_data = pd.read_csv(WINE_CSV_PATH)
wine_raw_data.head()

# The given data set contains only numerical features.

In [None]:
# Checkpoint: work on an independent (deep) copy so wine_raw_data stays untouched.
wine_data = wine_raw_data.copy(deep=True)

In [None]:
# Summary of the data set: column names, non-null counts and dtypes.
wine_data.info()

In [None]:
# The data set does not contain any null values (see the info() summary above).
# Describe it (count, mean, std, min, quartiles, max per column) for further understanding.
wine_data.describe()

In [None]:
# Separate the independent variables (every column but the last) from the
# dependent variable (the last column) and show both arrays.
features_frame = wine_data.iloc[:, :-1]
target_series = wine_data.iloc[:, -1]
x, y = features_frame.values, target_series.values
print(x, "-----", y, sep="\n")

In [None]:
# Unique values of the dependent feature ('quality').
wine_data['quality'].unique()

In [None]:
# Bar chart of how many wines received each quality rating,
# followed by the same counts as text, ordered by rating.
rating_counts = wine_data['quality'].value_counts().sort_index()
sb.countplot(data=wine_data, x='quality')
print(rating_counts)

In [None]:
# From the above observation:
# most of the samples have a quality rating of 5 or 6.
# Since the dependent variable is concentrated around 5 and 6, we pick the cut-off between them by trial and error:
# any rating of 5 or below (i.e. below 5.5) is considered bad (0) and any rating above 5.5 is considered good (1).

# For checkpoint purposes, copy wine_data into a new variable.

wine_copy_data = wine_data.copy()

In [None]:
# Binarize the target: ratings 0-5 -> 0 (bad), ratings 6-10 -> 1 (good).
quality_scores = wine_copy_data['quality']

# A rating outside 0-10 (or NaN) belongs to neither class. Fail early with a
# clear message instead of producing a label list of the wrong length.
if not quality_scores.between(0, 10).all():
    raise ValueError("'quality' must contain only ratings between 0 and 10")

# Vectorised comparison instead of a Python loop; labels are stored as integers
# (not the strings '0'/'1') so the new column is a proper numeric column.
wine_quality = (quality_scores > 5).astype(int).tolist()
wine_copy_data['wine_quality'] = wine_quality

In [None]:
# List the column names to confirm the 'wine_quality' label column was added.
wine_copy_data.columns

In [None]:
# Preview the first rows, now including the derived 'wine_quality' label.
wine_copy_data.head()

In [None]:
# Preview the frame without the raw 'quality' column.
# NOTE(review): the result is not assigned back, so wine_copy_data itself still
# contains 'quality' after this cell.
wine_copy_data.drop(['quality'],axis = 1)

In [None]:
# Annotated heat map of the pairwise correlations, used to see how the
# independent features relate to the dependent one.
fig, ax = plt.subplots(figsize=(15, 10))
sb.heatmap(wine_copy_data.corr(), annot=True, ax=ax)

In [None]:
# Correlation of every column with the raw 'quality' rating, highest value first.
corr = wine_copy_data.corr()
corr.loc[:, 'quality'].sort_values(ascending=False)

In [None]:
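# From the above observation, alcohol has the strongest positive correlation with quality, while volatile acidity sits at the bottom of the sorted list, i.e. it has the lowest correlation value. Note that a strongly negative coefficient indicates a strong inverse relationship rather than a weak one, so relatedness should be judged by the absolute value.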

# Feature Scaling: the features span very different ranges, so we need to scale the data before modelling.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature column with scikit-learn's StandardScaler.
sc = StandardScaler()

# Features: every column except the raw 'quality' rating and the derived
# 'wine_quality' label. 'quality' must be excluded because the label is computed
# directly from it; keeping it as a feature would leak the answer to the models.
x_new = wine_copy_data.drop(columns=['quality', 'wine_quality']).values
# Target: the binary good/bad label.
y_new = wine_copy_data['wine_quality'].values
x_scale = sc.fit_transform(x_new)

# Show the raw and the scaled feature matrices for comparison.
print(x_new)
print("------")
print(x_scale)

# Train and test Data

In [None]:
from sklearn.model_selection import train_test_split

# Split the *unscaled* features (80% train / 20% test, fixed seed) so that the
# scaler can be fitted on the training rows only.
x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, train_size=0.8, random_state=42)

# Learn the scaling statistics from the training split, then apply that same
# transform to the test split. Re-fitting on the test split would scale it with
# different statistics than the data the models are trained on.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Our data is now ready and several algorithms can be applied to it:
    1) Logistic Regression
    2) SVM Classifier
    3) Random Forest Classifier
    4) Decision Tree Classifier.

1) Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a logistic-regression classifier with default settings on the training
# split, then predict the labels of the test split.
lr = LogisticRegression().fit(x_train, y_train)
lr_predict = lr.predict(x_test)

In [None]:
# Logistic regression: confusion matrix, then accuracy as a percentage.
lr_acc_score = accuracy_score(y_test, lr_predict)
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
print(lr_conf_matrix, lr_acc_score*100, sep="\n")

2) SVM Classifier: 

In [None]:
from sklearn.svm import SVC

# Start with SVC's default (RBF) kernel and check how accurate it is.
svm = SVC().fit(x_train, y_train)
svm_pred = svm.predict(x_test)

In [None]:
# SVM: confusion matrix, then accuracy as a percentage.
svm_acc_score = accuracy_score(y_test, svm_pred)
svm_conf_matrix = confusion_matrix(y_test, svm_pred)
print(svm_conf_matrix, svm_acc_score*100, sep="\n")

3) Decision Tree Classifier:

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed (same value as the train/test split) so the fitted tree, and
# therefore the reported accuracy, is reproducible across runs. Without it the
# tree can differ from run to run.
decisionTree = DecisionTreeClassifier(random_state=42)
decisionTree.fit(x_train,y_train)
decisionTree_pred = decisionTree.predict(x_test)

In [None]:
# Decision tree: confusion matrix, then accuracy as a percentage.
decisionTree_acc_score = accuracy_score(y_test, decisionTree_pred)
decisionTree_conf_matrix = confusion_matrix(y_test, decisionTree_pred)
print(decisionTree_conf_matrix, decisionTree_acc_score*100, sep="\n")