In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the red-wine quality CSV from the read-only Kaggle input directory.
WINE_CSV_PATH = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(WINE_CSV_PATH)

In [None]:
# Peek at the first two rows to check the column layout.
df.head(n=2)

In [None]:
# Print column names, non-null counts and dtypes to check for missing values.
df.info()

In [None]:
# Summary statistics, transposed so that each column of `df` becomes one row.
df.describe().transpose()

In [None]:
# Analyzing quantitative variables: one box plot per column to expose spread
# and outliers.
plt.figure(figsize=(30, 30))
sns.set(font_scale=1.5)

# Panels are numbered from 1 in a fixed 4x3 grid (room for 12 columns).
for ind, col in enumerate(df.columns, start=1):
    plt.subplot(4, 3, ind)
    sns.boxplot(x=df[col])

In [None]:
# Number of wines at each quality score.
sns.countplot(x=df['quality'])

In [None]:
# Binary target: quality scores 3-6 map to 0, scores 7-8 map to 1.
quality_to_label = {score: int(score >= 7) for score in range(3, 9)}
y = df.quality.replace(quality_to_label)

In [None]:
# Count how many wines fall into each binary class (0 = quality 3-6, 1 = quality 7-8).
y.value_counts()

In [None]:
# Bar plot of every feature against the binary target `y`, one panel per
# feature, to see which features separate the two classes.
# Features are selected by name so the plot does not depend on `quality`
# being the last column of `df`.
feature_cols = [col for col in df.columns if col != 'quality']

n_cols = 3
n_rows = max(1, -(-len(feature_cols) // n_cols))  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize = (15,15))
axes = axes.flatten()

for ax, col in zip(axes, feature_cols):
    sns.barplot(x=y, y=df[col], orient='v', ax=ax)

# The grid can hold more panels than there are features; hide the leftover
# axes so no empty frame is drawn.
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Overwrite the raw score in `df` with the binary label (3-6 -> 0, 7-8 -> 1)
# so that later plots colour and correlate by class.
binary_quality = {score: int(score >= 7) for score in range(3, 9)}
df['quality'] = df['quality'].replace(binary_quality)

In [None]:
# Pairwise scatter plots of all columns, coloured by the `quality` column.
sns.pairplot(df, hue='quality')

In [None]:
# Data Pre-processing with different normalization options.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

def data_preprocess(X,y,std_scale=False,minmax_scale=False,
                    test_size=0.2,random_state=0,stratify=False):
    """Split the data into train/test sets and optionally scale the features.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split``.
    y : target values aligned with the rows of ``X``.
    std_scale : if true, scale features with ``StandardScaler``.
    minmax_scale : if true (and ``std_scale`` is false), scale features to
        the range [0, 1] with ``MinMaxScaler``. When both flags are set,
        ``std_scale`` wins.
    test_size : value passed to ``train_test_split`` as ``test_size``
        (default 0.2, the value that used to be hard-coded).
    random_state : seed for the split (default 0, as before).
    stratify : if true, split with ``stratify=y`` so both splits keep the
        class proportions of ``y``. Off by default so existing calls get
        the same split as before.

    Returns
    -------
    (train_features, test_features, y_train, y_test)
        With scaling enabled the feature sets are whatever the scaler's
        ``transform`` returns; otherwise they are the unmodified splits.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    if std_scale:
        scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    elif minmax_scale:
        scaler = MinMaxScaler(copy=True,feature_range=(0,1))
    else:
        # No scaling requested: hand back the raw splits.
        return(X_train, X_test, y_train, y_test)

    # Fit on the training split only so no test-set statistics leak into the
    # transformation, then apply the same transform to both splits.
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    return(train_scaled, test_scaled, y_train, y_test)

In [None]:
from sklearn.linear_model import LogisticRegression

def logistic_regression(X_train,X_test,y_train,y_test,cls_weight=None):
    """Fit a ``LogisticRegression`` and print its score on both splits.

    ``cls_weight`` is forwarded unchanged as the estimator's ``class_weight``.
    Returns the fitted estimator.
    """
    logreg = LogisticRegression(class_weight=cls_weight)
    logreg.fit(X_train, y_train)

    train_accuracy = logreg.score(X_train, y_train)
    print("Training set score: {:.3f}".format(train_accuracy))

    test_accuracy = logreg.score(X_test, y_test)
    print("Test set score: {:.3f}".format(test_accuracy))

    return logreg

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, matthews_corrcoef

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report, confusion matrix and MCC.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    train : if truthy, report on the training split; otherwise report on the
        test split. (Previously a falsy value other than ``False`` — e.g.
        ``None`` — printed nothing at all.)

    The printed text for ``train=True`` and ``train=False`` is unchanged,
    including the extra blank line that follows the training-split MCC.
    """
    if train:
        title, features, target, tail = "Train Result", X_train, y_train, "\n"
    else:
        title, features, target, tail = "Test Result", X_test, y_test, ""

    separator = "_______________________________________________"

    pred = clf.predict(features)
    clf_report = pd.DataFrame(classification_report(target, pred, output_dict=True))

    print(f"{title}:\n================================================")
    print(f"Accuracy Score: {accuracy_score(target, pred) * 100:.2f}%")
    print(separator)
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print(separator)
    print(f"Confusion Matrix: \n {confusion_matrix(target, pred)}")
    print(separator)
    print(f"MCC: {matthews_corrcoef(target, pred)}{tail}")

In [None]:
# Heatmap of the pairwise correlations between all columns of `df`
# (`DataFrame.corr` defaults to Pearson correlation).
n_columns = len(df.columns)
correlations = df.corr()

plt.figure(figsize=(n_columns, n_columns - 7))
sns.heatmap(correlations, annot=True, cmap="RdYlGn", annot_kws={"size": 15})

# Logistic regression

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='quality')

In [None]:
# 80/20 split with standardised features (scaler fitted on the training split).
X_train, X_test, y_train, y_test = data_preprocess(
    X, y, std_scale=True, minmax_scale=False
)

In [None]:
# Fit the logistic regression without class weighting; prints both scores.
log_reg = logistic_regression(
    X_train, X_test, y_train, y_test, cls_weight=None
)

In [None]:
# statsmodels does not add an intercept on its own. Without one the model is
# forced to predict p = 0.5 when all (standardised) features are at their
# mean, which distorts the coefficients and p-values. Add the constant column
# explicitly before fitting.
X_train_const = sm.add_constant(X_train)

logit_model = sm.Logit(y_train, X_train_const)
result = logit_model.fit()
result.summary2()

# Logistic Regression - Iteration 2

In [None]:
# Iteration 2 uses a reduced feature set: drop the listed predictors together
# with the target.
# NOTE(review): presumably these were the non-significant terms in the first
# Logit summary — confirm.
X = df.drop(
    columns=[
        'citric acid',
        'chlorides',
        'free sulfur dioxide',
        'total sulfur dioxide',
        'pH',
        'quality',
    ]
)

In [None]:
# Re-split and standardise using the reduced feature set.
X_train, X_test, y_train, y_test = data_preprocess(
    X, y, std_scale=True, minmax_scale=False
)

In [None]:
# Refit the logistic regression on the reduced feature set.
log_reg = logistic_regression(
    X_train, X_test, y_train, y_test, cls_weight=None
)

In [None]:
# statsmodels does not add an intercept on its own; add the constant column
# explicitly so the coefficients and p-values come from a model with an
# intercept term.
X_train_const = sm.add_constant(X_train)

logit_model = sm.Logit(y_train, X_train_const)
result = logit_model.fit()
result.summary2()

In [None]:
# Hard class predictions of the current logistic model for the test split.
# NOTE(review): `y_pred` is not referenced again in the visible cells.
y_pred = log_reg.predict(X_test)

In [None]:
# Full metric report for the logistic model: training split first, then test.
for is_train in (True, False):
    print_score(log_reg, X_train, y_train, X_test, y_test, train=is_train)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# Predicted probability of the positive outcome (second column of
# predict_proba) for every test sample.
lr_probs = log_reg.predict_proba(X_test)[:, 1]

# Summarise ranking quality with the area under the ROC curve.
lr_auc = roc_auc_score(y_test, lr_probs)
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# Points of the ROC curve; the thresholds themselves are not needed here.
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

# Draw the curve with labelled axes and a legend.
plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Decision Tree

In [None]:
# Back to the full feature set for the tree models.
X = df.drop(columns='quality')

In [None]:
# Same 80/20 split, but with no feature scaling applied.
X_train, X_test, y_train, y_test = data_preprocess(
    X, y, std_scale=False, minmax_scale=False
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default settings (no pruning parameter set).
tree_clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Report the training split first, then the test split.
for is_train in (True, False):
    print_score(tree_clf, X_train, y_train, X_test, y_test, train=is_train)

In [None]:
# Cost-complexity pruning path of the baseline tree on the training data:
# the candidate alphas and the total leaf impurity at each of them.
path = tree_clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
print(ccp_alphas)

In [None]:
# Total leaf impurity as a function of alpha. The final (largest) alpha is
# left out of the plot — presumably because it belongs to the fully pruned
# tree; confirm against the node count printed for the last tree.
alphas_to_plot = ccp_alphas[:-1]
impurities_to_plot = impurities[:-1]

fig, ax = plt.subplots(figsize=(15,5))
ax.plot(alphas_to_plot, impurities_to_plot, marker='o', drawstyle="steps-post")
ax.set_ylabel("total impurity of leaves")
ax.set_xlabel("effective alpha")
ax.set_title("Total Impurity vs effective alpha for training set")

In [None]:
# Fit one tree per candidate alpha (`fit` returns the estimator itself).
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]

# Show how small the tree for the largest alpha is.
last_tree, last_alpha = clfs[-1], ccp_alphas[-1]
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
      last_tree.tree_.node_count, last_alpha))

In [None]:
# Discard the tree for the largest alpha before plotting.
# The trimmed length is derived from the untouched `path` object instead of
# slicing `clfs` / `ccp_alphas` by [:-1], so re-running this cell does not
# remove one more tree each time.
n_kept = len(path.ccp_alphas) - 1
clfs = clfs[:n_kept]
ccp_alphas = path.ccp_alphas[:n_kept]

node_counts = [model.tree_.node_count for model in clfs]
depth = [model.tree_.max_depth for model in clfs]

# Left panel: number of nodes per alpha; right panel: tree depth per alpha.
fig, ax = plt.subplots(1, 2,figsize=(20,8))
ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")

ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")

In [None]:
# Score of every pruned tree on both splits.
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]

fig, ax = plt.subplots(figsize=(15,5))
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")

# Same line style for both curves; only the data and legend label differ.
for split_label, scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, scores, marker='o', label=split_label,
            drawstyle="steps-post")

ax.legend()
plt.show()

In [None]:
# Refit the tree with a fixed pruning strength.
# NOTE(review): 0.006 looks hand-picked from the accuracy-vs-alpha plot —
# confirm. If it was chosen from the test curve, the test score is optimistic.
pruned_params = {"random_state": 0, "ccp_alpha": 0.006}
tree_clf = DecisionTreeClassifier(**pruned_params)
tree_clf.fit(X_train, y_train)

In [None]:
# Metric report for the pruned tree: training split first, then test.
for is_train in (True, False):
    print_score(tree_clf, X_train, y_train, X_test, y_test, train=is_train)

In [None]:
# Feature importance of the pruned tree, labelled with the column names.
feat_importances = pd.Series(tree_clf.feature_importances_,index=X.columns)

# Keep only features the tree actually uses, most important first.
used_features = feat_importances[feat_importances > 0].sort_values(ascending=False)

used_features.plot(kind='bar')
plt.show()

print(used_features)

In [None]:
# Refit the pruned tree using only three selected features.
top_features = ['volatile acidity', 'sulphates', 'alcohol']
tree_clf = DecisionTreeClassifier(ccp_alpha=0.006, random_state=0)
tree_clf.fit(X_train[top_features], y_train)

In [None]:
# Metric report for the three-feature tree; both splits are restricted to the
# same columns the tree was fitted on.
selected = ['volatile acidity','sulphates','alcohol']
X_train_sel, X_test_sel = X_train[selected], X_test[selected]

for is_train in (True, False):
    print_score(tree_clf, X_train_sel, y_train, X_test_sel, y_test, train=is_train)

In [None]:
# Predict the class of one hand-written sample.
# The tree was fitted on a DataFrame, so the sample is passed as a DataFrame
# with the same column names. This ties each value to its feature explicitly
# and avoids scikit-learn's "X does not have valid feature names" warning.
sample = pd.DataFrame(
    [[0.88, 0.56, 9.4]],
    columns=['volatile acidity', 'sulphates', 'alcohol'],
)
tree_clf.predict(sample)[0]

# Conclusion: the decision tree with post-pruning gave the best result among the models tried in this notebook.