In [19]:
from __future__ import print_function

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, recall_score, precision_score

In [2]:
red_wine = pd.read_csv('winequality-red.csv',sep=";")
red_wine['type']="red wine"
white_wine = pd.read_csv('winequality-white.csv',sep=";")
white_wine['type']="white wine"

In [3]:
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["Target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)

In [4]:
from sklearn.model_selection import train_test_split

both_wines = pd.concat([red_wine,white_wine],axis=0)

df2, targets = encode_target(both_wines, "type")
print("* df2.head()", df2[["Target", "type"]].head(),
      sep="\n", end="\n\n")
print("* df2.tail()", df2[["Target", "type"]].tail(),
      sep="\n", end="\n\n")
print("* targets", targets, sep="\n", end="\n\n")

* df2.head()
   Target      type
0       0  red wine
1       0  red wine
2       0  red wine
3       0  red wine
4       0  red wine

* df2.tail()
      Target        type
4893       1  white wine
4894       1  white wine
4895       1  white wine
4896       1  white wine
4897       1  white wine

* targets
['red wine' 'white wine']



In [5]:
df_shuffled = df2.iloc[np.random.permutation(len(df2))]
df2 = df_shuffled.reset_index(drop=True)

In [14]:
from sklearn.preprocessing import StandardScaler
features = list(df2_hq.columns[:12])
print("* features:", features, sep="\n")

y = df2_hq["Target"]
X = df2_hq[features]
X_std = StandardScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_std,y,test_size=.3,random_state=10)

* features:
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


In [15]:
def con_rec_pre(y_true,y_pred):
    """Calculates and prints the confusion matrix, the recall and the precision score

    Args
    ----
    y_true -- 1d array-like, or label indicator array / sparse matrix
    y_pred -- 1d array-like, or label indicator array / sparse matrix

    Returns
    -------
    test_conf_mat -- confusion matrix
    test_recall -- recall score
    test_precision -- precision score
    """
    test_conf_mat = confusion_matrix(y_true,y_pred)
    test_recall = recall_score(y_true,y_pred)
    test_precision = precision_score(y_true,y_pred)
    print("* Confusion matrix:", test_conf_mat,sep="\n", end="\n\n")
    print("* Recall:", test_recall,sep="\n", end="\n\n")
    print("* Precision:", test_precision,sep="\n", end="\n\n")

In [17]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=10)
dt.fit(X_train,y_train)

dt_y_train_pred = dt.predict(X_train)
dt_y_test_pred = dt.predict(X_test)

In [20]:
con_rec_pre(dt_y_train_pred,y_train)

* Confusion matrix:
[[ 12   0]
 [  0 126]]

* Recall:
1.0

* Precision:
1.0



In [21]:
con_rec_pre(dt_y_test_pred,y_test)

* Confusion matrix:
[[ 4  0]
 [ 2 54]]

* Recall:
0.964285714286

* Precision:
1.0

