# Ensemble
In this notebook we try a standard ensemble approach with K-fold CV.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix

In [4]:
#data = pd.read_csv('data/red_normal.csv')
#data = pd.read_csv('data/red_data.csv')
#data = pd.read_csv('data/white_normal.csv')
#data = pd.read_csv('data/white_data.csv')
#data = pd.read_csv('data/wine_normal.csv')
data = pd.read_csv('data/wine_data.csv')
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type_red,quality
0,6.3,0.27,0.25,5.8,0.038,52.0,155.0,0.995,3.28,0.38,9.4,0,6
1,7.0,0.31,0.26,7.4,0.069,28.0,160.0,0.9954,3.13,0.46,9.8,0,6
2,7.6,0.38,0.2,3.4,0.046,9.0,116.0,0.9944,3.15,0.41,9.4,0,5
3,8.5,0.28,0.35,1.7,0.061,6.0,15.0,0.99524,3.3,0.74,11.8,1,7
4,8.0,0.725,0.24,2.8,0.083,10.0,62.0,0.99685,3.35,0.56,10.0,1,6


In [5]:
X, y = data.ix[:,:-1], data['quality']

In [6]:
# We leave out 20% as a test set and the remaining 80% will be our training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)

In [7]:
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True)
kf.get_n_splits(X_train)

5

In [8]:
def selectBestParameters(X_tr, X_ts, y_tr, y_ts):
    penalties = ['l1', 'l2']
    C = [0.001*10**i for i in range(6)]
    maes = list()
    params = list()
    for p in penalties:
        for c in C:
            params.append([p, c])
            clf = LogisticRegression(penalty=p, C=c, n_jobs=-1)
            clf.fit(X_tr, y_tr)
            y_pred = clf.predict(X_ts)
            mae = mean_absolute_error(y_ts, y_pred)
            maes.append(mae)
    arg_min = np.argmin(maes)
    return params[arg_min]

In [9]:
clfs = []
for i, (train_index, test_index) in enumerate(kf.split(X_train)):
    print("Training split number {} ...".format(i+1))
    X_tr, X_ts = X_train.loc[train_index], X_train.loc[test_index]
    y_tr, y_ts = y_train.loc[train_index], y_train.loc[test_index]
    penalty, C = selectBestParameters(X_tr, X_ts, y_tr, y_ts)
    clf = LogisticRegression(penalty=penalty, C=C)
    clf.fit(X_tr, y_tr)
    clfs.append(clf)
    y_pred = clf.predict(X_ts)
    print("MAE =", mean_absolute_error(y_ts, y_pred))
    print("Acc. =", accuracy_score(y_ts, y_pred))
    print()

Training split number 1 ...
MAE = 0.536538461538
Acc. = 0.525

Training split number 2 ...
MAE = 0.514423076923
Acc. = 0.541346153846

Training split number 3 ...
MAE = 0.520692974013
Acc. = 0.531280076997

Training split number 4 ...
MAE = 0.517805582291
Acc. = 0.543792107796

Training split number 5 ...
MAE = 0.525505293551
Acc. = 0.533205004812



In [10]:
def generateFeatures(train):
    for idx, clf in enumerate(clfs):
        name = 'clf_' + str(idx)
        y_pred = clf.predict(train.ix[:,:12])
        position = train.shape[1] - 1
        train.insert(position, name, y_pred)
generateFeatures(X_train)
generateFeatures(X_test)

In [11]:
"""clf1 = LogisticRegression(penalty='l2', C=10, n_jobs=-1)
clf1.fit(X_train.ix[:,:11], y_train)
y_pred1 = clf1.predict(X_test.ix[:,:11])
clf2 = LogisticRegression(penalty='l2', C=10, n_jobs=-1)
clf2.fit(X_train.ix[:,11:], y_train)
y_pred2 = clf2.predict(X_test.ix[:,11:])
y_pred = (y_pred1*y_pred2)**0.5
y_pred = pd.Series(y_pred).apply(np.rint)"""
features = ['alcohol', 'volatile_acidity'] 
features += ['clf_' + str(i) for i in range(n_splits)]
clf = LogisticRegression(penalty='l2', C=10, n_jobs=-1)
clf.fit(X_train[features], y_train)
y_pred = clf.predict(X_test[features])
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.520769230769
Acc. = 0.549230769231


In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
clf = RandomForestClassifier(n_estimators=500,
                             max_depth=25,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='auto')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.344615384615
Acc. = 0.696923076923


In [14]:
important = []
for name, imp in zip(X_train.columns,clf.feature_importances_):
    important.append((imp, name))
for imp, name in sorted(important)[::-1]:
    print("{:<18} {}".format(name, imp))

alcohol            0.08744958921880071
density            0.08724093612277391
total_SD           0.08643291576783323
volatile_acidity   0.08563921113118342
free_SD            0.08410553756900786
pH                 0.08307755172540497
sugar              0.08239138973495075
chlorides          0.0804824382608191
sulphates          0.07941118985587754
citric             0.07500664628055985
fixed_acidity      0.07391349750398445
clf_1              0.029830024932010563
clf_3              0.024813631649719302
clf_0              0.02344648340981929
clf_2              0.008677472851043245
clf_4              0.004585317497194166
type_red           0.0034961664890176603


In [15]:
clf = RandomForestClassifier(n_estimators=500,
                             max_depth=25,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='auto')
clf.fit(X_train.ix[:,:11], y_train)
y_pred = clf.predict(X_test.ix[:,:11])
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.347692307692
Acc. = 0.696923076923


In [16]:
important = []
for name, imp in zip(X_train.columns,clf.feature_importances_):
    important.append((imp, name))
for imp, name in sorted(important)[::-1]:
    print("{:<18} {}".format(name, imp))

alcohol            0.12456150169915467
volatile_acidity   0.10079086524084184
density            0.10037199210137956
total_SD           0.09056896930277966
free_SD            0.08735394626657877
chlorides          0.08717254140126988
sugar              0.08628963087932363
sulphates          0.08513512467085266
pH                 0.08451372034909267
citric             0.07726568780858502
fixed_acidity      0.07597602028014176
