# Ensemble
In this notebook we try a standard ensemble approach with K-fold CV.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix

In [4]:
#data = pd.read_csv('data/red_normal.csv')
#data = pd.read_csv('data/red_data.csv')
#data = pd.read_csv('data/white_normal.csv')
#data = pd.read_csv('data/white_data.csv')
data = pd.read_csv('data/wine_normal.csv')
#data = pd.read_csv('data/wine_data.csv')
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type_red,quality
0,-1.564967,-2.115822,-0.613053,-0.412341,-0.381414,-1.025732,-0.973236,0.248294,1.137303,-0.261526,-1.067971,0,5
1,-0.987675,1.452634,-1.288368,0.404591,-0.391592,0.402953,-0.159545,-0.66055,1.741648,1.367797,1.761353,1,7
2,-0.77273,1.646757,-0.613053,-0.392625,1.147534,1.33543,1.591586,-0.009153,0.541278,0.264209,-0.580412,0,4
3,-0.645329,-0.550403,-0.282528,-0.865805,0.433714,-0.578715,-0.432035,-0.664473,0.342604,-0.173903,-0.092853,0,6
4,-0.298454,-0.583142,2.099702,-0.898276,5.590264,0.489629,0.899821,0.293176,-1.496978,3.845571,-1.053784,1,5


In [5]:
# Optionally generate interaction features x*y and x**2
if True:
    from itertools import combinations
    old_features = [name for name in data.columns if not 'type' in name and name != "quality"]
    for feature1, feature2 in combinations(old_features, 2):
        temp = data[feature1] * data[feature2]
        data.insert(data.shape[1]-1, feature1 + 'x' + feature2, temp)

    for name in old_features:
         data.insert(data.shape[1]-1, name + '**2', data[name]*data[name])   

In [6]:
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,...,total_SDxpH,total_SDxsulphates,total_SDxalcohol,densityxpH,densityxsulphates,densityxalcohol,pHxsulphates,pHxalcohol,sulphatesxalcohol,quality
0,-1.564967,-2.115822,-0.613053,-0.412341,-0.381414,-1.025732,-0.973236,0.248294,1.137303,-0.261526,...,-1.106864,0.254527,1.039388,0.282385,-0.064935,-0.265171,-0.297434,-1.214606,0.279302,5
1,-0.987675,1.452634,-1.288368,0.404591,-0.391592,0.402953,-0.159545,-0.66055,1.741648,1.367797,...,-0.27787,-0.218225,-0.281014,-1.150446,-0.903499,-1.163462,2.38222,3.067656,2.409174,7
2,-0.77273,1.646757,-0.613053,-0.392625,1.147534,1.33543,1.591586,-0.009153,0.541278,0.264209,...,0.861491,0.420512,-0.923776,-0.004954,-0.002418,0.005313,0.143011,-0.314165,-0.15335,4
3,-0.645329,-0.550403,-0.282528,-0.865805,0.433714,-0.578715,-0.432035,-0.664473,0.342604,-0.173903,...,-0.148017,0.075132,0.040116,-0.227651,0.115554,0.061698,-0.05958,-0.031812,0.016147,6
4,-0.298454,-0.583142,2.099702,-0.898276,5.590264,0.489629,0.899821,0.293176,-1.496978,3.845571,...,-1.347012,3.460325,-0.948216,-0.438878,1.127429,-0.308944,-5.756737,1.577491,-4.0524,5


In [7]:
# number of features
p = data.shape[1] - 1
p

67

In [8]:
X, y = data.ix[:,:-1], data['quality']

In [9]:
# We leave out 20% as a test set and the remaining 80% will be our training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)

In [10]:
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True)
kf.get_n_splits(X_train)

5

In [11]:
def selectBestParameters(X_tr, X_ts, y_tr, y_ts):
    penalties = ['l1', 'l2']
    C = [0.001*10**i for i in range(6)]
    maes = list()
    params = list()
    for p in penalties:
        for c in C:
            params.append([p, c])
            clf = LogisticRegression(penalty=p, C=c, n_jobs=-1)
            clf.fit(X_tr, y_tr)
            y_pred = clf.predict(X_ts)
            mae = mean_absolute_error(y_ts, y_pred)
            maes.append(mae)
    arg_min = np.argmin(maes)
    return params[arg_min]

In [12]:
clfs = []
for i, (train_index, test_index) in enumerate(kf.split(X_train)):
    print("Training split number {} ...".format(i+1))
    X_tr, X_ts = X_train.loc[train_index], X_train.loc[test_index]
    y_tr, y_ts = y_train.loc[train_index], y_train.loc[test_index]
    penalty, C = selectBestParameters(X_tr, X_ts, y_tr, y_ts)
    clf = LogisticRegression(penalty=penalty, C=C)
    clf.fit(X_tr, y_tr)
    clfs.append(clf)
    y_pred = clf.predict(X_ts)
    print("MAE =", mean_absolute_error(y_ts, y_pred))
    print("Acc. =", accuracy_score(y_ts, y_pred))
    print()

Training split number 1 ...
MAE = 0.488461538462
Acc. = 0.553846153846

Training split number 2 ...
MAE = 0.450961538462
Acc. = 0.589423076923

Training split number 3 ...
MAE = 0.504331087584
Acc. = 0.543792107796

Training split number 4 ...
MAE = 0.484119345525
Acc. = 0.566891241578

Training split number 5 ...
MAE = 0.527430221367
Acc. = 0.533205004812



In [13]:
def generateFeatures(train):
    for idx, clf in enumerate(clfs):
        name = 'clf_' + str(idx)
        y_pred = clf.predict(train.ix[:,:p])
        position = train.shape[1] - 1
        train.insert(position, name, y_pred)
generateFeatures(X_train)
generateFeatures(X_test)

In [14]:
"""clf1 = LogisticRegression(penalty='l2', C=10, n_jobs=-1)
clf1.fit(X_train.ix[:,:11], y_train)
y_pred1 = clf1.predict(X_test.ix[:,:11])
clf2 = LogisticRegression(penalty='l2', C=10, n_jobs=-1)
clf2.fit(X_train.ix[:,11:], y_train)
y_pred2 = clf2.predict(X_test.ix[:,11:])
y_pred = (y_pred1*y_pred2)**0.5
y_pred = pd.Series(y_pred).apply(np.rint)"""

clf = LogisticRegression(penalty='l2', C=10, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.534615384615
Acc. = 0.537692307692


## Feature importance using random forests

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
clf = RandomForestClassifier(n_estimators=500,
                             max_depth=25,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='auto')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.383846153846
Acc. = 0.667692307692


In [17]:
# Importance of features according to the random forest
important = []
for name, imp in zip(X_train.columns,clf.feature_importances_):
    important.append((imp, name))
for imp, name in sorted(important)[::-1]:
    print("{:<18} {}".format(name, imp))

clf_2              0.028414555060923063
clf_1              0.02586225133704547
alcohol            0.024790225398306918
volatile_acidityxalcohol 0.01972515705685514
clf_0              0.019391392057707856
clf_4              0.01733818798835115
density            0.015959963467343203
volatile_acidity   0.01538372352936672
chloridesxalcohol  0.014922378232329623
sulphatesxalcohol  0.014904424506633769
volatile_acidityxtotal_SD 0.014707447966871321
volatile_acidityxchlorides 0.014430938920663897
volatile_acidityxsulphates 0.014392412007502898
volatile_acidityxdensity 0.014296970932726992
densityxalcohol    0.014283560098527716
chlorides          0.014198479447285337
free_SDxtotal_SD   0.014093703160169457
volatile_acidityxpH 0.013856693340338212
clf_3              0.013848837856761154
free_SDxsulphates  0.013723452437187469
citricxalcohol     0.013641701740309625
volatile_acidityxfree_SD 0.013633786535493818
fixed_acidityxtotal_SD 0.013608173002715516
chloridesxpH       0.01359938316985352

In [18]:
# Not using the extra classifiers obtained above
clf = RandomForestClassifier(n_estimators=500,
                             max_depth=25,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='auto')
clf.fit(X_train.ix[:,:11], y_train)
y_pred = clf.predict(X_test.ix[:,:11])
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.386923076923
Acc. = 0.670769230769


In [19]:
# Importance of features according to the random forest
important = []
for name, imp in zip(X_train.columns,clf.feature_importances_):
    important.append((imp, name))
for imp, name in sorted(important)[::-1]:
    print("{:<18} {}".format(name, imp))

alcohol            0.12752848193471442
volatile_acidity   0.10366217012061799
density            0.10100621344383716
total_SD           0.09178807473701675
free_SD            0.08666180455037648
chlorides          0.0862586954535185
sugar              0.08362177100902778
sulphates          0.0832671276624536
pH                 0.08202970567998959
citric             0.07947800092384905
fixed_acidity      0.07469795448459868


## Using only one logistic regression classifier

In [20]:
X_train.iloc[:,:-5]

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,...,total_SDxdensity,total_SDxpH,total_SDxsulphates,total_SDxalcohol,densityxpH,densityxsulphates,densityxalcohol,pHxsulphates,pHxalcohol,clf_0
0,-0.643065,-1.091130,-0.159011,-0.447265,-0.432743,1.274425,0.466634,-0.522790,0.834832,-0.461036,...,-0.243952,0.389561,-0.215135,-0.141428,-0.436442,0.241025,0.158447,-0.384888,-0.253021,6
1,0.563469,3.669610,-2.017781,0.987486,0.049269,0.685891,1.544525,1.037352,-0.253420,-0.261526,...,1.602217,-0.391414,-0.403934,-1.900524,-0.262886,-0.271295,-1.276452,0.066276,0.311831,5
2,-0.148033,-1.411915,-0.117266,1.204360,-0.617855,-1.025732,0.415062,0.880209,-2.041493,-1.137752,...,0.365342,-0.847347,-0.472238,-0.510730,-1.796941,-1.001460,-1.083089,2.322712,2.512038,7
3,-0.643065,0.732989,-1.391037,-0.590653,0.503620,0.214135,0.038950,0.001760,0.381425,-0.461036,...,0.000069,0.014856,-0.017957,-0.044700,0.000671,-0.000811,-0.002019,-0.175851,-0.437731,5
4,-0.901412,0.560280,-0.034635,0.849475,-0.537296,-0.281390,-0.949706,0.395406,1.004853,0.264209,...,-0.375520,-0.954314,-0.250921,0.242529,0.397325,0.104470,-0.100976,0.265492,-0.256612,6
5,0.212659,0.044742,1.204832,-0.057455,-0.088621,1.240770,1.073916,0.545862,0.077704,0.088964,...,0.586210,0.083448,0.095540,-0.972378,0.042416,0.048562,-0.494252,0.006913,-0.070357,5
6,0.093554,-0.957178,2.196405,-0.471488,-0.458508,-2.861307,-0.384974,-0.493956,-1.776593,0.877568,...,0.190160,0.683943,-0.337841,0.035746,0.877559,-0.433480,0.045865,-1.559080,0.164962,6
7,1.016746,0.839495,1.287463,0.445300,-0.784616,0.553760,1.120977,-0.310065,-0.319645,-0.962507,...,-0.347576,-0.358315,-1.078948,1.717721,0.099111,0.298440,-0.475126,0.307661,-0.489806,6
8,-0.148033,-0.067173,1.287463,0.336862,0.553216,0.809844,1.379812,0.519115,0.011479,-0.086281,...,0.716281,0.015839,-0.119051,-1.137229,0.005959,-0.044790,-0.427850,-0.000990,-0.009461,5
9,0.093554,-0.957178,-0.199897,2.535181,-0.019013,0.362828,0.673898,1.843128,-1.710368,0.264209,...,1.242079,-1.152613,0.178050,-0.774464,-3.152427,0.486972,-2.118179,-0.451895,1.965608,6


In [21]:
clf = LogisticRegression(penalty='l2', C=10, n_jobs=-1)
clf.fit(X_train.iloc[:,:-5], y_train)
y_pred = clf.predict(X_test.iloc[:,:-5])
print("MAE =", mean_absolute_error(y_test, y_pred))
print("Acc. =", accuracy_score(y_test, y_pred))

MAE = 0.539230769231
Acc. = 0.536153846154
