In this notebook we optimize the parameters of a simple Logistic Regression classifier.

In [332]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations_with_replacement
from functools import reduce
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix

# Red Wine

In [333]:
# parameters for feature generation
degree = 4
n_components = 30
scale = False
# parameters for training
penalty = 'l2'               # 'l1' or 'l2'
C = .1                      # any C>=0
class_weight = None          # 'balanced' or 'None'
max_iter = 10000              # default is 100
solver = 'sag'               # 'newton-cg', 'lbfgs', 'sag' or 'liblinear'
multi_class = 'multinomial'  # 'multinomial' or 'ovr'

In [334]:
data = pd.read_csv("data/red_normal.csv")
#data = pd.read_csv("data/red_data.csv")
#data = pd.read_csv("data/winequality-red.csv", sep=";")
train_index = data.sample(frac=0.8).sort_index().index
test_index = ~data.index.isin(train_index)

In [335]:
X, y = data.ix[:,:-1], data["quality"]

In [336]:
if True:
    old_features = [name for name in X.columns if not 'type' in name]
    for combination in combinations_with_replacement(old_features, degree):
        temp = reduce(lambda x, y: x*y, map(lambda x: X[x], combination))
        X.insert(X.shape[1]-1, '*'.join(combination), temp)   
if True:
    X = (X - X.mean())/X.std()
if True:
    pls = PLSRegression(n_components=n_components, scale=scale)
    pls.fit(X, y)
    X = pd.DataFrame(pls.transform(X))

In [339]:
clf = LogisticRegression(penalty=penalty, 
                   C=C,
                   class_weight=class_weight,
                   max_iter=max_iter,
                   solver=solver,
                   multi_class=multi_class)
clf.fit(X.loc[train_index], y.loc[train_index])
y_pred = clf.predict(X.loc[test_index])
print(mean_absolute_error(y.loc[test_index], y_pred))
print(accuracy_score(y.loc[test_index], y_pred))
confusion_matrix(y.loc[test_index], y_pred)

0.290625
0.71875


array([[  0,   4,   1,   0,   0,   0],
       [  0,   2,   7,   0,   0,   0],
       [  0,   0, 104,  21,   0,   0],
       [  0,   0,  28,  99,   3,   0],
       [  0,   0,   1,  18,  25,   0],
       [  0,   0,   0,   1,   6,   0]])

## White wine

In [340]:
# parameters for feature generation
degree = 4
n_components = 30
scale = False
# parameters for training
penalty = 'l2'               # 'l1' or 'l2'
C = .1                      # any C>=0
class_weight = None          # 'balanced' or 'None'
max_iter = 10000              # default is 100
solver = 'sag'               # 'newton-cg', 'lbfgs', 'sag' or 'liblinear'
multi_class = 'multinomial'  # 'multinomial' or 'ovr'

In [341]:
data = pd.read_csv("data/white_normal.csv")
#data = pd.read_csv("data/white_data.csv")
train_index = data.sample(frac=0.8).sort_index().index
test_index = ~data.index.isin(train_index)

In [342]:
X, y = data.ix[:,:-1], data["quality"]

In [343]:
if True:
    old_features = [name for name in X.columns if not 'type' in name]
    for combination in combinations_with_replacement(old_features, degree):
        temp = reduce(lambda x, y: x*y, map(lambda x: X[x], combination))
        X.insert(X.shape[1]-1, '*'.join(combination), temp)       
if True:
    X = (X - X.mean())/X.std()
if True:
    pls = PLSRegression(n_components=n_components, scale=scale)
    pls.fit(X, y)
    X = pd.DataFrame(pls.transform(X))

In [344]:
clf = LogisticRegression(penalty=penalty, 
                   C=C,
                   class_weight=class_weight,
                   max_iter=max_iter,
                   solver=solver,
                   multi_class=multi_class)
clf.fit(X.loc[train_index], y.loc[train_index])
y_pred = clf.predict(X.loc[test_index])
print(mean_absolute_error(y.loc[test_index], y_pred))
print(accuracy_score(y.loc[test_index], y_pred))
confusion_matrix(y.loc[test_index], y_pred)

0.442857142857
0.587755102041


array([[  0,   2,   1,   0,   0,   0,   0],
       [  1,   6,  24,   6,   0,   0,   0],
       [  1,   3, 184,  97,   2,   0,   0],
       [  0,   1,  79, 324,  41,   0,   0],
       [  0,   0,   1, 109,  61,   0,   0],
       [  0,   0,   1,  12,  20,   1,   0],
       [  0,   0,   0,   1,   2,   0,   0]])

## All Wines

In [345]:
# parameters for feature generation
degree = 4
n_components = 30
scale = False
# parameters for training
penalty = 'l2'               # 'l1' or 'l2'
C = .1                      # any C>=0
class_weight = None          # 'balanced' or 'None'
max_iter = 10000              # default is 100
solver = 'sag'               # 'newton-cg', 'lbfgs', 'sag' or 'liblinear'
multi_class = 'multinomial'  # 'multinomial' or 'ovr'

In [346]:
data = pd.read_csv("data/wine_normal.csv")
#data = pd.read_csv("data/wine_data.csv")
train_index = data.sample(frac=0.8).sort_index().index
test_index = ~data.index.isin(train_index)

In [347]:
X, y = data.ix[:,:-1], data["quality"]

In [348]:
if True:
    old_features = [name for name in X.columns if not 'type' in name]
    for combination in combinations_with_replacement(old_features, degree):
        temp = reduce(lambda x, y: x*y, map(lambda x: X[x], combination))
        X.insert(X.shape[1]-1, '*'.join(combination), temp)       
if True:
    X = (X - X.mean())/X.std()
if True:
    pls = PLSRegression(n_components=n_components, scale=scale)
    pls.fit(X, y)
    X = pd.DataFrame(pls.transform(X))

In [349]:
clf = LogisticRegression(penalty=penalty, 
                   C=C,
                   class_weight=class_weight,
                   max_iter=max_iter,
                   solver=solver,
                   multi_class=multi_class)
clf.fit(X.loc[train_index], y.loc[train_index])
y_pred = clf.predict(X.loc[test_index])
print(mean_absolute_error(y.loc[test_index], y_pred))
print(accuracy_score(y.loc[test_index], y_pred))
confusion_matrix(y.loc[test_index], y_pred)

0.411855273287
0.608160123172


array([[  1,   3,   1,   1,   0,   0,   0],
       [  1,   7,  32,   7,   0,   0,   0],
       [  0,   5, 296, 122,   3,   0,   0],
       [  0,   0, 126, 413,  37,   1,   0],
       [  0,   0,   5, 134,  71,   1,   0],
       [  0,   0,   0,   7,  22,   2,   0],
       [  0,   0,   0,   0,   0,   1,   0]])