In [1]:
#statistical
import pandas as pd
import numpy as np

#dataset
from sklearn.datasets import load_iris

#machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

#graphing and display
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

#other
from collections import deque
from operator import add, mul, sub, truediv

In [2]:
# Load iris as a (features, target) tuple rather than a Bunch object.
dataset = load_iris(return_X_y=True)

In [3]:
# Wrap the feature matrix in a DataFrame with generic positional column names
# (col_0, col_1, ...); the target vector is kept as returned by the loader.
feature_matrix = dataset[0]
column_labels = ['col_' + str(position) for position in range(feature_matrix.shape[1])]
X = pd.DataFrame(feature_matrix, columns=column_labels)
y = dataset[1]

In [4]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,col_0,col_1,col_2,col_3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
baseline_split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = baseline_split

In [6]:
# Fit a default-parameter logistic regression on all original features.
# `fit` returns the estimator itself, so it can be chained into the assignment.
logistic_clf = LogisticRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so its repr is shown.
logistic_clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# Hard class predictions (for accuracy / precision / recall / F1) and
# per-class probabilities (for log loss) on the held-out rows.
y_pred, y_pred_proba = (
    logistic_clf.predict(X_test),
    logistic_clf.predict_proba(X_test),
)

In [8]:
# Report test-set metrics for the model fitted on the original features.
# Precision, recall and F1 use the support-weighted average across classes.
weighted = {'average': 'weighted'}
label_metrics = [
    ('Accuracy: {:.2f}', accuracy_score, {}),
    ('Precision: {:.2f}', precision_score, weighted),
    ('Recall: {:.2f}', recall_score, weighted),
    ('F1 score: {:.2f}', f1_score, weighted),
]
for template, scorer, options in label_metrics:
    print(template.format(scorer(y_test, y_pred, **options)))
# Log loss is computed from the predicted probabilities, not the hard labels.
print('Log loss: {:.2f}'.format(log_loss(y_test, y_pred_proba)))

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 score: 0.98
Log loss: 0.30


In [9]:
# Baseline: how well does each original feature predict the class on its own?
#
# A fresh classifier and loop-local names are used for every column so that the
# full-feature model (`logistic_clf`) and its predictions (`y_pred`,
# `y_pred_proba`) are not overwritten by the last single-column fit.
for c in X_train.columns:
    # scikit-learn estimators expect 2-D input, so each single column is
    # reshaped to (n_samples, 1) once and reused for predict / predict_proba.
    col_train = X_train[c].values.reshape(-1, 1)
    col_test = X_test[c].values.reshape(-1, 1)

    col_clf = LogisticRegression().fit(col_train, y_train)
    col_pred = col_clf.predict(col_test)
    col_pred_proba = col_clf.predict_proba(col_test)

    print('metrics for %s: \n========================' % c)
    print('Accuracy: {:.2f}'.format(accuracy_score(y_test, col_pred)))
    print('Precision: {:.2f}'.format(precision_score(y_test, col_pred, average = 'weighted')))
    print('Recall: {:.2f}'.format(recall_score(y_test, col_pred, average = 'weighted')))
    print('F1 score: {:.2f}'.format(f1_score(y_test, col_pred, average = 'weighted')))
    print('Log loss: {:.2f}'.format(log_loss(y_test, col_pred_proba)))
    print('========================')

metrics for col_0: 
Accuracy: 0.56
Precision: 0.63
Recall: 0.56
F1 score: 0.54
Log loss: 0.95
metrics for col_1: 
Accuracy: 0.56
Precision: 0.54
Recall: 0.56
F1 score: 0.55
Log loss: 1.00
metrics for col_2: 
Accuracy: 0.78
Precision: 0.87
Recall: 0.78
F1 score: 0.74
Log loss: 0.54
metrics for col_3: 
Accuracy: 0.82
Precision: 0.89
Recall: 0.82
F1 score: 0.80
Log loss: 0.53


In [10]:
# From all pairwise column combinations, pick the strongest ones.
# Try adding each combined column to the existing dataset: if the model becomes stronger, keep it; otherwise, remove it.

In [11]:
# Search every ordered pair of distinct original columns, combined with each
# arithmetic operator, and score the resulting single derived feature by
# fitting a logistic regression on it alone.
#
# NOTE: a candidate is stored only when it beats the best F1 seen so far, and
# the deque keeps just the most recent `best_n` of those. `best_cols` is
# therefore "the last `best_n` record-breakers in discovery order", not the
# top-`best_n` candidates by score.
# NOTE(review): candidates are ranked by F1 on the held-out test split, so any
# later evaluation on that same split is optimistically biased. Selecting on a
# validation split (or cross-validation) of the training rows would avoid this.
best_n = 5
best_cols = deque(maxlen = best_n)

columns = X_train.columns
best_f1 = 0
operators = {'add' : add, 'mul' : mul, 'sub' : sub, 'div' : truediv}

for c1 in columns:
    for c2 in columns:
        if c1 == c2:
            continue
        for o_n, o in operators.items():
            colname = '%s_%s_%s' % (c1, o_n, c2)
            res = pd.DataFrame(o(X[c1], X[c2]), columns = [colname])

            # Division can produce inf/NaN when the denominator contains
            # zeros; the estimator rejects non-finite input, so skip those.
            if not np.isfinite(res.values).all():
                continue

            # Same test_size / random_state as the main split, so the candidate
            # is scored on the same rows. Loop-local label names avoid
            # rebinding the notebook-level y_train / y_test.
            res_train, res_test, res_y_train, res_y_test = train_test_split(
                res, y, test_size = 0.3, random_state = 42)
            clf = LogisticRegression().fit(res_train, res_y_train)
            res_pred = clf.predict(res_test)
            current_f1 = f1_score(res_y_test, res_pred, average = 'weighted')

            if current_f1 > best_f1:
                best_f1 = current_f1
                best_cols.append(res)

  'precision', 'predicted', average, warn_for)


In [12]:
# Show the first two rows of every retained derived column.
for derived in best_cols:
    preview = derived.head(2)
    display(preview)

Unnamed: 0,col_0_sub_col_1
0,1.6
1,1.9


Unnamed: 0,col_0_mul_col_2
0,7.14
1,6.86


Unnamed: 0,col_0_sub_col_2
0,3.7
1,3.5


Unnamed: 0,col_0_div_col_2
0,3.642857
1,3.5


Unnamed: 0,col_0_div_col_3
0,25.5
1,24.5


In [13]:
# Append the retained derived columns to the feature frame (index-aligned join).
for c in best_cols:
    # DataFrame.join raises on overlapping column names, so skip anything that
    # is already present; this keeps the cell safe to re-run.
    if c.columns.isin(X.columns).any():
        continue
    X = X.join(c)

In [14]:
# Preview the feature frame now that the derived columns have been appended.
X.head(n=5)

Unnamed: 0,col_0,col_1,col_2,col_3,col_0_sub_col_1,col_0_mul_col_2,col_0_sub_col_2,col_0_div_col_2,col_0_div_col_3
0,5.1,3.5,1.4,0.2,1.6,7.14,3.7,3.642857,25.5
1,4.9,3.0,1.4,0.2,1.9,6.86,3.5,3.5,24.5
2,4.7,3.2,1.3,0.2,1.5,6.11,3.4,3.615385,23.5
3,4.6,3.1,1.5,0.2,1.5,6.9,3.1,3.066667,23.0
4,5.0,3.6,1.4,0.2,1.4,7.0,3.6,3.571429,25.0


In [15]:
# Re-split with the same test_size / seed as before (now including the derived
# columns) and refit a default logistic regression on the augmented features.
augmented_split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = augmented_split
logistic_clf = LogisticRegression().fit(X_train, y_train)
# `fit` returns the estimator; keep it as the last expression so its repr is shown.
logistic_clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
# Class labels and class probabilities from the augmented-feature model.
y_pred, y_pred_proba = (
    logistic_clf.predict(X_test),
    logistic_clf.predict_proba(X_test),
)

In [17]:
# Report test-set metrics for the model fitted on the augmented features.
# Precision, recall and F1 use the support-weighted average across classes.
weighted = {'average': 'weighted'}
label_metrics = [
    ('Accuracy: {:.2f}', accuracy_score, {}),
    ('Precision: {:.2f}', precision_score, weighted),
    ('Recall: {:.2f}', recall_score, weighted),
    ('F1 score: {:.2f}', f1_score, weighted),
]
for template, scorer, options in label_metrics:
    print(template.format(scorer(y_test, y_pred, **options)))
# Log loss is computed from the predicted probabilities, not the hard labels.
print('Log loss: {:.2f}'.format(log_loss(y_test, y_pred_proba)))

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 score: 1.00
Log loss: 0.11


In [18]:
# Repeat the pairwise feature search directly on the NumPy feature matrix.
#
# The running best score and the result buffer are reset first: without this,
# `best_f1` still holds the maximum from the DataFrame-based search, no
# candidate can beat it, and the loop silently appends nothing.
# NOTE(review): as in the DataFrame version, candidates are ranked on the
# held-out test split, which biases later evaluation on that split.
best_f1 = 0
best_cols = deque(maxlen = best_n)
n_features = dataset[0].shape[1]

for c1 in range(n_features):
    for c2 in range(n_features):
        # Compare column indices by value; `is not` tests object identity and
        # only happens to work for small cached ints.
        if c1 == c2:
            continue
        for o in operators.values():
            # Indexing with a list keeps the columns 2-D: shape (n_samples, 1).
            res = o(dataset[0][:,[c1]], dataset[0][:,[c2]])

            # Division can produce inf/NaN when the denominator contains
            # zeros; the estimator rejects non-finite input, so skip those.
            if not np.isfinite(res).all():
                continue

            res_train, res_test, res_y_train, res_y_test = train_test_split(
                res, y, test_size = 0.3, random_state = 42)
            clf = LogisticRegression().fit(res_train, res_y_train)
            res_pred = clf.predict(res_test)
            current_f1 = f1_score(res_y_test, res_pred, average = 'weighted')

            if current_f1 > best_f1:
                best_f1 = current_f1
                best_cols.append(res)

  'precision', 'predicted', average, warn_for)


In [19]:
# Build the augmented NumPy matrix: the original features followed by every
# retained derived column, stacked side by side along the column axis.
derived_blocks = [np.asarray(c) for c in best_cols]
new_data = np.concatenate([dataset[0]] + derived_blocks, axis=1)

In [20]:
# Print the first two rows of the augmented matrix.
print(new_data[:2])

[[  5.1          3.5          1.4          0.2          1.6          7.14
    3.7          3.64285714  25.5       ]
 [  4.9          3.           1.4          0.2          1.9          6.86
    3.5          3.5         24.5       ]]


In [21]:
# Evaluate the NumPy-built feature matrix. The split must be taken from
# `new_data`; splitting `X` here would just repeat the DataFrame experiment
# and leave the matrix built above untested.
X_train, X_test, y_train, y_test = train_test_split(new_data, y, test_size = 0.3, random_state = 42)
logistic_clf = LogisticRegression()
# `fit` returns the estimator, so its repr is shown as the cell output.
logistic_clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# Class labels and class probabilities from the most recently fitted model.
y_pred, y_pred_proba = (
    logistic_clf.predict(X_test),
    logistic_clf.predict_proba(X_test),
)

In [23]:
# Report test-set metrics for the most recently fitted model.
# Precision, recall and F1 use the support-weighted average across classes.
weighted = {'average': 'weighted'}
label_metrics = [
    ('Accuracy: {:.2f}', accuracy_score, {}),
    ('Precision: {:.2f}', precision_score, weighted),
    ('Recall: {:.2f}', recall_score, weighted),
    ('F1 score: {:.2f}', f1_score, weighted),
]
for template, scorer, options in label_metrics:
    print(template.format(scorer(y_test, y_pred, **options)))
# Log loss is computed from the predicted probabilities, not the hard labels.
print('Log loss: {:.2f}'.format(log_loss(y_test, y_pred_proba)))

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 score: 1.00
Log loss: 0.11
