In [13]:
#statistical
import pandas as pd
import numpy as np

#dataset
from sklearn.datasets import load_iris

#mchine learning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score, r2_score, accuracy_score

#graphing and display
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

#feature engineer
import feature_engineer as fe

In [14]:
def display_metrics(y_test, y_pred):
    print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
    print('Precision: {:.2f}'.format(precision_score(y_test, y_pred, average = 'weighted')))
    print('Recall: {:.2f}'.format(recall_score(y_test, y_pred, average = 'weighted')))
    print('F1 score: {:.2f}'.format(f1_score(y_test, y_pred, average = 'weighted')))

def train_and_predict(model, X_train, X_test, y_train):
    logistic_clf.fit(X_train, y_train)
    y_pred = logistic_clf.predict(X_test)
    return y_pred

In [15]:
iris = load_iris(return_X_y = True)
iris_x = pd.DataFrame(iris[0], columns = ['col_' + str(i) for i in range(iris[0].shape[1])])
iris_y = iris[1]

In [16]:
random_seed = 42
test_size = 0.3
splitter_kwargs = {'test_size' : 0.3, 'random_state' : 42}
scorer_kwargs = {'average' : 'weighted'}

In [17]:
logistic_clf = LogisticRegression()

In [18]:
X_train, X_test, y_train, y_test = train_test_split(iris_x, iris_y, test_size = 0.3, random_state = 42)

In [19]:
y_pred = train_and_predict(logistic_clf, X_train, X_test, y_train)
display_metrics(y_test, y_pred)

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 score: 0.98


In [20]:
eng = fe.FeatureEngineer(logistic_clf, f1_score, train_test_split, n_best = 15,
                         scorer_kwargs = scorer_kwargs, splitter_kwargs = splitter_kwargs)
eng.fit(X_train, y_train)

  'precision', 'predicted', average, warn_for)


In [21]:
for i, s in enumerate(sorted(eng.best_columns, reverse = True)):
    print(i, s, eng.best_columns[s]['column_name'])

0 0.937215909091 col_1_truediv_col_3
1 0.90625 col_3_sub_col_1
2 0.874431818182 col_0_truediv_col_3
3 0.871527777778 col_2_sub_col_0
4 0.836397058824 col_3_mul_col_2
5 0.758564814815 col_3_mul_col_0
6 0.716635338346 col_2_truediv_col_3
7 0.714285714286 col_3_add_col_2
8 0.689583333333 col_3_mul_col_1
9 0.642857142857 col_2_mul_col_1
10 0.616558908046 col_3_add_col_0
11 0.609375 col_3_truediv_col_1
12 0.537137681159 col_3_sub_col_2
13 0.522693452381 col_0_truediv_col_1
14 0.498768939394 col_1_truediv_col_0


In [22]:
for df in (X_train, X_test):
    df = eng.transform(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[trans['column_name']] = trans['transformation_function'](df[trans['x1']], df[trans['x2']])


In [23]:
X_test.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_3_add_col_2,col_0_truediv_col_1,col_3_truediv_col_1,col_2_sub_col_0,col_3_add_col_0,col_3_mul_col_0,col_0_truediv_col_3,col_1_truediv_col_0,col_2_mul_col_1,col_3_mul_col_2,col_3_mul_col_1,col_3_sub_col_1,col_1_truediv_col_3,col_3_sub_col_2,col_2_truediv_col_3
73,6.1,2.8,4.7,1.2,5.9,2.178571,0.428571,-1.4,7.3,7.32,5.083333,0.459016,13.16,5.64,3.36,-1.6,2.333333,-3.5,3.916667
18,5.7,3.8,1.7,0.3,2.0,1.5,0.078947,-4.0,6.0,1.71,19.0,0.666667,6.46,0.51,1.14,-3.5,12.666667,-1.4,5.666667
118,7.7,2.6,6.9,2.3,9.2,2.961538,0.884615,-0.8,10.0,17.71,3.347826,0.337662,17.94,15.87,5.98,-0.3,1.130435,-4.6,3.0
78,6.0,2.9,4.5,1.5,6.0,2.068966,0.517241,-1.5,7.5,9.0,4.0,0.483333,13.05,6.75,4.35,-1.4,1.933333,-3.0,3.0
76,6.8,2.8,4.8,1.4,6.2,2.428571,0.5,-2.0,8.2,9.52,4.857143,0.411765,13.44,6.72,3.92,-1.4,2.0,-3.4,3.428571


In [24]:
y_pred = train_and_predict(logistic_clf, X_train, X_test, y_train)
display_metrics(y_test, y_pred)

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 score: 1.00
