RF:
    default clf:
        - precision = 0.9356
        - recall = 0.867
        - f1(binary) = 0.895
    grid search:
        - on precision:
            precision = 0.92      
            recall = 0.85      
            f1(binary?) = 0.88 
        - on recall:
            precision = 0.86      
            recall = 0.92      
            f1(binary?) = 0.89
        - on f1:
            precision = 0.96      
            recall = 0.92      
            f1(binary?) = 0.94
Rule-based:
    - f1(binary?) = 0.823529411765
    - precision = 0.768292682927

In [115]:
import pandas as pd
import numpy as np
from __future__ import print_function
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score

In [116]:
# Manually labeled column statistics, collected from 14 datasets (D3M seeds, UCI).
original = pd.read_csv(filepath_or_buffer="manually_labeled_dataset.csv")

In [117]:
# Column statistics from 8 further datasets (openML); labels come from the ARFF files.
external = pd.read_csv(filepath_or_buffer="testset.csv")

In [135]:
# Stack both sources row-wise into one frame and renumber the index from 0.
data = pd.concat(objs=[original, external], axis=0, ignore_index=True)

In [136]:
# Clean the combined frame and encode it for the classifier.
# The cell is written so that running it a second time leaves `data` unchanged;
# the previous version relabeled every row as False on a re-run, because the
# string comparison was applied to an already-boolean `type` column.

# Drop the 'RIVER' column (an ID that was marked as categorical) together with
# every column labeled 'binary' or 'empty'.
keep_rows = (data['col_name'] != 'RIVER') & ~data['type'].isin(['binary', 'empty'])
# .copy() makes the filtered frame independent, so the assignments below never
# write into a slice of the unfiltered frame.
data = data.loc[keep_rows].copy()

# Binary target: True for 'categorical', False for every other label.
# Skipped when the column is already boolean (i.e. the cell was run before).
if data['type'].dtype != bool:
    data['type'] = data['type'].eq('categorical')

# dtype to int: int64 -> 0, float64 -> 1, object -> 2 (other values are left as-is)
for dtype_code, dtype_name in enumerate(('int64', 'float64', 'object')):
    data.loc[data['dtype'] == dtype_name, 'dtype'] = dtype_code

# 95%in10 to int
data['95%in10'] = data['95%in10'].astype(int)

In [120]:
#data

In [137]:
# Class balance of the boolean target.
data['type'].value_counts()

False    315
True      71
Name: type, dtype: int64

In [150]:
# Predictors: every column except the label ('type') and the column's name.
features = list(data.drop(['type', 'col_name'], axis=1).columns)

# Target as 0/1 (1 = categorical). This line was commented out, which left `y`
# undefined on a fresh kernel and made the split below fail with a NameError.
y = data["type"].astype(int)
X = data[features]

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Metrics used both for cross-validation and for the grid searches.
scoring = ['precision', 'recall', 'f1']

In [151]:
# Baseline: random forest with default hyper-parameters, scored with 5-fold
# cross-validation on the training split.
clf = RandomForestClassifier(random_state=0)
# return_train_score=True is requested explicitly: the summary cell reads the
# 'train_*' entries, which cross_validate does not return by default in
# current scikit-learn releases.
crsv = cross_validate(clf, X_train, y_train, cv=5, scoring=scoring,
                      return_train_score=True)

In [152]:
# Mean cross-validation score per metric, train folds first, then test folds.
# The F1 rows were previously labeled 'f1_macro', but the scorer behind the
# 'train_f1' / 'test_f1' keys is the plain 'f1' scorer, so the label now says so.
for metric in ('f1', 'precision', 'recall'):
    for split in ('train', 'test'):
        key = '%s_%s' % (split, metric)
        print('%s: ' % key, crsv[key].mean())

train_f1_macro:  0.988732394366
test_f1_macro:  0.895029239766
train_precision:  1.0
test_precision:  0.935555555556
train_recall:  0.977777777778
test_recall:  0.866666666667


In [154]:
# Run one grid search per metric in `scoring`, print the cross-validated score
# of every parameter combination, then report the refitted best model on the
# held-out test split.
# NOTE(review): `param_grid` is not defined in any cell visible here - confirm
# it is defined before this cell when running on a fresh kernel.
for score in scoring:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # fit() returns the fitted search object itself.
    CV_clf = GridSearchCV(
        estimator=RandomForestClassifier(random_state=0),
        param_grid=param_grid,
        cv=5,
        scoring=score,
    ).fit(X_train, y_train)

    print("Best parameters set found on training set:")
    print()
    print(CV_clf.best_params_)
    print()

    print("Grid scores on training set:")
    print()
    means = CV_clf.cv_results_['mean_test_score']
    stds = CV_clf.cv_results_['std_test_score']
    # One line per parameter combination: mean score and twice the std across folds.
    for mean, std, params in zip(means, stds, CV_clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full training set.")
    print("The scores are computed on the full test set.")
    print()
    y_true = y_test
    y_pred = CV_clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on training set:

{'max_features': None, 'n_estimators': 3, 'criterion': 'entropy', 'max_depth': 2}

Grid scores on training set:

0.771 (+/-0.292) for {'max_features': 'auto', 'n_estimators': 3, 'criterion': 'gini', 'max_depth': 2}
0.788 (+/-0.469) for {'max_features': 'auto', 'n_estimators': 10, 'criterion': 'gini', 'max_depth': 2}
0.782 (+/-0.487) for {'max_features': 'auto', 'n_estimators': 50, 'criterion': 'gini', 'max_depth': 2}
0.801 (+/-0.418) for {'max_features': 'auto', 'n_estimators': 100, 'criterion': 'gini', 'max_depth': 2}
0.860 (+/-0.277) for {'max_features': 'auto', 'n_estimators': 200, 'criterion': 'gini', 'max_depth': 2}
0.771 (+/-0.292) for {'max_features': 'log2', 'n_estimators': 3, 'criterion': 'gini', 'max_depth': 2}
0.788 (+/-0.469) for {'max_features': 'log2', 'n_estimators': 10, 'criterion': 'gini', 'max_depth': 2}
0.782 (+/-0.487) for {'max_features': 'log2', 'n_estimators': 50, 'criterion': 'g

Best parameters set found on training set:

{'max_features': 'auto', 'n_estimators': 3, 'criterion': 'gini', 'max_depth': 4}

Grid scores on training set:

0.822 (+/-0.606) for {'max_features': 'auto', 'n_estimators': 3, 'criterion': 'gini', 'max_depth': 2}
0.800 (+/-0.586) for {'max_features': 'auto', 'n_estimators': 10, 'criterion': 'gini', 'max_depth': 2}
0.733 (+/-0.537) for {'max_features': 'auto', 'n_estimators': 50, 'criterion': 'gini', 'max_depth': 2}
0.756 (+/-0.453) for {'max_features': 'auto', 'n_estimators': 100, 'criterion': 'gini', 'max_depth': 2}
0.800 (+/-0.356) for {'max_features': 'auto', 'n_estimators': 200, 'criterion': 'gini', 'max_depth': 2}
0.822 (+/-0.606) for {'max_features': 'log2', 'n_estimators': 3, 'criterion': 'gini', 'max_depth': 2}
0.800 (+/-0.586) for {'max_features': 'log2', 'n_estimators': 10, 'criterion': 'gini', 'max_depth': 2}
0.733 (+/-0.537) for {'max_features': 'log2', 'n_estimators': 50, 'criterion': 'gini', 'max_depth': 2}
0.756 (+/-0.453) for

Best parameters set found on training set:

{'max_features': 'auto', 'n_estimators': 200, 'criterion': 'gini', 'max_depth': 4}

Grid scores on training set:

0.780 (+/-0.476) for {'max_features': 'auto', 'n_estimators': 3, 'criterion': 'gini', 'max_depth': 2}
0.789 (+/-0.524) for {'max_features': 'auto', 'n_estimators': 10, 'criterion': 'gini', 'max_depth': 2}
0.752 (+/-0.507) for {'max_features': 'auto', 'n_estimators': 50, 'criterion': 'gini', 'max_depth': 2}
0.774 (+/-0.424) for {'max_features': 'auto', 'n_estimators': 100, 'criterion': 'gini', 'max_depth': 2}
0.823 (+/-0.283) for {'max_features': 'auto', 'n_estimators': 200, 'criterion': 'gini', 'max_depth': 2}
0.780 (+/-0.476) for {'max_features': 'log2', 'n_estimators': 3, 'criterion': 'gini', 'max_depth': 2}
0.789 (+/-0.524) for {'max_features': 'log2', 'n_estimators': 10, 'criterion': 'gini', 'max_depth': 2}
0.752 (+/-0.507) for {'max_features': 'log2', 'n_estimators': 50, 'criterion': 'gini', 'max_depth': 2}
0.774 (+/-0.424) f

In [126]:
#X_train

In [127]:
#X_test

In [153]:
## 95%in10 rule
# Use the 95%in10 flag directly as the prediction and score it against the labels.
# The column is converted to bool in place on `data`.
data['95%in10'] = data['95%in10'].astype(bool)
print('f1', f1_score(y_true=data['type'].astype(bool), y_pred=data['95%in10']))
print('precision', precision_score(y_true=data['type'].astype(bool),
                                   y_pred=data['95%in10'],
                                   average='binary'))

f1 0.823529411765
precision 0.768292682927


In [129]:
# Confusion-matrix counts for the 95%in10 rule against the labels.
tn, fp, fn, tp = confusion_matrix(y_true=data['type'].astype(bool),
                                  y_pred=data['95%in10']).ravel()
print((tn, fp, fn, tp))
# False negatives: labeled categorical, but the rule did not flag them.
data.loc[data['type'] & data['95%in10'].eq(False)]

(296, 19, 8, 63)


Unnamed: 0,col_name,nunique,nunique_ratio,H,M,L,ratio_H,ratio_M,ratio_L,dropMean,dropMedian,dropMax,dropMin,dtype,95%in10,type
2,education-num,16,0.000491,0,0,5,0.0,0.0,1.0,0.0695,0.0555,0.1596,0.0,0,False,True
4,education,16,0.000491,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,False,True
6,occupation,15,0.000461,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,False,True
71,model,13,0.044369,0,3,4,0.0,0.4286,0.5714,0.2162,0.2524,0.3844,0.0,0,False,True
328,A5,14,0.02029,0,1,8,0.0,0.1111,0.8889,0.11,0.0519,0.3225,0.0,0,False,True
357,IDENTIF,105,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,False,True
399,river,42,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,False,True
400,country,26,0.619048,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,False,True


In [130]:
# False positives: flagged by the rule, but not labeled categorical.
data.loc[data['type'].eq(False) & data['95%in10']]

Unnamed: 0,col_name,nunique,nunique_ratio,H,M,L,ratio_H,ratio_M,ratio_L,dropMean,dropMedian,dropMax,dropMin,dtype,95%in10,type
12,capital-gain,119,0.003655,0,0,5,0.0,0.0,1.0,0.0637,0.0725,0.118,0.0,0,True,False
13,capital-loss,92,0.002825,0,0,5,0.0,0.0,1.0,0.0407,0.0534,0.0677,0.0,0,True,False
37,UniqueHotkeys,11,0.003295,0,6,13,0.0,0.3158,0.6842,0.1884,0.1661,0.3894,0.0,0,True,False
39,UniqueUnitsMade,12,0.003595,2,2,15,0.1053,0.1053,0.7895,0.1654,0.125,0.5538,0.0,0,True,False
307,bars,5,0.025773,0,1,11,0.0,0.0833,0.9167,0.0737,0.0677,0.1732,-0.0106,0,True,False
308,stripes,12,0.061856,0,1,11,0.0,0.0833,0.9167,0.0802,0.0829,0.2228,-0.0187,0,True,False
309,colours,8,0.041237,0,0,12,0.0,0.0,1.0,0.1057,0.0878,0.2582,-0.0079,0,True,False
310,circles,4,0.020619,0,0,12,0.0,0.0,1.0,0.0198,0.003,0.0678,-0.0,0,True,False
311,crosses,3,0.015464,1,0,11,0.0833,0.0,0.9167,0.0501,0.0481,0.1503,0.0,0,True,False
312,quarters,3,0.015464,1,0,11,0.0833,0.0,0.9167,0.0158,0.0109,0.0428,0.0,0,True,False
