In [1]:
import pandas as pd
import numpy as np
from __future__ import print_function
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.tree import export_graphviz

In [2]:
# columns collected from 14 datasets (D3M seeds, UCI)
original = pd.read_csv("manually_labeled_dataset.csv")

# columns collected from 8 datasets (openML, labels from ARFF)
external = pd.read_csv("testset.csv")

# concat the two datasets
data = pd.concat([original, external],ignore_index=True)

# Row filter:
#   * drop 'RIVER', the case where an ID was marked as categorical
#   * drop the rows labeled 'binary' or 'empty'
keep_rows = (
    (data['col_name'] != 'RIVER')
    & (data['type'] != 'binary')
    & (data['type'] != 'empty')
)
# .copy() makes `data` an independent frame, so the assignments below do not
# write into a filtered selection (the pattern behind SettingWithCopyWarning).
data = data[keep_rows].copy()

# Binary target: True for 'categorical', False for every other label.
data['type'] = data['type'] == 'categorical'
# dtype to int (any other dtype string is left unchanged)
data.loc[data['dtype'] == 'int64' , 'dtype'] = 0
data.loc[data['dtype'] == 'float64' , 'dtype'] = 1
data.loc[data['dtype'] == 'object' , 'dtype'] = 2
# 95%in10 to int
data['95%in10'] = data['95%in10'].astype(int)

# Class balance of the target, shown as the cell output.
data['type'].value_counts()

False    315
True      71
Name: type, dtype: int64

In [3]:
# Force the label of the 'river' and 'IDENTIF' columns to "not categorical".
override_mask = data['col_name'].isin(['river', 'IDENTIF'])
data.loc[override_mask, 'type'] = False

In [33]:
# rules
def cmp(dtype,nunique,nunique_ratio,dropMax,H,M):
    """Rule-based guess of whether a column is categorical.

    dtype is the integer code used in this notebook: 1 is always rejected,
    2 is judged on nunique / nunique_ratio only, and every other value goes
    through the nunique / dropMax / H+M rules.

    Returns True when the rules classify the column as categorical,
    False otherwise.
    """
    # dtype code 1: never categorical.
    if dtype == 1:
        return False

    # dtype code 2: few distinct values, both absolutely and relatively.
    if dtype == 2:
        return bool(nunique < 50 and nunique_ratio < 0.7)

    # Any other dtype code: more than 16 distinct values rules it out.
    if nunique > 16:
        return False

    # Otherwise any one of these conditions is enough (checked in this order).
    return bool(dropMax <= 0.05 or H + M < 1 or nunique <= 10)

In [6]:
def _rule_prediction(row):
    """Feed one row's feature values to the rule function cmp()."""
    return cmp(
        row['dtype'],
        row['nunique'],
        row['nunique_ratio'],
        row['dropMax'],
        row['H'],
        row['M'],
    )

# One rule-based verdict per row, stored in the 'pred' column.
data['pred'] = data.apply(_rule_prediction, axis=1)

In [7]:
# Rows the rules predicted as not categorical ...
a = data.loc[data['pred'].eq(False)]
# ... that nevertheless have at most 10 distinct values ...
b = a.loc[a['nunique'].le(10)]
# ... and whose label is also False (displayed as the cell output).
b.loc[b['type'].eq(False)]

Unnamed: 0,col_name,nunique,nunique_ratio,H,M,L,ratio_H,ratio_M,ratio_L,dropMean,dropMedian,dropMax,dropMin,dtype,95%in10,type,pred
432,diffSaTime2,3,0.000181,0,1,17,0.0,0.0556,0.9444,0.009,0.0132,0.0178,0.0,1,1,False,False
434,diffSaTime4,3,0.000181,0,1,17,0.0,0.0556,0.9444,0.0072,0.0036,0.0154,0.0,1,1,False,False


In [8]:
# F1 and precision of the rule predictions against the labels in `data`.
f1_on_data = f1_score(data['type'].astype(bool), data['pred'])
precision_on_data = precision_score(data['type'], data['pred'], average='binary')
print('f1', f1_on_data)
print('precision', precision_on_data)

f1 0.91156462585
precision 0.858974358974


In [9]:
# Flatten the 2x2 confusion matrix row by row and unpack its four cells.
flat_counts = confusion_matrix(data['type'], data['pred']).ravel()
tn, fp, fn, tp = flat_counts
print((tn, fp, fn, tp))

(306, 11, 2, 67)


In [10]:
# Full confusion matrix (rows: true label, columns: predicted label).
confusion_matrix(y_true=data['type'], y_pred=data['pred'])

array([[306,  11],
       [  2,  67]])

In [26]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists. Consider a path relative to a configurable data directory.
test = pd.read_csv('/home/rpedsel/Documents/ISI II/datasets/CAT exp/newtestset.csv')

# Drop the rows labeled 'binary' or 'empty'. .copy() makes `test` an
# independent frame, so the assignments below do not write into a filtered
# selection (the pattern behind SettingWithCopyWarning).
keep_rows = (test['type'] != 'binary') & (test['type'] != 'empty')
test = test[keep_rows].copy()

# Binary target: True for 'categorical', False for every other label.
test['type'] = test['type'] == 'categorical'
# dtype to int (any other dtype string is left unchanged)
test.loc[test['dtype'] == 'int64' , 'dtype'] = 0
test.loc[test['dtype'] == 'float64' , 'dtype'] = 1
test.loc[test['dtype'] == 'object' , 'dtype'] = 2
# 95%in10 to int
test['95%in10'] = test['95%in10'].astype(int)

# Class balance of the target, shown as the cell output.
test['type'].value_counts()

False    103
True      48
Name: type, dtype: int64

In [37]:
# Rule-based verdict for every row of `test`, stored in the 'pred' column.
# cmp() receives the six feature values in its positional order.
rule_inputs = ['dtype', 'nunique', 'nunique_ratio', 'dropMax', 'H', 'M']
test['pred'] = test.apply(lambda row: cmp(*[row[col] for col in rule_inputs]), axis=1)

In [38]:
# F1 and precision of the rule predictions against the labels in `test`.
f1_on_test = f1_score(test['type'].astype(bool), test['pred'])
precision_on_test = precision_score(test['type'], test['pred'], average='binary')
print('f1', f1_on_test)
print('precision', precision_on_test)

f1 0.897959183673
precision 0.88
