In [10]:
import numpy as np
from scipy.io import arff
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn import tree
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import sys
import matplotlib.pyplot as plt 
%matplotlib inline

# Seed NumPy's global random number generator so repeated runs draw the
# same pseudo-random sequence.
np.random.seed(1)

import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below as a silent replacement for ``warnings.warn``.
    """
    return None


# Swap the real ``warnings.warn`` for the no-op above, so any code that looks
# up ``warnings.warn`` at call time after this point emits nothing.
warnings.warn = warn

def label_enc(labels):
    """Fit a ``LabelEncoder`` on ``labels`` and return the fitted encoder.

    The caller uses the returned encoder's ``transform`` to map labels to
    integer codes.
    """
    # LabelEncoder.fit returns the encoder itself, so construction and
    # fitting can be chained into one expression.
    return preprocessing.LabelEncoder().fit(labels)

class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for a pandas DataFrame.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with its mean. The learned fill values are stored
    in ``self.fill`` (a Series indexed by column label) by ``fit`` and applied
    by ``transform``.
    """

    def fit(self, X, y=None):
        """Compute one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns are inspected.
        y : ignored
            Present only for pipeline-style call compatibility.

        Returns
        -------
        self
        """
        fill_values = []
        for col in X:
            series = X[col]
            if series.dtype == np.dtype('O'):
                # value_counts() drops missing entries and sorts by
                # frequency, so its first index label is the mode.
                counts = series.value_counts()
                # A column that is entirely missing yields an empty result;
                # fall back to NaN (i.e. leave it unfilled) instead of
                # raising IndexError on index[0].
                fill_values.append(np.nan if counts.empty else counts.index[0])
            else:
                fill_values.append(series.mean())

        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced by the fitted fill values.

        ``fit`` must have been called first so that ``self.fill`` exists.
        """
        return X.fillna(self.fill)

# Read the ARFF file from the working directory.
path = 'australian.arff'
dataset = arff.loadarff(path)

# The first element of the loaded result holds the records; wrap them in a
# DataFrame and show it.
records = dataset[0]
data = pd.DataFrame(records)
print(data)

# Column labels of the frame as a NumPy array.
attr = np.array(data.columns)
print(attr)

# NOTE(review): the imputation step that used to follow is disabled, so
# `data` stays a pandas DataFrame here rather than becoming a NumPy array:
#     data = DataFrameImputer().fit_transform(data).values

       A1     A2      A3    A4     A5    A6     A7    A8    A9   A10   A11  \
0    b'1'  22.08  11.460  b'2'   b'4'  b'4'  1.585  b'0'  b'0'   0.0  b'1'   
1    b'0'  22.67   7.000  b'2'   b'8'  b'4'  0.165  b'0'  b'0'   0.0  b'0'   
2    b'0'  29.58   1.750  b'1'   b'4'  b'4'  1.250  b'0'  b'0'   0.0  b'1'   
3    b'0'  21.67  11.500  b'1'   b'5'  b'3'  0.000  b'1'  b'1'  11.0  b'1'   
4    b'1'  20.17   8.170  b'2'   b'6'  b'4'  1.960  b'1'  b'1'  14.0  b'0'   
..    ...    ...     ...   ...    ...   ...    ...   ...   ...   ...   ...   
685  b'1'  31.57  10.500  b'2'  b'14'  b'4'  6.500  b'1'  b'0'   0.0  b'0'   
686  b'1'  20.67   0.415  b'2'   b'8'  b'4'  0.125  b'0'  b'0'   0.0  b'0'   
687  b'0'  18.83   9.540  b'2'   b'6'  b'4'  0.085  b'1'  b'0'   0.0  b'0'   
688  b'0'  27.42  14.500  b'2'  b'14'  b'8'  3.085  b'1'  b'1'   1.0  b'0'   
689  b'1'  41.00   0.040  b'2'  b'10'  b'4'  0.040  b'0'  b'1'   1.0  b'0'   

      A12    A13     A14   A15  
0    b'2'  100.0  1213.0  b'0'

In [2]:
# Collect the positions (every entry of `attr` except the last) whose
# element at index 1 is an int.
# NOTE(review): `attr` is built from the DataFrame's column labels, so
# `attr[i][1]` appears to be a single character of a column name and is never
# an int -- the printed list is empty. Confirm what this mask was meant to
# select (e.g. numeric vs. nominal columns) before relying on it.
masks = [i for i in range(len(attr) - 1) if isinstance(attr[i][1], int)]
print(masks)

[]


In [3]:
# Split the table into features (every column but the last) and the target
# (the last column).
#
# `data` is a pandas DataFrame as loaded, and NumPy-style `data[:, a:b]`
# indexing is not valid on a DataFrame. Converting with np.asarray first makes
# this cell work from a fresh kernel, and it is a no-op if `data` is already
# an ndarray.
data_values = np.asarray(data)
X = data_values[:, :-1]
labels = data_values[:, -1]

# Fit the encoder on the target column, then map the labels to integer codes.
lenc = label_enc(labels)
y = lenc.transform(labels)
print(y)

[0 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 1
 1 1 1 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1 1 0 0 0 1 0 0 0 1
 1 0 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 1 0 0
 0 0 1 0 1 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0
 0 1 1 0 0 0 1 1 1 0 1 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1 1 1
 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0
 1 0 0 0 0 1 1 1 1 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 1 1 1 1 0
 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0
 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 1 1 1 1 1 1 0 0 0 0 1 1 1
 0 0 0 1 0 1 0 0 0 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1
 1 0 0 0 1 1 1 0 0 1 1 0 0 0 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1
 0 1 0 0 0 1 1 1 0 0 1 0 1 1 1 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 1 1 0 1 0 0 1
 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 0 0 0 0 0 1 0 1
 0 0 0 1 1 1 1 1 0 1 1 0 

In [11]:
# Integers from 2 through 20 inclusive (the stop value, 21, is excluded).
np.arange(2,21)

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20])