In [1]:
%matplotlib inline
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
from ruruki.graphs import Graph

In [3]:
#Attribute Information:
#1. Number of times pregnant 
#2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
#3. Diastolic blood pressure (mm Hg) 
#4. Triceps skin fold thickness (mm) 
#5. 2-Hour serum insulin (mu U/ml) 
#6. Body mass index (weight in kg/(height in m)^2) 
#7. Diabetes pedigree function 
#8. Age (years) 
#9. Class variable (0 or 1) 

In [4]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
diabetes = pd.read_csv("pima-indians-diabetes.data", names=names)
diabetes.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
diabetes.columns

Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'], dtype='object')

In [6]:
diabetes.groupby(['class'])['class'].count()

class
0    500
1    268
Name: class, dtype: int64

In [7]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
test     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
from scipy.stats import zscore

X = diabetes[['plas','mass']]
X = diabetes[['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi','age']].apply(zscore)

In [9]:
X

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
5,0.342981,-0.153185,0.253036,-1.288212,-0.692891,-0.811341,-0.818079,-0.275760
6,-0.250952,-1.342476,-0.987710,0.719086,0.071204,-0.125977,-0.676133,-0.616111
7,1.827813,-0.184482,-3.572597,-1.288212,-0.692891,0.419775,-1.020427,-0.360847
8,-0.547919,2.381884,0.046245,1.534551,4.021922,-0.189437,-0.947944,1.681259
9,1.233880,0.128489,1.390387,-1.288212,-0.692891,-4.060474,-0.724455,1.766346


In [10]:
y = diabetes['class']

In [11]:
y

0      1
1      0
2      1
3      0
4      1
5      0
6      1
7      0
8      1
9      1
10     0
11     1
12     0
13     1
14     1
15     1
16     1
17     1
18     0
19     1
20     0
21     0
22     1
23     1
24     1
25     1
26     1
27     0
28     0
29     0
      ..
738    0
739    1
740    1
741    0
742    0
743    1
744    0
745    0
746    1
747    0
748    1
749    1
750    1
751    0
752    0
753    1
754    1
755    1
756    0
757    1
758    0
759    1
760    0
761    1
762    0
763    0
764    0
765    0
766    1
767    0
Name: class, Length: 768, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [13]:
import numpy as np
X_train = np.array(X_train)

In [14]:
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [15]:
min = X_train.min(axis=0)

In [16]:
min[0]

-1.1418515161634994

In [17]:
max = X_train.max(axis=0)

In [18]:
max[0]

3.906578350084603

In [19]:
cols = X_train.shape[-1]

In [20]:
cols

8

In [21]:
## this a python function to create the buckets given 
## min, max and num of buckets

def get_buckets(min, max, num_buckets):
    buckets = []
    bucket_size= (max-min)/num_buckets
    for i in range(1,num_buckets):
        buckets.append((min+(i-1)*bucket_size))
    buckets.append(max)    
    return buckets  

In [22]:
bins = []
for i in range (0,cols):
    bins.append(get_buckets(X_train.min(axis=0)[i],X_train.max(axis=0)[i],1000))
#bins   

In [23]:
bins[0][0]

-1.1418515161634994

In [24]:
def find_bin(bins,value):
    first_bin = bins[0]
    for i in range (1,len(bins)):
        if (value >= first_bin and value <= bins[i]):
            return i
        else:
            first_bin = bins[i]
    return -1       

In [25]:
find_bin(bins[0],0)

227

In [26]:
graph = Graph()

In [27]:
rows = X_train.shape[0]

In [28]:
rows

537

In [29]:
X_train[0][0]

-1.1418515161634994

In [30]:
cc= graph.get_or_create_vertex("mdata", wcol=-1, cbin=-1)

In [31]:
print(cc.as_dict().get("count"))

None


In [32]:
rows 

537

In [33]:
cols

8

In [34]:
response_classes = np.array(list(set(y_train)))
print(len(response_classes))

2


In [35]:

for i in range (0,rows):
    for j in range (0,cols):
        wbin=find_bin(bins[j],X_train[i][j])
        v = graph.get_or_create_vertex("data",col=j,cbin=wbin)
        out = graph.get_or_create_vertex("outc",out=y_train[i])
        #print(out)
        edge = graph.get_or_create_edge(v, "outcome", out)
        if(edge.as_dict().get('properties').get("count") == None):
            edge.set_property(count=1)
        else:
            cc = int(str(edge.as_dict().get('properties').get("count")))+1
            edge.set_property(count=cc)
                

In [36]:
graph.get_vertices(label="outc",out=1)

<EntitySet> items: 1

In [37]:
graph.get_vertices(label="data")

<EntitySet> items: 952

In [38]:
graph.get_edges()

<EntitySet> items: 1344

In [39]:
graph.get_or_create_vertex("outc",row=0,col=0,cbin=1,out="yes")

<Vertex> ident: 955, label: outc, properties: {'row': 0, 'col': 0, 'cbin': 1, 'out': 'yes'}

In [40]:
graph.get_or_create_vertex("outc",row=0,col=0,cbin=1,out="yes")

<Vertex> ident: 955, label: outc, properties: {'row': 0, 'col': 0, 'cbin': 1, 'out': 'yes'}

In [41]:
#graph.get_or_create_vertex("outc",col=0,cbin=1,out="yes")

In [42]:
result = graph.get_vertices("data",col=0)

In [43]:
#print(result.all())

In [44]:
result

<EntitySet> items: 17

In [45]:
result = graph.get_edges(label="outcome",count__gt=1)
#print(result.all())

In [46]:
result = graph.get_edges(label="outcome")
result

<EntitySet> items: 1344

In [47]:
def get_index(y, response_classes):
    for i in range (0,len(response_classes)):
        if(y == response_classes[i]):
            return i

In [48]:
get_index(0,response_classes)

0

In [49]:
get_index(1,response_classes)

1

In [50]:
import random 

print(response_classes)
    
y_pred =[]
for i in range (0,X_test.shape[0]):
    count= np.zeros(len(response_classes))
    totv = []
    for j in range (0,X_test.shape[-1]):
        wbin=find_bin(bins[j],X_test[i][j])
        for v in graph.get_vertices("data",col=j,cbin=wbin):
            totv.append(v)
    for r in totv:
        mv = r
        print(mv.as_dict().get('properties'))
        for m in mv.get_out_edges():
                #print(m.as_dict().get('properties'))
                #print(m.get_out_vertex().as_dict())
                indx = int(str(m.get_out_vertex().as_dict().get('properties').get("out")))
                count[indx] = count[indx]+int(str(m.as_dict().get('properties').get("count")))
                #print("index"+str(indx)+" --"+str(count[indx]))
    print(count)
    for ii in range (0,len(count)):
        if(count[ii] == count.max()):
                print(response_classes[ii])
                y_pred.append(response_classes[ii])
                break
    

[0 1]
{'col': 0, 'cbin': 59}
{'col': 2, 'cbin': 623}
{'col': 3, 'cbin': 435}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 640}
{'col': 7, 'cbin': 17}
[288. 138.]
0
{'col': 0, 'cbin': 118}
{'col': 1, 'cbin': 541}
{'col': 2, 'cbin': 607}
{'col': 3, 'cbin': 304}
{'col': 4, 'cbin': 119}
{'col': 5, 'cbin': 501}
{'col': 6, 'cbin': 137}
{'col': 7, 'cbin': 34}
[129.  47.]
0
{'col': 0, 'cbin': 236}
{'col': 2, 'cbin': 509}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 507}
{'col': 7, 'cbin': 67}
[318. 189.]
0
{'col': 0, 'cbin': 295}
{'col': 1, 'cbin': 839}
{'col': 2, 'cbin': 591}
{'col': 3, 'cbin': 192}
{'col': 4, 'cbin': 207}
{'col': 5, 'cbin': 385}
{'col': 6, 'cbin': 216}
{'col': 7, 'cbin': 500}
[58. 34.]
0
{'col': 0, 'cbin': 1}
{'col': 1, 'cbin': 561}
{'col': 2, 'cbin': 533}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 367}
{'col': 7, 'cbin': 167}
[322. 197.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 410}
{'col': 2, 'cbin': 591}
{'col': 3, 'cbin': 182}
{'col':

{'col': 0, 'cbin': 118}
{'col': 1, 'cbin': 995}
{'col': 2, 'cbin': 574}
{'col': 3, 'cbin': 455}
{'col': 5, 'cbin': 455}
{'col': 6, 'cbin': 32}
{'col': 7, 'cbin': 534}
[95. 36.]
0
{'col': 0, 'cbin': 1}
{'col': 1, 'cbin': 516}
{'col': 2, 'cbin': 615}
{'col': 3, 'cbin': 233}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 1}
{'col': 6, 'cbin': 209}
{'col': 7, 'cbin': 1}
[270. 138.]
0
{'col': 0, 'cbin': 118}
{'col': 1, 'cbin': 546}
{'col': 2, 'cbin': 509}
{'col': 3, 'cbin': 324}
{'col': 4, 'cbin': 67}
{'col': 5, 'cbin': 376}
{'col': 6, 'cbin': 19}
{'col': 7, 'cbin': 1}
[127.  41.]
0
{'col': 0, 'cbin': 177}
{'col': 1, 'cbin': 652}
{'col': 2, 'cbin': 525}
{'col': 3, 'cbin': 293}
{'col': 4, 'cbin': 136}
{'col': 5, 'cbin': 394}
{'col': 6, 'cbin': 58}
{'col': 7, 'cbin': 117}
[95. 42.]
0
{'col': 0, 'cbin': 236}
{'col': 1, 'cbin': 551}
{'col': 2, 'cbin': 525}
{'col': 3, 'cbin': 445}
{'col': 4, 'cbin': 118}
{'col': 5, 'cbin': 519}
{'col': 6, 'cbin': 352}
{'col': 7, 'cbin': 84}
[78. 33.]
0
{'col': 0, 'cbi

{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 829}
{'col': 2, 'cbin': 673}
{'col': 3, 'cbin': 435}
{'col': 4, 'cbin': 80}
{'col': 5, 'cbin': 489}
{'col': 6, 'cbin': 111}
{'col': 7, 'cbin': 484}
[88. 42.]
0
{'col': 0, 'cbin': 177}
{'col': 1, 'cbin': 561}
{'col': 2, 'cbin': 476}
{'col': 3, 'cbin': 314}
{'col': 4, 'cbin': 53}
{'col': 5, 'cbin': 440}
{'col': 6, 'cbin': 149}
{'col': 7, 'cbin': 17}
[110.  34.]
0
{'col': 0, 'cbin': 412}
{'col': 1, 'cbin': 490}
{'col': 2, 'cbin': 623}
{'col': 3, 'cbin': 324}
{'col': 5, 'cbin': 610}
{'col': 7, 'cbin': 184}
[45. 49.]
1
{'col': 0, 'cbin': 1}
{'col': 1, 'cbin': 768}
{'col': 2, 'cbin': 673}
{'col': 3, 'cbin': 394}
{'col': 5, 'cbin': 619}
{'col': 6, 'cbin': 80}
{'col': 7, 'cbin': 100}
[87. 54.]
0
{'col': 0, 'cbin': 236}
{'col': 1, 'cbin': 561}
{'col': 2, 'cbin': 591}
{'col': 3, 'cbin': 475}
{'col': 5, 'cbin': 553}
{'col': 6, 'cbin': 560}
{'col': 7, 'cbin': 584}
[55. 30.]
0
{'col': 0, 'cbin': 236}
{'col': 1, 'cbin': 581}
{'col': 2, 'cbin': 591}
{'col': 3

{'col': 0, 'cbin': 412}
{'col': 2, 'cbin': 640}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 7, 'cbin': 334}
[285. 195.]
0
{'col': 0, 'cbin': 412}
{'col': 1, 'cbin': 576}
{'col': 2, 'cbin': 541}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 489}
{'col': 6, 'cbin': 75}
{'col': 7, 'cbin': 350}
[298. 188.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 500}
{'col': 2, 'cbin': 476}
{'col': 3, 'cbin': 102}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 379}
{'col': 6, 'cbin': 200}
{'col': 7, 'cbin': 1}
[282. 126.]
0
{'col': 0, 'cbin': 295}
{'col': 1, 'cbin': 445}
{'col': 2, 'cbin': 541}
{'col': 3, 'cbin': 213}
{'col': 4, 'cbin': 28}
{'col': 5, 'cbin': 364}
{'col': 6, 'cbin': 111}
{'col': 7, 'cbin': 150}
[57. 30.]
0
{'col': 0, 'cbin': 353}
{'col': 1, 'cbin': 405}
{'col': 2, 'cbin': 541}
{'col': 3, 'cbin': 304}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 391}
{'col': 6, 'cbin': 99}
{'col': 7, 'cbin': 334}
[219. 134.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 490}
{'col': 2, 'cbi

{'col': 0, 'cbin': 530}
{'col': 1, 'cbin': 733}
{'col': 2, 'cbin': 656}
{'col': 3, 'cbin': 465}
{'col': 4, 'cbin': 154}
{'col': 5, 'cbin': 565}
{'col': 7, 'cbin': 317}
[39. 34.]
0
{'col': 0, 'cbin': 236}
{'col': 1, 'cbin': 778}
{'col': 2, 'cbin': 509}
{'col': 3, 'cbin': 314}
{'col': 5, 'cbin': 489}
{'col': 6, 'cbin': 66}
{'col': 7, 'cbin': 34}
[83. 39.]
0
{'col': 0, 'cbin': 1}
{'col': 1, 'cbin': 516}
{'col': 2, 'cbin': 427}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 375}
{'col': 7, 'cbin': 1}
[343. 193.]
0
{'col': 0, 'cbin': 471}
{'col': 1, 'cbin': 500}
{'col': 2, 'cbin': 689}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 528}
{'col': 6, 'cbin': 131}
{'col': 7, 'cbin': 484}
[284. 182.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 561}
{'col': 2, 'cbin': 509}
{'col': 3, 'cbin': 132}
{'col': 4, 'cbin': 216}
{'col': 5, 'cbin': 358}
{'col': 6, 'cbin': 24}
{'col': 7, 'cbin': 34}
[119.  43.]
0
{'col': 0, 'cbin': 1}
{'col': 1, 'cbin': 703}
{'col': 2, 'cbin'

In [51]:
random.randint(1,2)

1

In [52]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

### doing better than baseline accuracy.

0.7012987012987013

In [53]:
# calculate accuracy measures and confusion matrix
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.69      1.00      0.82       157
          1       1.00      0.07      0.13        74

avg / total       0.79      0.70      0.60       231



In [54]:
from jakgraph import gclassifier
clf = gclassifier.gclassifier(1000)
print(clf)
clf.fit(X_train,y_train)


num of bins=1000
<ruruki.graphs.Graph object at 0x108aa29b0>
<jakgraph.gclassifier.gclassifier object at 0x108aa2240>
Hello
2
wbin1
wbin763
wbin738
wbin465
wbin1
wbin628
wbin123
wbin1
wbin59
wbin420
wbin558
wbin1
wbin1
wbin272
wbin232
wbin100
wbin118
wbin617
wbin574
wbin273
wbin1
wbin549
wbin110
wbin100
wbin236
wbin738
wbin640
wbin1
wbin1
wbin574
wbin187
wbin767
wbin1
wbin622
wbin591
wbin1
wbin1
wbin541
wbin75
wbin517
wbin295
wbin526
wbin607
wbin1
wbin1
wbin430
wbin30
wbin450
wbin412
wbin768
wbin722
wbin445
wbin1
wbin746
wbin109
wbin250
wbin59
wbin506
wbin541
wbin293
wbin232
wbin477
wbin155
wbin350
wbin530
wbin859
wbin607
wbin314
wbin1
wbin656
wbin137
wbin367
wbin706
wbin445
wbin607
wbin405
wbin64
wbin527
wbin126
wbin450
wbin236
wbin667
wbin1
wbin1
wbin1
wbin491
wbin94
wbin34
wbin1
wbin571
wbin656
wbin162
wbin1
wbin462
wbin339
wbin1
wbin1
wbin551
wbin722
wbin304
wbin1
wbin485
wbin331
wbin284
wbin353
wbin465
wbin509
wbin324
wbin149
wbin477
wbin1
wbin417
wbin471
wbin506
wbin607
wbin405
w

wbin640
wbin253
wbin218
wbin546
wbin141
wbin417
wbin177
wbin607
wbin574
wbin304
wbin160
wbin640
wbin158
wbin150
wbin648
wbin783
wbin623
wbin283
wbin178
wbin497
wbin544
wbin500
wbin1
wbin612
wbin541
wbin304
wbin196
wbin512
wbin51
wbin200
wbin177
wbin546
wbin509
wbin243
wbin1
wbin388
wbin60
wbin67
wbin412
wbin899
wbin689
wbin1
wbin1
wbin595
wbin106
wbin334
wbin353
wbin743
wbin656
wbin1
wbin1
wbin440
wbin41
wbin484
wbin1
wbin470
wbin492
wbin253
wbin109
wbin428
wbin192
wbin17
wbin177
wbin425
wbin591
wbin324
wbin1
wbin555
wbin79
wbin117
wbin589
wbin581
wbin1
wbin1
wbin1
wbin527
wbin22
wbin134
wbin59
wbin1
wbin558
wbin354
wbin1
wbin477
wbin131
wbin17
wbin59
wbin849
wbin722
wbin293
wbin1
wbin522
wbin352
wbin517
wbin118
wbin627
wbin558
wbin283
wbin243
wbin491
wbin339
wbin150
wbin118
wbin475
wbin623
wbin182
wbin79
wbin471
wbin242
wbin34
wbin59
wbin435
wbin541
wbin526
wbin77
wbin616
wbin357
wbin134
wbin412
wbin980
wbin558
wbin283
wbin1
wbin536
wbin283
wbin334
wbin530
wbin657
wbin574
wbin1
wbin1


wbin28
wbin400
wbin59
wbin637
wbin460
wbin293
wbin180
wbin428
wbin307
wbin1
wbin589
wbin455
wbin697
wbin324
wbin1
wbin521
wbin318
wbin584
wbin118
wbin450
wbin738
wbin304
wbin1
wbin500
wbin90
wbin350
wbin295
wbin490
wbin623
wbin273
wbin1
wbin531
wbin126
wbin517
wbin353
wbin531
wbin574
wbin324
wbin81
wbin460
wbin17
wbin267
wbin1
wbin692
wbin689
wbin273
wbin1
wbin407
wbin63
wbin634
wbin177
wbin798
wbin574
wbin304
wbin388
wbin530
wbin112
wbin234
wbin412
wbin985
wbin574
wbin334
wbin172
wbin375
wbin34
wbin567
wbin471
wbin637
wbin722
wbin364
wbin128
wbin574
wbin114
wbin467
wbin353
wbin495
wbin476
wbin334
wbin225
wbin507
wbin149
wbin367
wbin118
wbin622
wbin394
wbin324
wbin196
wbin628
wbin187
wbin84
wbin177
wbin394
wbin574
wbin1
wbin1
wbin485
wbin80
wbin300
wbin236
wbin677
wbin591
wbin1
wbin1
wbin355
wbin83
wbin650
wbin118
wbin703
wbin615
wbin1
wbin1
wbin382
wbin36
wbin134
wbin1
wbin511
wbin533
wbin283
wbin1
wbin367
wbin66
wbin17
wbin1
wbin905
wbin410
wbin364
wbin188
wbin564
wbin159
wbin17
wbin

wbin1
wbin443
wbin127
wbin150
wbin236
wbin713
wbin607
wbin1
wbin1
wbin412
wbin69
wbin317
wbin1
wbin596
wbin525
wbin233
wbin106
wbin1
wbin706
wbin1
wbin353
wbin834
wbin558
wbin263
wbin199
wbin501
wbin235
wbin467
wbin59
wbin743
wbin771
wbin415
wbin1
wbin735
wbin118
wbin100
wbin118
wbin506
wbin574
wbin526
wbin68
wbin604
wbin254
wbin67
wbin589
wbin475
wbin591
wbin182
wbin1
wbin345
wbin219
wbin584
wbin59
wbin1
wbin607
wbin203
wbin28
wbin413
wbin93
wbin1
wbin177
wbin445
wbin476
wbin112
wbin64
wbin370
wbin79
wbin17
wbin177
wbin571
wbin361
wbin132
wbin1
wbin334
wbin24
wbin17
wbin1
wbin692
wbin558
wbin142
wbin175
wbin370
wbin26
wbin1
wbin589
wbin849
wbin607
wbin1
wbin1
wbin567
wbin194
wbin217
wbin530
wbin864
wbin902
wbin243
wbin284
wbin677
wbin273
wbin550
wbin765
wbin536
wbin591
wbin546
wbin1
wbin546
wbin41
wbin400
wbin648
wbin697
wbin623
wbin1
wbin1
wbin495
wbin144
wbin234
wbin1
wbin738
wbin673
wbin1
wbin1
wbin604
wbin727
wbin384
wbin412
wbin672
wbin689
wbin1
wbin1
wbin600
wbin262
wbin267
wbin

In [55]:
y_pred = clf.predict(X_test)

{'col': 0, 'cbin': 59}
{'col': 2, 'cbin': 623}
{'col': 3, 'cbin': 435}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 640}
{'col': 7, 'cbin': 17}
[288. 138.]
0
{'col': 0, 'cbin': 118}
{'col': 1, 'cbin': 541}
{'col': 2, 'cbin': 607}
{'col': 3, 'cbin': 304}
{'col': 4, 'cbin': 119}
{'col': 5, 'cbin': 501}
{'col': 6, 'cbin': 137}
{'col': 7, 'cbin': 34}
[129.  47.]
0
{'col': 0, 'cbin': 236}
{'col': 2, 'cbin': 509}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 507}
{'col': 7, 'cbin': 67}
[318. 189.]
0
{'col': 0, 'cbin': 295}
{'col': 1, 'cbin': 839}
{'col': 2, 'cbin': 591}
{'col': 3, 'cbin': 192}
{'col': 4, 'cbin': 207}
{'col': 5, 'cbin': 385}
{'col': 6, 'cbin': 216}
{'col': 7, 'cbin': 500}
[58. 34.]
0
{'col': 0, 'cbin': 1}
{'col': 1, 'cbin': 561}
{'col': 2, 'cbin': 533}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 367}
{'col': 7, 'cbin': 167}
[322. 197.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 410}
{'col': 2, 'cbin': 591}
{'col': 3, 'cbin': 182}
{'col': 4, 'c

{'col': 0, 'cbin': 1}
{'col': 1, 'cbin': 516}
{'col': 2, 'cbin': 615}
{'col': 3, 'cbin': 233}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 1}
{'col': 6, 'cbin': 209}
{'col': 7, 'cbin': 1}
[270. 138.]
0
{'col': 0, 'cbin': 118}
{'col': 1, 'cbin': 546}
{'col': 2, 'cbin': 509}
{'col': 3, 'cbin': 324}
{'col': 4, 'cbin': 67}
{'col': 5, 'cbin': 376}
{'col': 6, 'cbin': 19}
{'col': 7, 'cbin': 1}
[127.  41.]
0
{'col': 0, 'cbin': 177}
{'col': 1, 'cbin': 652}
{'col': 2, 'cbin': 525}
{'col': 3, 'cbin': 293}
{'col': 4, 'cbin': 136}
{'col': 5, 'cbin': 394}
{'col': 6, 'cbin': 58}
{'col': 7, 'cbin': 117}
[95. 42.]
0
{'col': 0, 'cbin': 236}
{'col': 1, 'cbin': 551}
{'col': 2, 'cbin': 525}
{'col': 3, 'cbin': 445}
{'col': 4, 'cbin': 118}
{'col': 5, 'cbin': 519}
{'col': 6, 'cbin': 352}
{'col': 7, 'cbin': 84}
[78. 33.]
0
{'col': 0, 'cbin': 236}
{'col': 1, 'cbin': 632}
{'col': 2, 'cbin': 656}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 482}
{'col': 6, 'cbin': 194}
{'col': 7, 'cbin': 100}
[321. 

{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 627}
{'col': 2, 'cbin': 607}
{'col': 3, 'cbin': 364}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 415}
{'col': 7, 'cbin': 150}
[265. 146.]
0
{'col': 0, 'cbin': 118}
{'col': 1, 'cbin': 652}
{'col': 2, 'cbin': 689}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 418}
{'col': 6, 'cbin': 86}
{'col': 7, 'cbin': 100}
[343. 186.]
0
{'col': 0, 'cbin': 177}
{'col': 1, 'cbin': 571}
{'col': 2, 'cbin': 410}
{'col': 3, 'cbin': 102}
{'col': 4, 'cbin': 101}
{'col': 5, 'cbin': 440}
{'col': 6, 'cbin': 233}
{'col': 7, 'cbin': 67}
[66. 36.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 521}
{'col': 2, 'cbin': 656}
{'col': 3, 'cbin': 112}
{'col': 7, 'cbin': 17}
[140.  35.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 829}
{'col': 2, 'cbin': 673}
{'col': 3, 'cbin': 435}
{'col': 4, 'cbin': 80}
{'col': 5, 'cbin': 489}
{'col': 6, 'cbin': 111}
{'col': 7, 'cbin': 484}
[88. 42.]
0
{'col': 0, 'cbin': 177}
{'col': 1, 'cbin': 561}
{'col': 2, 'cbin': 476}
{'col': 3, 

{'col': 0, 'cbin': 118}
{'col': 1, 'cbin': 440}
{'col': 2, 'cbin': 476}
{'col': 3, 'cbin': 162}
{'col': 5, 'cbin': 488}
{'col': 6, 'cbin': 36}
{'col': 7, 'cbin': 67}
[103.  25.]
0
{'col': 0, 'cbin': 236}
{'col': 1, 'cbin': 778}
{'col': 2, 'cbin': 591}
{'col': 3, 'cbin': 293}
{'col': 4, 'cbin': 149}
{'col': 6, 'cbin': 109}
{'col': 7, 'cbin': 267}
[67. 35.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 531}
{'col': 2, 'cbin': 476}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 363}
{'col': 6, 'cbin': 45}
{'col': 7, 'cbin': 1}
[378. 183.]
0
{'col': 0, 'cbin': 412}
{'col': 2, 'cbin': 640}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 7, 'cbin': 334}
[285. 195.]
0
{'col': 0, 'cbin': 412}
{'col': 1, 'cbin': 576}
{'col': 2, 'cbin': 541}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 489}
{'col': 6, 'cbin': 75}
{'col': 7, 'cbin': 350}
[298. 188.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 500}
{'col': 2, 'cbin': 476}
{'col': 3, 'cbin': 102}
{'col': 4, 'cbin

{'col': 0, 'cbin': 295}
{'col': 2, 'cbin': 509}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 373}
{'col': 6, 'cbin': 216}
{'col': 7, 'cbin': 250}
[302. 187.]
0
{'col': 0, 'cbin': 1}
{'col': 1, 'cbin': 662}
{'col': 2, 'cbin': 1}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 6, 'cbin': 80}
{'col': 7, 'cbin': 84}
[343. 201.]
0
{'col': 0, 'cbin': 353}
{'col': 1, 'cbin': 627}
{'col': 2, 'cbin': 591}
{'col': 3, 'cbin': 1}
{'col': 4, 'cbin': 1}
{'col': 5, 'cbin': 412}
{'col': 6, 'cbin': 122}
{'col': 7, 'cbin': 134}
[320. 189.]
0
{'col': 0, 'cbin': 118}
{'col': 1, 'cbin': 728}
{'col': 2, 'cbin': 476}
{'col': 3, 'cbin': 334}
{'col': 4, 'cbin': 160}
{'col': 5, 'cbin': 471}
{'col': 6, 'cbin': 145}
{'col': 7, 'cbin': 67}
[108.  39.]
0
{'col': 0, 'cbin': 59}
{'col': 1, 'cbin': 551}
{'col': 2, 'cbin': 492}
{'col': 3, 'cbin': 81}
{'col': 4, 'cbin': 216}
{'col': 5, 'cbin': 379}
{'col': 7, 'cbin': 1}
[127.  37.]
0
{'col': 0, 'cbin': 177}
{'col': 1, 'cbin': 647}
{'col': 2, 'cbin'

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

### doing better than baseline accuracy.

0.7012987012987013

In [57]:
# calculate accuracy measures and confusion matrix
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.69      1.00      0.82       157
          1       1.00      0.07      0.13        74

avg / total       0.79      0.70      0.60       231



In [58]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

lr = LogisticRegression();
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

0.7792207792207793

In [59]:
# calculate accuracy measures and confusion matrix
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.80      0.90      0.85       157
          1       0.71      0.53      0.60        74

avg / total       0.77      0.78      0.77       231



In [60]:
nb = GaussianNB();
nb.fit(X_train,y_train)
y_pred = nb.predict(X_test)
accuracy_score(y_test, y_pred)

0.7619047619047619

In [61]:
knn = KNeighborsClassifier();
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

0.7705627705627706

In [62]:
import ruruki_eye.server as s 

In [63]:
 #s.run("localhost",5000,True,graph)   

In [64]:
from sklearn.tree import DecisionTreeClassifier

In [65]:
dt_model = DecisionTreeClassifier(criterion = 'entropy' )

In [66]:
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [67]:
dt_model.score(X_test , y_test) 

0.7012987012987013

In [68]:
graph.get_edges()

<EntitySet> items: 1344