In [1]:
%matplotlib inline
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
from ruruki.graphs import Graph

In [3]:
#Attribute Information:
#1. Number of times pregnant 
#2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
#3. Diastolic blood pressure (mm Hg) 
#4. Triceps skin fold thickness (mm) 
#5. 2-Hour serum insulin (mu U/ml) 
#6. Body mass index (weight in kg/(height in m)^2) 
#7. Diabetes pedigree function 
#8. Age (years) 
#9. Class variable (0 or 1) 

In [4]:
# Column labels are supplied explicitly via `names=` when reading the CSV.
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
diabetes = pd.read_csv("pima-indians-diabetes.data", names=names)
diabetes.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Confirm the column labels were applied to the loaded frame.
diabetes.columns

Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'], dtype='object')

In [6]:
# Row count per outcome label, to check how balanced the two classes are.
diabetes.groupby(['class'])['class'].count()

class
0    500
1    268
Name: class, dtype: int64

In [7]:
# Summarise the dtype and non-null count of every column.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
test     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
from scipy.stats import zscore

# Feature matrix: all eight predictor columns, left unscaled.
feature_cols = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
X = diabetes[feature_cols]
# Optional standardised variant: X = diabetes[feature_cols].apply(zscore)

In [9]:
# Preview the first rows only instead of rendering the whole feature frame.
X.head(10)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
5,5,116,74,0,0,25.6,0.201,30
6,3,78,50,32,88,31.0,0.248,26
7,10,115,0,0,0,35.3,0.134,29
8,2,197,70,45,543,30.5,0.158,53
9,8,125,96,0,0,0.0,0.232,54


In [10]:
# Target vector: the 0/1 'class' column.
y = diabetes['class']

In [11]:
# Inspect the target series.
y

0      1
1      0
2      1
3      0
4      1
5      0
6      1
7      0
8      1
9      1
10     0
11     1
12     0
13     1
14     1
15     1
16     1
17     1
18     0
19     1
20     0
21     0
22     1
23     1
24     1
25     1
26     1
27     0
28     0
29     0
      ..
738    0
739    1
740    1
741    0
742    0
743    1
744    0
745    0
746    1
747    0
748    1
749    1
750    1
751    0
752    0
753    1
754    1
755    1
756    0
757    1
758    0
759    1
760    0
761    1
762    0
763    0
764    0
765    0
766    1
767    0
Name: class, Length: 768, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [13]:
import numpy as np
# Convert the training features to a plain NumPy array.
X_train = np.array(X_train)

In [14]:
# Convert the remaining split pieces to plain NumPy arrays as well.
y_train, X_test, y_test = (
    np.array(part) for part in (y_train, X_test, y_test)
)

In [15]:
min = X_train.min(axis=0)

In [16]:
min[0]

0.0

In [17]:
max = X_train.max(axis=0)

In [18]:
max[0]

17.0

In [19]:
# Number of feature columns (size of the last axis of the training array).
cols = X_train.shape[-1]

In [20]:
# Show the feature count.
cols

8

In [21]:
from jakgraph import gclassifier
# Build the graph-based classifier (the argument is reported as `nbins` in its
# repr), echo its configuration, then train it on the training split.
clf = gclassifier.gclassifier(1000)
print(clf)
clf.fit(X_train,y_train)


num of bins=1000
<ruruki.graphs.Graph object at 0x108518a20>
gclassifier(nbins=1000)
2


In [22]:
# Predict labels for the held-out test rows with the single graph classifier.
y_pred = clf.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score
# Fraction of test rows the single graph classifier labels correctly.
accuracy_score(y_test, y_pred)

### Only slightly better than the majority-class baseline (157 of 231 test rows are class 0, i.e. about 0.68).

0.7012987012987013

In [24]:
# Per-class precision, recall and F1 for the current predictions
# (only the classification report is printed; no confusion matrix is computed here).
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.69      1.00      0.82       157
          1       1.00      0.07      0.13        74

avg / total       0.79      0.70      0.60       231



In [25]:
from sklearn.ensemble import BaggingClassifier

# Bag five graph classifiers, each fitted on a sample half the size of the
# training set, and echo the ensemble configuration before training.
bg = BaggingClassifier(
    gclassifier.gclassifier(1000),
    max_samples=0.5,
    n_estimators=5,
)
print(bg)
bg.fit(X_train, y_train)

num of bins=1000
<ruruki.graphs.Graph object at 0x10886b438>
BaggingClassifier(base_estimator=gclassifier(nbins=1000), bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=0.5,
         n_estimators=5, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)
num of bins=1000
<ruruki.graphs.Graph object at 0x108b74198>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x108ec3ac8>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x1091f0400>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x1094ffcf8>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10982d630>
2


BaggingClassifier(base_estimator=gclassifier(nbins=1000), bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=0.5,
         n_estimators=5, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [26]:
# Predict labels for the held-out test rows with the bagged graph classifiers.
y_pred = bg.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score
# Fraction of test rows the bagged ensemble labels correctly.
accuracy_score(y_test, y_pred)

### Only slightly better than the majority-class baseline (157 of 231 test rows are class 0, i.e. about 0.68).

0.7012987012987013

In [28]:
# Per-class precision, recall and F1 for the current predictions
# (only the classification report is printed; no confusion matrix is computed here).
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.69      1.00      0.82       157
          1       1.00      0.07      0.13        74

avg / total       0.79      0.70      0.60       231



In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Reference model: a 105-tree random forest using the entropy criterion,
# fitted and scored on the same train/test split.
rf_model = RandomForestClassifier(n_estimators=105, criterion='entropy').fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

### doing better than baseline accuracy.

0.7792207792207793

In [30]:
from sklearn.ensemble import VotingClassifier

# Hard (majority-label) vote between the graph classifier and a random forest.
voters = [
    ('jk', gclassifier.gclassifier(1000)),
    ('rl', RandomForestClassifier(n_estimators=105, criterion='entropy')),
]
vt = VotingClassifier(estimators=voters, voting='hard')
vt.fit(X_train, y_train)
y_pred = vt.predict(X_test)
accuracy_score(y_test, y_pred)

num of bins=1000
<ruruki.graphs.Graph object at 0x109b6ca58>
num of bins=1000
<ruruki.graphs.Graph object at 0x109b6ce48>
2


  if diff:


0.6926406926406926

In [31]:
# Per-class precision, recall and F1 for the current predictions
# (only the classification report is printed; no confusion matrix is computed here).
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.69      1.00      0.82       157
          1       1.00      0.04      0.08        74

avg / total       0.79      0.69      0.58       231



In [32]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bag 20 entropy-criterion decision trees, each fitted on a sample half the
# size of the training set, then score the ensemble on the test split.
dt_model = DecisionTreeClassifier(criterion='entropy')
bg = BaggingClassifier(
    dt_model,
    max_samples=0.5,
    n_estimators=20,
)
print(bg)
y_pred = bg.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.5, n_estimators=20, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)


0.8008658008658008

In [33]:
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Hard-voting ensemble of six default scikit-learn classifiers plus the
# graph classifier, scored on the test split.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('JK', gclassifier.gclassifier(1000)),
]

vt = VotingClassifier(estimators=models, voting='hard')
vt.fit(X_train, y_train)
y_pred = vt.predict(X_test)
accuracy_score(y_test, y_pred)

num of bins=1000
<ruruki.graphs.Graph object at 0x10a16b400>
num of bins=1000
<ruruki.graphs.Graph object at 0x109e8ac88>
2


  if diff:


0.7835497835497836

In [34]:
# Per-class precision, recall and F1 for the current predictions
# (only the classification report is printed; no confusion matrix is computed here).
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.79      0.93      0.85       157
          1       0.76      0.47      0.58        74

avg / total       0.78      0.78      0.77       231



In [35]:
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Hard-voting ensemble of the six default scikit-learn classifiers only;
# the graph classifier entry ('JK', gclassifier.gclassifier(1000)) is
# intentionally left out of this run.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

vt = VotingClassifier(estimators=models, voting='hard')
vt.fit(X_train, y_train)
y_pred = vt.predict(X_test)
accuracy_score(y_test, y_pred)

  if diff:


0.7878787878787878

In [36]:
# Per-class precision, recall and F1 for the current predictions
# (only the classification report is printed; no confusion matrix is computed here).
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.79      0.93      0.86       157
          1       0.77      0.49      0.60        74

avg / total       0.78      0.79      0.77       231



In [37]:

from sklearn.ensemble import AdaBoostClassifier
from jakgraph import gclassifier

# Boost the graph classifier with SAMME, starting from random per-sample weights.
abc = AdaBoostClassifier(gclassifier.gclassifier(1000),algorithm='SAMME',random_state=1)
print(abc)
# Draw the initial weights from a dedicated seeded generator so the experiment
# is repeatable, and pass them by keyword so their role is explicit.
weight_rng = np.random.RandomState(1)
initial_weights = weight_rng.uniform(low=0.5, high=13.3, size=len(y_train))
abc.fit(X_train, y_train, sample_weight=initial_weights)

num of bins=1000
<ruruki.graphs.Graph object at 0x10a446b00>
AdaBoostClassifier(algorithm='SAMME', base_estimator=gclassifier(nbins=1000),
          learning_rate=1.0, n_estimators=50, random_state=1)
num of bins=1000
<ruruki.graphs.Graph object at 0x10a4469b0>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10a7408d0>
2


AdaBoostClassifier(algorithm='SAMME', base_estimator=gclassifier(nbins=1000),
          learning_rate=1.0, n_estimators=50, random_state=1)

In [38]:
# Test-set accuracy of the boosted model that was fitted with random initial sample weights.
y_pred = abc.predict(X_test)
accuracy_score(y_test, y_pred)

0.7012987012987013

In [39]:

from sklearn.ensemble import AdaBoostClassifier
from jakgraph import gclassifier

# Same SAMME boosting of the graph classifier, this time without passing
# explicit sample weights to fit().
abc = AdaBoostClassifier(
    gclassifier.gclassifier(1000),
    algorithm='SAMME',
    random_state=1,
)
print(abc)
abc.fit(X_train, y_train)

num of bins=1000
<ruruki.graphs.Graph object at 0x109b46d68>
AdaBoostClassifier(algorithm='SAMME', base_estimator=gclassifier(nbins=1000),
          learning_rate=1.0, n_estimators=50, random_state=1)
num of bins=1000
<ruruki.graphs.Graph object at 0x10aa6d208>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10a42f940>
2


AdaBoostClassifier(algorithm='SAMME', base_estimator=gclassifier(nbins=1000),
          learning_rate=1.0, n_estimators=50, random_state=1)

In [40]:
# Score of the boosted model on the data it was trained on.
abc.score(X_train,y_train)

0.6685288640595903

In [41]:
# Test-set accuracy of the boosted model fitted without explicit sample weights.
y_pred = abc.predict(X_test)
accuracy_score(y_test, y_pred)

0.7012987012987013

In [42]:
# Score of the boosted model on the test split via the estimator's own score().
# NOTE(review): presumably the same mean-accuracy metric as accuracy_score - confirm.
abc.score(X_test,y_test)

0.7012987012987013

In [43]:
# Convert features and target to plain NumPy arrays so the K-fold index arrays
# can be used for positional row selection.
# np.asarray is used instead of DataFrame.as_matrix(), which is deprecated and
# later removed in pandas; it also leaves an existing ndarray unchanged, so
# re-running this cell does not fail.
X = np.asarray(X)
y = np.asarray(y)

In [44]:
from sklearn.model_selection import KFold # import KFold

kf = KFold(n_splits=20) # Define the split - into 20 folds
kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator

20

In [45]:
# Evaluate a 10-estimator bagged graph classifier on every fold, remembering
# the highest fold accuracy and the ensemble that achieved it.
best = 0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    bg = BaggingClassifier(gclassifier.gclassifier(1000), max_samples=0.5, n_estimators=10)
    bg.fit(X_train, y_train)
    y_pred = bg.predict(X_test)
    # Score the fold once and reuse the value for comparison and printing.
    fold_accuracy = accuracy_score(y_test, y_pred)
    if best < fold_accuracy:
        best = fold_accuracy
        # Keep the ensemble fitted on this fold, not the standalone `clf`
        # trained earlier on a different split.
        best_model = bg
    print(fold_accuracy)

num of bins=1000
<ruruki.graphs.Graph object at 0x109bbd7f0>
num of bins=1000
<ruruki.graphs.Graph object at 0x109ebb2b0>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10a9aacf8>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10ae704e0>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10b219c88>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10a1e39b0>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10b6e82b0>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10bab0a58>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10be97240>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10c25f8d0>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10c6f21d0>
2
0.46153846153846156
num of bins=1000
<ruruki.graphs.Graph object at 0x109ebb240>
num of bins=1000
<ruruki.graphs.Graph object at 0x10b6e8390>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10ccaae80>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10d090588>
2
num of bins=1000
<ruruki.graphs.Graph o

num of bins=1000
<ruruki.graphs.Graph object at 0x10a8fd518>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10638c2e8>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10a8922e8>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10aff81d0>
2
0.7105263157894737
num of bins=1000
<ruruki.graphs.Graph object at 0x10c677860>
num of bins=1000
<ruruki.graphs.Graph object at 0x10a8fd630>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10b94a940>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10ae83630>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x106113b00>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x106303588>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10b268fd0>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10b1aba58>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x102bd33c8>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x109e1df28>
2
num of bins=1000
<ruruki.graphs.Graph object at 0x10aedc9b0>
2
0.8157894736842105
num of bins=1000
<r

In [46]:
# Highest single-fold accuracy seen in the K-fold loop (a maximum, not an average over folds).
best

0.8157894736842105