In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pylab as pl
from scipy.sparse import diags
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split, ShuffleSplit
from IPython import display
from IPython.display import display, HTML

from classifiers import TransparentLogisticRegression, TransparentLinearRegression
from scale import decision_tree_scale
from ipy_table import *

In [2]:
def get_num_instances(X, mean1, mean2):
    """Count, for each feature, the instances lying strictly between two values.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Data matrix with m instances (rows) and n features (columns).
    mean1, mean2 : array-like with at least n entries
        Per-feature interval endpoints, given in either order. Only the
        first n entries of each are used.

    Returns
    -------
    list
        n counts; entry i is the number of rows r for which
        min(mean1[i], mean2[i]) < X[r, i] < max(mean1[i], mean2[i]).

    Raises
    ------
    IndexError
        If mean1 or mean2 has fewer than n entries.
    """
    X = np.asarray(X)
    # Unpacking enforces a 2-D input; only the feature count is needed.
    _, n = X.shape
    end1 = np.asarray(mean1)[:n]
    end2 = np.asarray(mean2)[:n]
    # Reject short endpoint vectors explicitly so a length-1 vector is not
    # silently broadcast across every feature.
    if end1.shape[0] < n or end2.shape[0] < n:
        raise IndexError("mean1 and mean2 need one entry per feature")
    lower = np.minimum(end1, end2)
    upper = np.maximum(end1, end2)
    # Both bounds are exclusive: values equal to an endpoint are not counted.
    counts = ((X > lower) & (X < upper)).sum(axis=0)
    return list(counts)

In [3]:
def transform(X, mns, sstd, axis=0):
    """Shift X by `mns` and divide by `sstd`, i.e. compute (X - mns) / sstd.

    Parameters
    ----------
    X : array-like
        Values to rescale.
    mns, sstd : array-like
        Offsets and divisors. They may be plain lists or scalars; they are
        converted to arrays before use.
    axis : int, optional
        When non-zero and `mns` has fewer dimensions than X, a new axis is
        inserted into `mns` and `sstd` at this position so they broadcast
        along it. With the default 0, plain broadcasting is used.

    Returns
    -------
    ndarray
        The rescaled values. No guard is applied against zeros in `sstd`.
    """
    X = np.asanyarray(X)
    # Coerce the statistics as well so `.ndim` is available for list/scalar
    # arguments instead of raising AttributeError.
    mns = np.asanyarray(mns)
    sstd = np.asanyarray(sstd)
    if axis and mns.ndim < X.ndim:
        # Re-insert the missing axis so the statistics line up with X.
        return ((X - np.expand_dims(mns, axis=axis)) /
                np.expand_dims(sstd, axis=axis))
    return (X - mns) / sstd

In [4]:
def combine_table(header, table):
    """Return `header` followed by `table`, joined along the first axis."""
    return np.concatenate([header, table], axis=0)

In [5]:
# Pima Indians Diabetes data set (UCI):
#   http://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes
# Feature columns, in file order:
#   preg: number of times pregnant
#   plas: plasma glucose concentration at 2 hours in an oral glucose tolerance test
#   pres: diastolic blood pressure
#   skin: triceps skin fold thickness
#   insu: 2-hour serum insulin
#   mass: body mass index
#   pedi: diabetes pedigree function
#   age:  age

dataset = "diabetes.csv"
class_index = 8   # column holding the class label
num_cols = 9      # total columns in the file (features + class)
classes = ['tested_negative', 'tested_positive']
# Every column except the class label is read as a feature.
read_cols = [i for i in range(num_cols) if i != class_index]

# Directory containing the data file. The default is the original author's
# local folder; set the UCI_DATA_DIR environment variable to run elsewhere.
data_dir = os.environ.get(
    "UCI_DATA_DIR",
    "D:\\IIT_Master\\2016 Spring\\CS597\\uci\\uci\\uci-tar\\nominal\\",
)
file_path = os.path.join(data_dir, dataset)

In [6]:
# Read only the first line of the data file to recover the column names.
with open(file_path, 'r') as f:
    # Strip the line terminator first; otherwise the last column name
    # would end with '\n' (or '\r\n').
    header = np.array(f.readline().rstrip('\r\n').split(','))

# Names of the feature columns only (the class column is not in read_cols).
feature_names = header[read_cols]

In [7]:
# Load the data set: the file is parsed twice, once for the feature
# columns and once for the class column.

X = np.loadtxt(
    file_path,
    dtype=float,
    delimiter=",",
    skiprows=1,
    usecols=read_cols,
)
# Class labels are stored as strings; each is mapped to its index in `classes`.
y = np.loadtxt(
    file_path,
    dtype=int,
    delimiter=",",
    skiprows=1,
    usecols=(class_index,),
    converters={class_index: lambda x: classes.index(x)},
)

num_inst, num_feat = X.shape
print("The shape of this data set:" + " " + str(X.shape))

The shape of this data set: (768L, 8L)


In [8]:
# One classifier per feature representation: original, standard-scaled,
# and the representation produced by the classifier's own fit_transform.
clf_ori = TransparentLogisticRegression()
clf_ss = TransparentLogisticRegression()
clf_ig = TransparentLogisticRegression()

# --- Original features ---
X_ori = np.array(X, copy=True)
X_min_ori = X_ori.min(axis=0)
X_max_ori = X_ori.max(axis=0)

# --- Standard scaling ---
X_ss = scale(X)
# Per-column mean and standard deviation of the raw data, kept so that
# individual points can be rescaled later with transform().
X_mean_ss = X.mean(axis=0)
X_std_ss = X.std(axis=0)

# --- Information scaling ---
# NOTE(review): fit_transform is a project method; presumably it learns the
# per-feature values exposed as clf_ig.mns -- confirm in classifiers.py.
X_ig = clf_ig.fit_transform(X, y)

# Fit each model on its own representation of the data.
clf_ori.fit(X_ori, y)
clf_ss.fit(X_ss, y)
clf_ig.fit(X_ig, y)

# Compare the column means with the values stored on clf_ig.
print(X_mean_ss)
print(clf_ig.mns)

[   3.84505208  120.89453125   69.10546875   20.53645833   79.79947917
   31.99257812    0.4718763    33.24088542]
[   6.5     127.5      69.       31.5     121.       27.85      0.5275
   28.5   ]


In [9]:
# Intercepts of the three fitted models (original, standard, information).
print(clf_ori.intercept_)
print(clf_ss.intercept_)
print(clf_ig.intercept_)

# Rows: feature names, then the coefficient vector of each model.
array1 = np.array([
    feature_names,
    clf_ori.coef_[0],
    clf_ss.coef_[0],
    clf_ig.coef_[0],
])

# Transposed so that each displayed row is one feature.
display(make_table(array1.T))

[-5.89260365]
[-0.85880534]
[-0.71586814]


0,1,2,3
preg,0.117062575265,0.407944199446,0.526243465785
plas,0.0283896431082,1.10555527346,1.13179009702
pres,-0.0168879917929,-0.250494932252,-0.249873315781
skin,0.000755653163125,0.0091394392303,0.0186791703777
insu,-0.000642933010045,-0.130888986405,-0.13907745844
mass,0.0597777103346,0.6944293345,0.774030193927
pedi,0.677457311212,0.30860250762,0.313735624065
age,0.00724197862592,0.175762070786,0.183629923141


In [10]:
# Two rows: the per-feature minima, then the per-feature maxima.
X_test_ori = np.vstack((X_min_ori, X_max_ori))
display(make_table(X_test_ori))

0,1,2,3,4,5,6,7
0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [23]:
# A single hand-written instance, in feature order
# (preg, plas, pres, skin, insu, mass, pedi, age).
most_difference2 = [[2, 131, 105, 43, 85, 41.5, 0.84, 34]]

# Express the instance in each model's input space.
most_difference_ss2 = transform(most_difference2, X_mean_ss, X_std_ss)
most_difference_ig2 = clf_ig.transform(most_difference2)

# Class probabilities under the standard-scaled and information-scaled models.
a = clf_ss.predict_proba(most_difference_ss2)
b = clf_ig.predict_proba(most_difference_ig2)

predict_evidences_ig = clf_ig.predict_evidences(most_difference_ig2)

print(predict_evidences_ig)
print(a)
print(b)
# Gap between the two models' first-column probabilities for this instance.
print(a[0][0] - b[0][0])

(array([-1.73315013]), array([ 1.73205965]))
[[ 0.49953394  0.50046606]]
[[ 0.50027262  0.49972738]]
-0.000738680320287


In [12]:
# 2 109 92 0 0 42.7 0.845 54 tested_negative

# 0	137	40	35	168	43.1	2.288	33	tested_positive
# 0	100	88	60	110	46.8	0.962	31	tested_negative
# 10	101	86	37	0	45.6	1.136	38	tested_positive
# 2	128	78	37	182	43.3	1.224	31	tested_positive
# 0	93	100	39	72	43.4	1.021	35	tested_negative
# 11	111	84	40	0	46.8	0.925	45	tested_positive
# 1	81	74	41	57	46.3	1.096	32	tested_negative

