In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split, ShuffleSplit
from classifiers import TransparentLogisticRegression, TransparentLinearRegression
from matplotlib import pylab as pl
from scipy.sparse import diags
from IPython import display
from scale import decision_tree_scale
from IPython.display import display, HTML
from ipy_table import *

In [2]:
def transform(X, mns, sstd, axis=0):
    """Centre ``X`` by ``mns`` and divide the result by ``sstd``.

    Parameters
    ----------
    X : array-like
        Values to rescale.
    mns : array-like
        Values subtracted from ``X``.
    sstd : array-like
        Values the centred data is divided by.
    axis : int, optional
        When non-zero and ``mns`` has fewer dimensions than ``X``, both
        ``mns`` and ``sstd`` get a length-one dimension inserted at this
        position before the arithmetic, so they broadcast against ``X``.

    Returns
    -------
    ``(X - mns) / sstd`` with the broadcasting described above.
    """
    data = np.asanyarray(X)
    centers = np.asanyarray(mns)

    # Default path: rely on plain NumPy broadcasting.
    needs_expansion = axis and centers.ndim < data.ndim
    if not needs_expansion:
        return (data - centers) / sstd

    shift = np.expand_dims(centers, axis=axis)
    spread = np.expand_dims(sstd, axis=axis)
    return (data - shift) / spread

In [3]:
def sigmoid_afterscale(C):
    """Return the logistic function ``1 / (1 + exp(-C))`` of the score ``C``."""
    denominator = 1 + np.exp(-C)
    return 1 / denominator

In [4]:
def sigmoid(x, mean, std, w, C=0):
    """Logistic response to one feature value that is standardised first.

    ``x`` is centred by ``mean``, divided by ``std`` and multiplied by the
    weight ``w``; ``C`` is added to that product before the logistic
    function ``1 / (1 + exp(-z))`` is applied.
    """
    logit = (x - mean) / std * w + C
    return 1 / (1 + np.exp(-logit))

In [5]:
def sigmoid_binary(x, mean, std, w, C=0):
    """Logistic response to one feature value that is used unscaled.

    Computes ``1 / (1 + exp(-(x * w + C)))``. ``mean`` and ``std`` are
    accepted so the signature matches ``sigmoid`` but are not used.
    """
    logit = x * w + C
    return 1 / (1 + np.exp(-logit))

In [6]:
# Catalogue of the available data sets:
#   file name -> (index of the class column, total number of columns,
#                 class labels in the order they are mapped to 0 and 1).
# "worked" marks the entries that were annotated that way in this notebook.
DATASETS = {
    "hepatitis.csv": (19, 20, ['DIE', 'LIVE']),             # worked
    "heart-c.csv": (22, 23, ['<50', '>50_1']),
    "heart-statlog.csv": (13, 14, ['absent', 'present']),   # worked
    "credit-g.csv": (61, 62, ['good', 'bad']),
    "car.csv": (21, 22, ['acc', 'unacc']),
    "cmc.csv": (21, 22, ['1', '2']),                        # worked
}

# Choose the data set to analyse by changing this single line; every other
# name in this cell is derived from it.
dataset = "heart-statlog.csv"
class_index, num_cols, classes = DATASETS[dataset]

# Every column except the class column is read as a feature.
read_cols = [i for i in range(num_cols) if i != class_index]

# Directory that holds the CSV files.
# NOTE(review): machine-specific absolute path; adjust it (or make it
# configurable) before running the notebook anywhere else.
DATA_DIR = "D:\\IIT_Master\\2016 Spring\\CS597\\uci\\uci\\uci-tar\\nominal\\"
file_path = DATA_DIR + dataset

print(dataset)

heart-statlog.csv


In [7]:
# Feature matrix: every column except the class column, parsed as floats.
# The first line of the file is skipped.
X = np.loadtxt(
    file_path,
    dtype=float,
    delimiter=",",
    skiprows=1,
    usecols=read_cols,
)

# Label vector: only the class column, with each label string replaced by
# its position in `classes`.
y = np.loadtxt(
    file_path,
    dtype=int,
    delimiter=",",
    skiprows=1,
    usecols=(class_index,),
    converters={class_index: lambda label: classes.index(label)},
)

num_inst, num_feat = X.shape
print("The shape of this data set: %s" % (X.shape,))

The shape of this data set: (270L, 13L)


In [8]:
# Split the feature indices into binary columns (exactly two distinct
# values) and non-binary columns.
num_features = X.shape[1]
non_binary = []
binary = []
for i in range(num_features):
    if len(np.unique(X[:, i])) == 2:
        binary.append(i)
    else:
        non_binary.append(i)

print(binary)

# Re-encode every binary feature as -1 / +1: the smaller of its two values
# becomes -1 and the larger becomes +1. For 0/1 columns this is the same as
# replacing zeros with -1, and it also covers columns coded differently
# (e.g. 1/2), which a plain "0 -> -1" replacement would leave untouched even
# though the later cells treat binary features as living in [-1, 1].
if len(binary) > 0:
    X_b = X[:, binary]                     # fancy indexing -> independent copy
    is_low = X_b == X_b.min(axis=0)        # per-column comparison via broadcasting
    X_b[is_low] = -1
    X_b[~is_low] = 1
    X[:, binary] = X_b

# The remaining cells need at least one non-binary feature to scale.
if len(non_binary) == 0:
    raise ValueError("There are only binary features in this data set")

[1, 5, 8]


In [9]:
# One classifier and one private copy of the data per preprocessing variant:
#   ori - features as loaded,
#   ss  - non-binary features standardised with sklearn's scale(),
#   ig  - non-binary features rescaled by clf_ig.fit_transform().
clf_ori, clf_ss, clf_ig = (
    TransparentLogisticRegression(),
    TransparentLogisticRegression(),
    TransparentLogisticRegression(),
)
X_ori, X_ss, X_ig = X.copy(), X.copy(), X.copy()

# Binary columns are left untouched in every variant.
if non_binary:
    X_ss[:, non_binary] = scale(X_ss[:, non_binary])
    X_ig[:, non_binary] = clf_ig.fit_transform(X_ig[:, non_binary], y)

print(binary)
print(non_binary)

[1, 5, 8]
[0, 2, 3, 4, 6, 7, 9, 10, 11, 12]


In [10]:
# Fit one model on each rescaled copy of the data (clf_ori is not fitted here).
clf_ss.fit(X_ss, y)
# Kept as the cell's last expression so the notebook displays the fitted
# estimator's repr.
clf_ig.fit(X_ig, y)

TransparentLogisticRegression(C=1.0, class_weight=None, dual=False,
               fit_intercept=True, intercept_scaling=1, max_iter=100,
               multi_class='ovr', n_jobs=1, penalty='l2',
               random_state=None, solver='liblinear', tol=0.0001,
               verbose=0, warm_start=False)

In [11]:
# Learned weight vectors (first row of coef_) of the two fitted models.
w1 = clf_ss.coef_[0]
w2 = clf_ig.coef_[0]
print(w1)
print(w2)

# Intercepts of the two fitted models.
bias1 = clf_ss.intercept_
bias2 = clf_ig.intercept_
print("%s %s" % (bias1, bias2))

# Logistic function of each intercept on its own, shown next to the fraction
# of instances labelled 1 (y holds the 0/1 indices into `classes`).
print(sigmoid_afterscale(bias1))
print(sigmoid_afterscale(bias2))
print(sum(y) / float(len(y)))

[-0.1313374   0.6887589   0.63326786  0.40004381  0.33209522 -0.31544318
  0.2915281  -0.45993953  0.40742727  0.3971942   0.24383086  1.01225754
  0.65280451]
[-0.11395405  0.66572658  0.66689727  0.55488815  0.32959972 -0.27715335
  0.31146151 -0.45156697  0.40935063  0.50142078  0.21940892  0.99709739
  0.6539875 ]
[-0.55941465] [-0.89807814]
[ 0.36368291]
[ 0.2894456]
0.444444444444


In [12]:
# Per-feature maximum and minimum over all instances of X.
max_list = X.max(axis=0)
min_list = X.min(axis=0)

print(max_list)
print(min_list)

[  77.     1.     4.   200.   564.     1.     2.   202.     1.     6.2
    3.     3.     7. ]
[  29.   -1.    1.   94.  126.   -1.    0.   71.   -1.    0.    1.    0.
    3.]


In [13]:
# Per-feature centring (mean*) and scaling (std*) constants for the two
# preprocessing variants: suffix 1 = standard scaling, suffix 2 = clf_ig.
# Binary features keep mean 0 and std 1, so transform() leaves them as is.
mean1_ = np.zeros(shape=(1, num_feat))
mean1 = mean1_.flatten()

mean2_ = np.zeros(shape=(1, num_feat))
mean2 = mean2_.flatten()

mean1[non_binary] = np.mean(X[:, non_binary], axis=0)
mean2[non_binary] = clf_ig.mns

std1_ = np.ones(shape=(1, num_feat))
std1 = std1_.flatten()

std2_ = np.ones(shape=(1, num_feat))
std2 = std2_.flatten()

std1[non_binary] = np.std(X[:, non_binary], axis=0)
# sklearn's scale() uses a scaling factor of 1 for zero-variance columns
# instead of dividing by 0. Mirror that here so transform() with
# mean1/std1 reproduces X_ss rather than yielding NaN for such a column.
std1[std1 == 0] = 1.0
std2[non_binary] = clf_ig.sstd

print(mean1.tolist())
print(mean2.tolist())

[54.43333333333333, 0.0, 3.174074074074074, 131.34444444444443, 249.65925925925927, 0.0, 1.0222222222222221, 149.67777777777778, 0.0, 1.05, 1.5851851851851853, 0.6703703703703704, 4.696296296296296]
[54.5, 0.0, 3.5, 107.0, 245.5, 0.0, 0.5, 147.5, 0.0, 1.7000000000000002, 1.5, 0.5, 4.5]


In [14]:
# Show the non-binary feature indices next to the per-feature extremes.
print(non_binary)

print(max_list)

print(min_list)

[0, 2, 3, 4, 6, 7, 9, 10, 11, 12]
[  77.     1.     4.   200.   564.     1.     2.   202.     1.     6.2
    3.     3.     7. ]
[  29.   -1.    1.   94.  126.   -1.    0.   71.   -1.    0.    1.    0.
    3.]


In [15]:
# Search interval for every feature:
#   non-binary -> the interval between the two variants' centring values
#                 (mean1 and mean2), ordered so that lower <= upper;
#   binary     -> [-1, 1].
upper_bound = []
lower_bound = []

for i_ in range(num_feat):
    if i_ not in non_binary:
        low, high = -1, 1
    elif mean1[i_] > mean2[i_]:
        low, high = mean2[i_], mean1[i_]
    else:
        low, high = mean1[i_], mean2[i_]

    upper_bound.append(high)
    lower_bound.append(low)

print(upper_bound)
print(lower_bound)

[54.5, 1, 3.5, 131.34444444444443, 249.65925925925927, 1, 1.0222222222222221, 149.67777777777778, 1, 1.7000000000000002, 1.5851851851851853, 0.67037037037037039, 4.6962962962962962]
[54.43333333333333, -1, 3.174074074074074, 107.0, 245.5, -1, 0.5, 147.5, -1, 1.05, 1.5, 0.5, 4.5]


In [16]:
# Coordinate-wise search, inside [lower_bound, upper_bound], for the point
# at which the two models' predicted probabilities differ the most.
currentX = np.copy(lower_bound)
counter = 100  # maximum number of sweeps over all features

for j in range(counter):

    # Remember where this sweep started so a fixed point can be detected.
    sweep_start = np.copy(currentX)

    for i in range(num_feat):

        # A binary feature is only tried at its two end points; any other
        # feature is tried on a 10-point grid across its interval.
        if i in binary:
            num_ = 2
        else:
            num_ = 10

        xs = np.linspace(lower_bound[i], upper_bound[i], num=num_)

        # The current point expressed in each model's own scaled space.
        currentX_ss = np.copy(currentX)
        currentX_ss[non_binary] = transform(currentX[non_binary], mean1[non_binary], std1[non_binary])

        currentX_ig = np.copy(currentX)
        currentX_ig[non_binary] = transform(currentX[non_binary], mean2[non_binary], std2[non_binary])

        # Linear score of each model with feature i's own contribution
        # removed; sigmoid() adds that contribution back for every candidate.
        C1 = np.dot(currentX_ss, w1) + bias1 - currentX_ss[i] * w1[i]
        C2 = np.dot(currentX_ig, w2) + bias2 - currentX_ig[i] * w2[i]

        value = abs(sigmoid(xs, mean1[i], std1[i], w1[i], C1) - sigmoid(xs, mean2[i], std2[i], w2[i], C2))

        # Move feature i to the candidate with the largest disagreement.
        currentX[i] = xs[np.argmax(value)]

        print("%s %s" % (np.max(value), currentX))

    # A sweep is a deterministic function of the point it starts from, so a
    # sweep that changed nothing would be repeated verbatim by every
    # remaining sweep. Stop here instead of recomputing and reprinting it.
    if (currentX == sweep_start).all():
        break

0.0127776223139 [  54.5          -1.            3.17407407  107.          245.5          -1.
    0.5         147.5          -1.            1.05          1.5           0.5
    4.5       ]
0.0207409449734 [  54.5           1.            3.17407407  107.          245.5          -1.
    0.5         147.5          -1.            1.05          1.5           0.5
    4.5       ]
0.0222619938593 [  54.5     1.      3.5   107.    245.5    -1.      0.5   147.5    -1.
    1.05    1.5     0.5     4.5 ]
0.0222619938593 [  54.5     1.      3.5   107.    245.5    -1.      0.5   147.5    -1.
    1.05    1.5     0.5     4.5 ]
0.0223835070157 [  54.5           1.            3.5         107.          249.65925926
   -1.            0.5         147.5          -1.            1.05          1.5
    0.5           4.5       ]
0.0307439905496 [  54.5           1.            3.5         107.          249.65925926
    1.            0.5         147.5          -1.            1.05          1.5
    0.5           4.5   

In [17]:
print(currentX)

print(currentX.tolist())

# Express the final point in each model's scaled feature space; binary
# features are copied over unchanged.
currentX_result_ss = np.copy(currentX)
currentX_result_ig = np.copy(currentX)

currentX_result_ss[non_binary] = transform(currentX[non_binary], mean1[non_binary], std1[non_binary])
currentX_result_ig[non_binary] = transform(currentX[non_binary], mean2[non_binary], std2[non_binary])

print(currentX_result_ss)
print(currentX_result_ig)

# scikit-learn estimators expect a 2-D (n_samples, n_features) input, so the
# single point is passed as one row instead of a bare 1-D vector.
# NOTE(review): assumes TransparentLogisticRegression keeps scikit-learn's
# predict_proba contract - confirm against the classifiers module.
proba_ss = clf_ss.predict_proba(currentX_result_ss.reshape(1, -1))
proba_ig = clf_ig.predict_proba(currentX_result_ig.reshape(1, -1))

print(proba_ss)
print(proba_ig)
print(proba_ss - proba_ig)

[  54.5           1.            3.5         107.          249.65925926
    1.            0.5         149.67777778    1.            1.7           1.5
    0.5           4.6962963 ]
[54.5, 1.0, 3.5, 107.0, 249.65925925925927, 1.0, 0.5, 149.67777777777778, 1.0, 1.7000000000000002, 1.5, 0.5, 4.696296296296296]
[ 0.00733231  1.          0.34368445 -1.36547907  0.          1.
 -0.52429763  0.          1.          0.56863558 -0.13890753 -0.18083208
  0.        ]
[ 0.          1.          0.          0.          0.08036001  1.          0.
  0.09376826  1.          0.          0.          0.          0.10082078]
[[ 0.5627054  0.4372946]]
[[ 0.51251607  0.48748393]]
[[ 0.05018933 -0.05018933]]




In [18]:
# Final point of the search as a plain Python list (full float precision).
print(currentX.tolist())

[54.5, 1.0, 3.5, 107.0, 249.65925925925927, 1.0, 0.5, 149.67777777777778, 1.0, 1.7000000000000002, 1.5, 0.5, 4.696296296296296]
