In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from classifiers import TransparentLogisticRegression
from matplotlib import pylab as pl
from scipy.sparse import diags
from IPython import display
from scale import decision_tree_scale

In [2]:
# car
dataset = "car.csv"
class_index = 21
num_cols = 22
classes = ['acc', 'unacc']

In [3]:
read_cols = [i for i in range(num_cols) if i != class_index]
file_path = "D:\\IIT_Master\\2016 Spring\\CS597\\uci\\uci\\uci-tar\\nominal\\"+dataset

In [4]:
with open(file_path, 'r') as f:
    header = f.readline()
    #print header
    #header = np.fromstring("a, b", dtype=np.str_, sep=',')
    header = np.array(header.split(','))
    feature_names = header[read_cols]
    
    print header

['buying=vhigh' 'buying=high' 'buying=med' 'buying=low' 'maint=vhigh'
 'maint=high' 'maint=med' 'maint=low' 'doors=2' 'doors=3' 'doors=4'
 'doors=5more' 'persons=2' 'persons=4' 'persons=more' 'lug_boot=small'
 'lug_boot=med' 'lug_boot=big' 'safety=low' 'safety=med' 'safety=high'
 'class\n']


In [5]:
X = np.loadtxt(file_path, dtype=float, delimiter=",", skiprows=1, \
                   usecols=read_cols)
y = np.loadtxt(file_path, dtype=int, delimiter=",", skiprows=1, \
                   usecols=(class_index,), converters={class_index: lambda x: classes.index(x)})

X_train_, X_keep_test, y_train_, y_test_ = train_test_split(X, y, test_size=0.33, random_state=42)

X = scale(X)


In [6]:
# Determine binary features
num_features = X.shape[1]
non_binary = []
binary = []
for i in range(num_features):
    if len(np.unique(X[:,i])) != 2:
        non_binary.append(i)
    else:
        binary.append(i)


In [7]:
# Scaling the train instances


X_n = X.copy()

if len(non_binary) > 0:
    X_n[:,non_binary]=scale(X[:,non_binary])
    

# for binary features, replace zeros with -1, assuming the other values are 1; a more correct way would check if this was true.
if len(binary) > 0:
    X_b = X_n[:,binary]
    X_b[X_b == 0] = -1
    X_n[:,binary] = X_b
    
X_train, X_test, y_train, y_test = train_test_split(X_n, y, test_size=0.33, random_state=42)



In [8]:
clf = TransparentLogisticRegression()
clf.fit(X_train, y_train)

TransparentLogisticRegression(C=1.0, class_weight=None, dual=False,
               fit_intercept=True, intercept_scaling=1, max_iter=100,
               multi_class='ovr', n_jobs=1, penalty='l2',
               random_state=None, solver='liblinear', tol=0.0001,
               verbose=0, warm_start=False)

In [9]:
y_prediction = clf.predict(X_test)
neg_evi, pos_evi = clf. predict_evidences(X_test)
y_prediction_proba = clf.predict_proba(X_test)

In [10]:
# most positive, "unacc" -- Probability

Most_positive = np.argmax(y_prediction_proba[:,1])

print X_keep_test[Most_positive]

[ 0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  0.
  1.  0.  0.]


In [11]:
# most negative, "acc" -- Probability

Most_negative = np.argmax(y_prediction_proba[:,0])

print X_keep_test[Most_negative]

[ 0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.
  0.  0.  1.]


In [12]:
# most positive, "unacc" -- Evidence 

positive_evi_index = np.argmax(pos_evi)

print X_keep_test[positive_evi_index]

[ 1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  1.  0.  0.
  1.  0.  0.]


In [13]:
# most negative, "acc" -- Evidence

negative_evi_index = np.argmax(abs(neg_evi))

# print neg_evi[]

print X_keep_test[negative_evi_index]

[ 0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.
  0.  0.  1.]


In [14]:
# unc_1 Top 1 uncertain instances

uncertains = np.min(y_prediction_proba, axis=1)

uis = np.argsort(uncertains)[::-1]

top_10_uis = uis[:10]

print X_keep_test[uis[0]]

[ 0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  1.  0.
  0.  0.  1.]


In [15]:
# unc_ce from Top 10 uncertain instances
# print top_10_uis

# min the evidence, then argmax to find the most conflicted instances among top 10 uncertain instances

min_evidence_top_10 = np.min([abs(neg_evi[top_10_uis]),abs(pos_evi[top_10_uis])], axis=0 )

index_ce = np.argmax(min_evidence_top_10)

# print min_evidence_top_10
# print index_ce
# print top_10_uis[index_ce]

print X_keep_test[top_10_uis[index_ce]]

[ 1.  0.  0.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  1.  0.  0.
  0.  0.  1.]


In [16]:
# unc_ie from Top 10 uncertain instances 

# max the nag/pos evidence, then argmin to find least conflicted instances among top 10 uncertain instances

max_evidence_top_10 = np.max([abs(neg_evi[top_10_uis]),abs(pos_evi[top_10_uis])], axis=0 )

index_ie = np.argmin(min_evidence_top_10)

# print min_evidence_top_10
# print index_ie
# print top_10_uis[index_ie]

print X_keep_test[top_10_uis[index_ie]]

[ 0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.
  0.  1.  0.]


In [17]:
# Least negative among Top 10 possitive 

top_positive_index = np.argsort(pos_evi)[::-1]

# print neg_evi[top_possitive_index]

tp = top_positive_index[:10]

# Negative information

# print pos_evi[tp]
# print neg_evi[tp]

neg_info = neg_evi[tp]

least_neg_index = np.argwhere(neg_info == np.amax(neg_info))

least_neg_index = least_neg_index.flatten().tolist()

# print neg_info[least_neg_index]

print X_keep_test[tp[least_neg_index]]

[[ 0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  0.
   1.  0.  0.]]


In [18]:
# Least positive among Top 10 negative instances

top_negative_index = np.argsort(abs(neg_evi))[::-1]
tp = top_negative_index[:10]

# positive information
                                
# print neg_evi[tp]
# print pos_evi[tp]

pos_info = pos_evi[tp]

least_pos_index = np.argwhere(pos_info == np.amin(pos_info))

least_pos_index = least_pos_index.flatten().tolist()

# print neg_info[least_neg_index]

print X_keep_test[tp[least_pos_index]]


[[ 0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.
   0.  0.  1.]]
