In [2]:
from sklearn import tree
import csv
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
import math

In [3]:
# shuffle and split training and test sets
def splitTrainTest(splitRatio,fullData):
    if not fullData:
        dataFull = pd.read_csv("../data/filteredData.csv")
    else:
        dataFull = pd.read_csv("../data/train.csv")
        
    header = dataFull.columns
    lastIndex = len(header) - 1
    
    trainDataAttribs = pd.DataFrame(dataFull,columns = header[:lastIndex])
    targetVar = pd.DataFrame(dataFull,columns = [header[lastIndex]])
    
    X_train, X_test, y_train, y_test = train_test_split(trainDataAttribs, targetVar, test_size=splitRatio,random_state=0)
    
    return X_train, X_test, y_train, y_test

In [4]:
def treeClassifier(trainTestProp,fullDataParam = False):    
    X_train, X_test, y_train, y_test = splitTrainTest(trainTestProp,fullDataParam)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    
    # Logic for calculating accuracy
    correctPred = 0
    for i in range(len(predictions)):
        if predictions[i] == y_test.iat[i,0]:
            correctPred += 1
    accuracy = correctPred * 100/len(predictions)
    
    
    trueNeg = 0
    truePos = 0
    falsePos = 0
    falseNeg = 0
    
    totalPositives = 0
    totalNegatives = 0
    for i in range(len(predictions)):
        if y_test.iat[i,0] == 1:
            totalPositives += 1
        else:
            totalNegatives += 1
            
    for i in range(len(predictions)):
        # logic for calculating True Negatives
        if predictions[i] == y_test.iat[i,0] and predictions[i] == 0:
             trueNeg += 1

        # logic for calculating True positives        
        if predictions[i] == y_test.iat[i,0] and predictions[i] == 1:
             truePos += 1
        
        # logic for calculating False positives        
        if predictions[i] != y_test.iat[i,0] and predictions[i] == 1:
             falsePos += 1

        # logic for calculating False negatives        
        if predictions[i] != y_test.iat[i,0] and predictions[i] == 0:
             falseNeg += 1
      
    tpf = truePos/totalPositives
    fpf = falsePos/totalPositives
    tnf = trueNeg/totalNegatives
    fnf = falseNeg/totalNegatives
#     print("Accuracy %f" %accuracy)
#     print("True positives: %d Total positives: %d" %(truePos,totalPositives))
#     print("False positives: %d Total positives: %d" %(falsePos,totalPositives))
#     print("True negatives: %d Total negatives: %d" %(trueNeg,totalNegatives))
#     print("False negatives: %d Total negatives: %d" %(falseNeg,totalNegatives))
    
#     print("TPF %f" %tpf)
#     print("FPF %f" %fpf)
#     print("TNF %f" %tnf)
#     print("FNF %f" %fnf) 
    
    return accuracy,tpf,fpf,tnf,fnf

In [5]:
# after attribute selection
TPF = []
FPF = []
TNF = []
FNF = []
accuracy = []
for i in np.arange(0.1,1,0.1):
    acc,tpf,fpf,tnf,fnf = treeClassifier(i)
    accuracy.append(acc)
    TPF.append(tpf)
    FPF.append(fpf)
    TNF.append(tnf)
    FNF.append(fnf)
    
# plt.figure(1)
# plt.plot(np.arange(0.1,1,0.1),accuracy)

In [12]:
# Accuracy plot
x = np.arange(0.1,1,0.1)
z = np.polyfit(x,accuracy, 2)
f = np.poly1d(z)

x_new = np.linspace(x[0], x[-1], 50)
y_new = f(x_new)

plt.figure(1)
plt.plot(x,accuracy,'o')
plt.plot(x_new,y_new,lw=2)

# Plot for negative estimations
z = np.polyfit(FNF, TNF, 2)
f = np.poly1d(z)

x_new = np.linspace(FNF[0], FNF[-1], 50)
y_new = f(x_new)

plt.figure(2)
plt.plot(FNF,TNF,'o', x_new, y_new)
plt.plot([0, x_new[49]], [0,y_new[49]],lw=2)
plt.plot([x_new[0],1],[y_new[0],1], lw=2)
plt.plot([0, 0.5,1],[0, 0.5,1], lw=2)

# Plot for positive estimations
z = np.polyfit(FPF, TPF, 2)
f = np.poly1d(z)

x_new = np.linspace(FPF[0], FPF[-1], 50)
y_new = f(x_new)

plt.figure(3)
plt.plot(FNF,TNF,'o', x_new, y_new)
plt.plot([0, x_new[49]], [0,y_new[49]],lw=2)
plt.plot([x_new[0],1],[y_new[0],1], lw=2)
plt.plot([0, 0.5,1],[0, 0.5,1], lw=2)

plt.show()

In [23]:
# calculate the area of a triangle given by points on roc curve
for i in range(1,50):
    start = (0,0) # A
    between = (x_new[i],y_new[i]) # B
    end = (1,1) # C

    AB = math.sqrt((start[0] - between[0])**2 + (start[1] - between[1])**2)
    BC = math.sqrt((end[0] - between[0])**2 + (end[1] - between[1])**2)
    AC = math.sqrt((start[0] - end[0])**2 + (start[1] - end[1])**2)

    s = (AB + BC + AC) / 2
    area = (s*(s-AB)*(s-BC)*(s-AC)) ** 0.5
    print(area)

0.4610029380873218
0.4611272866824894
0.4612459706484743
0.4613589899852766
0.4614663446928974
0.46156803477133546
0.4616640602205914
0.4617544210406646
0.4618391172315557
0.4619181487932646
0.4619915157257915
0.4620592180291351
0.4621212557032975
0.4621776287482768
0.4622283371640744
0.46227338095068904
0.4623127601081223
0.46234647463637235
0.46237452453544026
0.4623969098053263
0.46241363044602957
0.46242468645755075
0.4624300778398898
0.46242980459304633
0.4624238667170205
0.4624122642118124
0.4623949970774223
0.46237206531384956
0.4623434689210948
0.46230920789915747
0.4622692822480378
0.4622236919677359
0.46217243705825156
0.4621155175195853
0.46205293335173664
0.461984684554705
0.4619107711284917
0.4618311930730961
0.46174595038851807
0.4616550430747577
0.461558471131815
0.4614562345596906
0.46134833335838316
0.4612347675278936
0.46111553706822184
0.4609906419793677
0.46086008226133096
0.4607238579141119
0.4605819689377109


In [11]:

plt.show()