In [1]:
from sklearn import tree
import csv
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
import math

In [2]:
# shuffle and split training and test sets
def splitTrainTest(splitRatio,fullData):
    if not fullData:
        dataFull = pd.read_csv("../data/filteredData.csv")
    else:
        dataFull = pd.read_csv("../data/train.csv")
        
    header = dataFull.columns
    lastIndex = len(header) - 1
    
    trainDataAttribs = pd.DataFrame(dataFull,columns = header[:lastIndex])
    targetVar = pd.DataFrame(dataFull,columns = [header[lastIndex]])
    
    X_train, X_test, y_train, y_test = train_test_split(trainDataAttribs, targetVar, test_size=splitRatio,random_state=0)
    
    return X_train, X_test, y_train, y_test

In [3]:
def treeClassifier(trainTestProp,fullDataParam = False):    
    X_train, X_test, y_train, y_test = splitTrainTest(trainTestProp,fullDataParam)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    
    # Logic for calculating accuracy
    correctPred = 0
    for i in range(len(predictions)):
        if predictions[i] == y_test.iat[i,0]:
            correctPred += 1
    accuracy = correctPred * 100/len(predictions)
    
    
    trueNeg = 0
    truePos = 0
    falsePos = 0
    falseNeg = 0
    
    totalPositives = 0
    totalNegatives = 0
    for i in range(len(predictions)):
        if y_test.iat[i,0] == 1:
            totalPositives += 1
        else:
            totalNegatives += 1
            
    for i in range(len(predictions)):
        # logic for calculating True Negatives
        if predictions[i] == y_test.iat[i,0] and predictions[i] == 0:
             trueNeg += 1

        # logic for calculating True positives        
        if predictions[i] == y_test.iat[i,0] and predictions[i] == 1:
             truePos += 1
        
        # logic for calculating False positives        
        if predictions[i] != y_test.iat[i,0] and predictions[i] == 1:
             falsePos += 1

        # logic for calculating False negatives        
        if predictions[i] != y_test.iat[i,0] and predictions[i] == 0:
             falseNeg += 1
      
    tpf = truePos/totalPositives
    fpf = falsePos/totalPositives
    tnf = trueNeg/totalNegatives
    fnf = falseNeg/totalNegatives
#     print("Accuracy %f" %accuracy)
#     print("True positives: %d Total positives: %d" %(truePos,totalPositives))
#     print("False positives: %d Total positives: %d" %(falsePos,totalPositives))
#     print("True negatives: %d Total negatives: %d" %(trueNeg,totalNegatives))
#     print("False negatives: %d Total negatives: %d" %(falseNeg,totalNegatives))
    
#     print("TPF %f" %tpf)
#     print("FPF %f" %fpf)
#     print("TNF %f" %tnf)
#     print("FNF %f" %fnf) 
    
    return accuracy,tpf,fpf,tnf,fnf

In [4]:
# after attribute selection
TPF = []
FPF = []
TNF = []
FNF = []
accuracy = []
for i in np.arange(0.1,1,0.1):
    acc,tpf,fpf,tnf,fnf = treeClassifier(i)
    accuracy.append(acc)
    TPF.append(tpf)
    FPF.append(fpf)
    TNF.append(tnf)
    FNF.append(fnf)
    
# plt.figure(1)
# plt.plot(np.arange(0.1,1,0.1),accuracy)

In [8]:
# calculate the area of a triangle given by points on roc curve
areaAll = []

for i in range(len(FNF)):
    start = (0,0) # A
    between = (FNF[i],TNF[i]) # B
    end = (1,1) # C

    AB = math.sqrt((start[0] - between[0])**2 + (start[1] - between[1])**2)
    BC = math.sqrt((end[0] - between[0])**2 + (end[1] - between[1])**2)
    AC = math.sqrt((start[0] - end[0])**2 + (start[1] - end[1])**2)

    s = (AB + BC + AC) / 2
    area = (s*(s-AB)*(s-BC)*(s-AC)) ** 0.5
    areaAll.append(area)


In [9]:
## Print output 
for i in range(len(accuracy)):
    print("Current split %f - %f" %((i*10)+10,(100-i*10)-10))
    print("Accuracy : %f" %(accuracy[i]))
    print("Area under the ROC curve : %f" %areaAll[i])
    print()

Current split 10.000000 - 90.000000
Accuracy : 92.291502
Area under the ROC curve : 0.459692

Current split 20.000000 - 80.000000
Accuracy : 92.745330
Area under the ROC curve : 0.462148

Current split 30.000000 - 70.000000
Accuracy : 92.747841
Area under the ROC curve : 0.462225

Current split 40.000000 - 60.000000
Accuracy : 92.768350
Area under the ROC curve : 0.462316

Current split 50.000000 - 50.000000
Accuracy : 92.636148
Area under the ROC curve : 0.461640

Current split 60.000000 - 40.000000
Accuracy : 92.802333
Area under the ROC curve : 0.462521

Current split 70.000000 - 30.000000
Accuracy : 92.547214
Area under the ROC curve : 0.461202

Current split 80.000000 - 20.000000
Accuracy : 92.582544
Area under the ROC curve : 0.461389

Current split 90.000000 - 10.000000
Accuracy : 92.370429
Area under the ROC curve : 0.460286



In [16]:
# Accuracy plot
x = np.arange(0.1,1,0.1)
z = np.polyfit(x,accuracy, 2)
f = np.poly1d(z)

x_new = np.linspace(x[0], x[-1], 50)
y_new = f(x_new)

plt.figure(1)
plt.plot(x,accuracy,'o')
plt.plot(x_new,y_new,lw=2)
plt.xlabel("Train data - Test data split")
plt.ylabel("Accuracy percentage")
plt.title("Train-test split versus accuracy, fitted to a 2 degree polynomial curve")

# Plot for negative estimations
z = np.polyfit(FNF, TNF, 2)
f = np.poly1d(z)

x_new = np.linspace(FNF[0], FNF[-1], 50)
y_new = f(x_new)

plt.figure(2)
plt.plot(FNF,TNF,'o')
plt.xlabel("False Unsatisfied Customer Fraction")
plt.ylabel("True Unsatisfied Customer Fraction")
plt.title("ROC curve candidate points for all Unsatisfied Customers")

plt.figure(3)
plt.plot(FNF,TNF,'o', x_new, y_new)
plt.plot([0, x_new[49]], [0,y_new[49]],lw=2)
plt.plot([x_new[0],1],[y_new[0],1], lw=2)
plt.plot([0, 0.5,1],[0, 0.5,1], lw=2)
plt.xlabel("False Unsatisfied Customer Fraction")
plt.ylabel("True Unsatisfied Customer Fraction")
plt.title("ROC curve - representational area under the curve, zoom to the top left to see all points")

# # Plot for positive estimations
# z = np.polyfit(FPF, TPF, 2)
# f = np.poly1d(z)

# x_new = np.linspace(FPF[0], FPF[-1], 50)
# y_new = f(x_new)

# plt.figure(3)
# plt.plot(FNF,TNF,'o', x_new, y_new)
# plt.plot([0, x_new[49]], [0,y_new[49]],lw=2)
# plt.plot([x_new[0],1],[y_new[0],1], lw=2)
# plt.plot([0, 0.5,1],[0, 0.5,1], lw=2)

plt.show()