In [1]:
import csv
import numpy as np
import math
from sklearn import svm
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Read csv to list
with open('bank-full.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=';')
    content = []
    for row in reader:
        content.append(row)
attributes = content[0]
content = content[1:]
print 'Attributes:', attributes
print 'Number of data', len(content)

Attributes: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
Number of data 45211


In [7]:
len([x for x in content if x[16] == 'yes' ])

5289

In [8]:
# Encode data
data = []
for row in content:
    thisData = {}
    for i, att in enumerate(row):
        if attributes[i] in set(['age','balance','day','duration','campaign','pdays','previous']):
            thisData[attributes[i]] = int(att)
        elif attributes[i] == 'month': # jan to dec
             thisData[attributes[i]] = att
        elif attributes[i] in ['default', 'housing', 'loan', 'y']: #['yes', 'no']
            thisData[attributes[i]] = 1 if att == 'yes' else 0
        elif attributes[i] == 'job': #['management', 'retired', 'self-employed', 'unknown', 'unemployed', 'housemaid', 'admin.', 'entrepreneur', 'services', 'student', 'technician', 'blue-collar']
            thisData[attributes[i]] = att
        elif attributes[i] == 'marital': #['single', 'married', 'divorced']
            thisData[attributes[i]] = att
        elif attributes[i] == 'education': #['unknown', 'primary', 'tertiary', 'secondary']
            thisData[attributes[i]] = att 
        elif attributes[i] == 'contact':# ['unknown', 'telephone', 'cellular']
            thisData[attributes[i]] = att
        elif attributes[i] == 'poutcome': #['unknown', 'other', 'success', 'failure']
            thisData[attributes[i]] = att
        else:
            print 'error'
    data.append(thisData)

In [9]:
# Split data to 6:2:2
np.random.seed(0)
np.random.shuffle(data)
trainingData = data[:int(len(data)*0.6)]
validationData = data[int(len(data)*0.6):int(len(data)*0.8)]
testData = data[int(len(data)*0.8):]

In [86]:
len([x for x in trainingData if x['y'] == 1 ]) * 1. / len(trainingData)

0.11597729115977291

In [98]:
len([x for x in content if x[16] == 'yes' ])

5289

In [12]:
educationDic = {'unknown':0, 'primary':1, 'secondary':2, 'tertiary':3}
def encodeEducation(education):
    feat = [0]*4
    feat[educationDic[education]] = 1
    return feat

In [13]:
maritalDic = {'single':0, 'married':1, 'divorced':2}
def encodeMarital(marital):
    feat = [0]*3
    feat[maritalDic[marital]] = 1
    return feat

In [14]:
jobs = list(set([d['job'] for d in data]))
jobDic = {j:i for i,j in enumerate(jobs)}
def encodeJob(job):
    feat = [0]*len(jobs)
    feat[jobDic[job]] = 1
    return feat

In [15]:
contactDic = {'unknown':0, 'telephone':1, 'cellular':2}
def encodeContact(contact):
    feat = [0] * 3
    feat[contactDic[contact]] = 1
    return feat

In [287]:
import calendar
monthDic = {v.lower(): k for k,v in enumerate(calendar.month_abbr)}
def encodeMonth(month):
    feat = [0] * 13
    feat[monthDic[month]] = 1
    return feat

In [288]:
outcomeDic = {'unknown':0, 'other':1, 'failure':2, 'success':3}
def encodeOutcome(outcome):
    feat = [0] * 4
    feat[outcomeDic[outcome]] = 1
    return feat

In [294]:
def encodeAge(age):
    if age <= 25 or age >= 60:
        return [1]
    else:
        return [0]

In [311]:
def generateFeature(dataset):
    features = []
    for data in dataset:
#         feat = [data['age']] + encodeJob(data['job']) + encodeMarital(data['marital']) + encodeEducation(data['education']) \
#             + [data['default'], data['balance'], data['housing'], data['loan']] \
#         + encodeContact(data['contact']) + [data['day']] + encodeMonth(data['month']) + [data['duration']] \
#         + [data['campaign'], data['pdays'], data['previous']] + encodeOutcome(data['poutcome'])
        
        feat = [data['age'], data['duration'], data['previous'], data['pdays'], data['campaign']] + encodeMonth(data['month']) + encodeOutcome(data['poutcome'])

        features.append(feat)
    return features

In [304]:
cwList = []
trainList = []
validList = []
def errorRate(pred, label):
    truePositives = len(filter(lambda x: x[0] == x[1] == True, zip(pred, label)))
    trueNegatives = len(filter(lambda x: x[0] == x[1] == False, zip(pred, label)))
    falsePositives = len(filter(lambda x: x[0] == True and x[1] == False, zip(pred, label)))
    falseNegatives = len(filter(lambda x: x[0] == False and x[1] == True, zip(pred, label)))
    truePositiveRate = truePositives * 1. / (truePositives + falseNegatives)
    trueNegativeRate = trueNegatives * 1. / (trueNegatives + falsePositives)
    balanceErrorRate = 1 - 0.5 * (truePositiveRate + trueNegativeRate)
    print "True Positives:", truePositives
    print "True Negatives:", trueNegatives
    print "False Positives:", falsePositives
    print "False Negatives:", falseNegatives
    print "True Positive Rate:", truePositiveRate
    print "True Negative Rate:", trueNegativeRate
    print "Balanced Error Rate:", balanceErrorRate
    return truePositives, trueNegatives, falsePositives, falseNegatives, truePositiveRate, trueNegativeRate, balanceErrorRate

In [305]:
X_train = generateFeature(trainingData)
y_train = [d['y'] for d in trainingData]
X_validation = generateFeature(validationData)
y_validation = [d['y'] for d in validationData]
X_test = generateFeature(testData)
y_test = [d['y'] for d in testData]

In [306]:
clf = svm.SVC(C=0.1, class_weight = {0:1, 1:5})
# from sklearn import linear_model
# clf = linear_model.Ridge(1.0, fit_intercept = True)
# clf = linear_model.LinearRegression(fit_intercept=True)
clf.fit(X_train, y_train)

SVC(C=0.1, cache_size=200, class_weight={0: 1, 1: 5}, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [307]:
trainingPredict = clf.predict(X_train)

In [308]:
validationPredict = clf.predict(X_validation)

In [309]:
errorRate(validationPredict, y_validation)

True Positives: 0
True Negatives: 7972
False Positives: 0
False Negatives: 1070
True Positive Rate: 0.0
True Negative Rate: 1.0
Balanced Error Rate: 0.5


(0, 7972, 0, 1070, 0.0, 1.0, 0.5)

In [310]:
trainList.append(errorRate(trainingPredict, y_train))
validList.append(errorRate(validationPredict, y_validation))

True Positives: 1
True Negatives: 23980
False Positives: 0
False Negatives: 3145
True Positive Rate: 0.000317863954228
True Negative Rate: 1.0
Balanced Error Rate: 0.499841068023
True Positives: 0
True Negatives: 7972
False Positives: 0
False Negatives: 1070
True Positive Rate: 0.0
True Negative Rate: 1.0
Balanced Error Rate: 0.5


In [200]:
len([x for x in content if x[6] == 'yes']) * 1. / len(content)

0.5558381809736569

In [289]:
trainingData[0]['age'] < 20

False

In [312]:
len([1,2,4,])

3