# Kaggle Challenge

In [7]:
# Dictionaries that will be used to convert categorical data to integers.
# The integer code of a label is its 0-based position in the list it is
# declared in, so the order of the labels below is significant.

def _codes(*labels):
    """Return a dict mapping each label to its position in ``labels``."""
    return {label: code for code, label in enumerate(labels)}

job_dict = _codes(
    'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
    'retired', 'self-employed', 'services', 'student', 'technician',
    'unemployed', 'unknown',
)
mar_dict = _codes('divorced', 'married', 'single', 'unknown')
edu_dict = _codes(
    'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate',
    'professional.course', 'university.degree', 'unknown',
)
# Shared by every no/unknown/yes column.
nyu_dict = _codes('no', 'unknown', 'yes')
con_dict = _codes('cellular', 'telephone')
mon_dict = _codes(
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
)
dow_dict = _codes('mon', 'tue', 'wed', 'thu', 'fri')
poc_dict = _codes('failure', 'nonexistent', 'success')

import numpy as np
from sklearn.metrics import roc_auc_score

# Sanitize the data
def sanitize(dataset):
    """Convert a 2-D array of raw string fields into a float64 matrix.

    Columns 0, 10-12 and 14-18 are parsed as numbers. Columns 1-9 and 13
    hold categorical labels wrapped in one leading and one trailing
    character (presumably the CSV quote marks - TODO confirm); the wrapper
    is stripped and the label is replaced by its integer code from the
    module-level lookup dictionaries. When the input has exactly 20
    columns, column 19 (the class label) is parsed as a number too.

    Args:
        dataset: 2-D NumPy array of strings (byte strings or text) with at
            least 19 columns. The input is not modified.

    Returns:
        A new ``np.float64`` array with the same shape as ``dataset``.
        Columns beyond the ones listed above are left uninitialised.

    Raises:
        KeyError: if a categorical cell holds a label missing from its
            lookup dictionary.
        ValueError: if a numeric cell cannot be parsed as a float.
    """
    n_rows = len(dataset)
    # Read the width from the shape so a zero-row input does not fail on
    # ``dataset[0]``.
    n_cols = dataset.shape[1]
    data = np.empty((n_rows, n_cols), dtype=np.float64)

    # Convert data to float which are numeric
    numeric_columns = [0, 10, 11, 12, 14, 15, 16, 17, 18]
    # If input has class labels then convert them too
    if n_cols == 20:
        numeric_columns.append(19)
    for col in numeric_columns:
        data[:, col] = dataset[:, col].astype(np.float64)

    # Replace categorical data with their corresponding numeric values
    # from the dictionaries
    categorical_columns = (
        (1, job_dict),
        (2, mar_dict),
        (3, edu_dict),
        (4, nyu_dict),
        (5, nyu_dict),
        (6, nyu_dict),
        (7, con_dict),
        (8, mon_dict),
        (9, dow_dict),
        (13, poc_dict),
    )
    # range() instead of xrange() so the loop runs on Python 2 and 3.
    for row in range(n_rows):
        for col, mapping in categorical_columns:
            raw = dataset[row, col]
            # On Python 3 an "|S50" array yields bytes, which never match
            # the str keys of the dictionaries; decode them first. On
            # Python 2 bytes *is* str, so this branch is never taken.
            if isinstance(raw, bytes) and not isinstance(raw, str):
                raw = raw.decode('latin-1')
            # [1:-1] drops the first and last character around the label.
            data[row, col] = mapping[raw[1:-1]]
    return data

# Initialize dataset from csv files
# Every field is read as a fixed-width byte string; sanitize() performs the
# actual numeric conversion afterwards.
dataset = np.genfromtxt("trainData.csv", delimiter=',', dtype="|S50", autostrip=True)
# Drop the first row (presumably the CSV header - TODO confirm)
dataset = dataset[1:]

# Separate the dataset based on class labels (last column).
# Byte-string literals are used because the array dtype is "|S50": on
# Python 3 a comparison with a text literal would select no rows, while on
# Python 2 b'0' and '0' are the same object type.
dataset_0 = dataset[dataset[:,-1] == b'0']
dataset_1 = dataset[dataset[:,-1] == b'1']

# Split the dataset into train and validation set:
# the first 4/5 of each class go to training, the remainder to validation,
# so both splits keep the class proportions of the full dataset.
cut_0 = 4*len(dataset_0)//5
cut_1 = 4*len(dataset_1)//5
trainset = np.concatenate((dataset_0[:cut_0], dataset_1[:cut_1]), axis=0)
validationset = np.concatenate((dataset_0[cut_0:], dataset_1[cut_1:]), axis=0)

# Sanitize the dataset
data = sanitize(dataset)
train = sanitize(trainset)
valid = sanitize(validationset)

# Validation set, separate features and class labels
valid_f = valid[:,:-1]
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
valid_v = valid[:,-1].astype(int)

# Load testset and sanitize it
testset = np.genfromtxt("testData.csv",delimiter=',',dtype="|S50",autostrip=True)
# Drop the first row (presumably the CSV header - TODO confirm)
testset = testset[1:]
# Drop the first column (presumably a row id - TODO confirm) so the
# remaining columns line up with the training features
testset = testset[:,1:]
test = sanitize(testset)

# Train classifier clf
def train_clf(clf):
    """Fit ``clf`` on the training split and print its validation ROC AUC.

    The classifier is fitted in place on the module-level ``train`` matrix
    (all columns but the last are features, the last column is the class
    label) and scored on the module-level ``valid_f`` / ``valid_v``.

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.

    Returns:
        None. The score is printed to standard output.
    """
    # Fit train set
    # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
    clf.fit(train[:,:-1], train[:,-1].astype(int))
    # Predict on validation set
    valid_p = clf.predict_proba(valid_f)
    # Calculate and print the roc auc score, using the second probability
    # column as the score for each validation row
    result = valid_p[:,1]
    # Parenthesised single-argument form prints the same text on Python 2
    # and is valid syntax on Python 3.
    print(roc_auc_score(valid_v, result))

# Test classifier clf and print output to file
def test_clf(clf, file):
    """Refit ``clf`` on the whole dataset and write test predictions to a CSV.

    The classifier is fitted in place on the module-level ``data`` matrix
    (all columns but the last are features, the last column is the class
    label), then used to predict the module-level ``test`` matrix. The
    output file has an ``Id,Class`` header followed by one line per test
    row: a 1-based row number and the second ``predict_proba`` column.

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        file: path of the CSV file to create (overwritten if it exists).

    Returns:
        None.
    """
    # Fit the whole dataset
    # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
    clf.fit(data[:,:-1], data[:,-1].astype(int))
    # Predict on the test set
    test_p = clf.predict_proba(test)
    n_rows = len(test)
    # np.float64 is what the removed np.float alias resolved to.
    test_f = np.empty((n_rows, 2), dtype=np.float64)
    # Column 0: 1-based ids; column 1: the second probability column.
    test_f[:,0] = np.arange(1, n_rows + 1)
    test_f[:,1] = test_p[:,1]
    # Write output to file
    with open(file, 'wb') as f:
        f.write(b'Id,Class\n')
        np.savetxt(f, test_f, '%d,%f', delimiter=',')

In [8]:
# Submission 1

from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting model with a fixed seed for reproducible results.
clf1 = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=7,
    min_samples_split=150,
    random_state=18,
)
# Print the validation ROC AUC, then refit on all data and write the
# submission file.
train_clf(clf1)
test_clf(clf1, 'sub1.csv')

0.799823332114


In [9]:
# Submission 2

from sklearn.ensemble import RandomForestClassifier

# Random forest model with a fixed seed for reproducible results.
clf2 = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=12,
    min_samples_split=150,
    class_weight='balanced',
    random_state=18,
)
# Print the validation ROC AUC, then refit on all data and write the
# submission file.
train_clf(clf2)
test_clf(clf2, 'sub2.csv')

0.799305935894


# Submissions

I have done no normalization (because doing so made my score worse); I only converted all data to float. I separated the dataset into a train set and a validation set to get an estimate of the ROC AUC score, and then trained again on the whole dataset before predicting on the test set.

The classifiers used for the various submissions and their corresponding ROC AUC scores are given below. Submissions 1 and 2 are used for calculating scores on the private leaderboard.

Submission 1 : 0.799823332114
```py
clf = GradientBoostingClassifier(random_state=18,max_depth=7,n_estimators=100,min_samples_split=150)
```
Submission 2 : 0.799305935894
```py
clf = RandomForestClassifier(criterion='entropy',max_depth=12,class_weight='balanced',min_samples_split=150,n_estimators=200,random_state=18)
```
Submission 3 : 0.792994029477
```py
clf = ExtraTreesClassifier(random_state=18,criterion='entropy',max_depth=15,n_estimators=250,min_samples_split=100,class_weight='balanced')
```
Submission 4 : 0.77123309764
```py
clf = RandomForestClassifier(criterion='entropy',max_depth=5,class_weight='balanced',min_samples_split=0.25,n_estimators=200,oob_score=True,random_state=18)
```
Submission 5 : 0.800019811691
```py
clf = GradientBoostingClassifier(random_state=18,max_depth=7,n_estimators=100,min_samples_split=0.2,min_samples_leaf=0.01)
```
Submission 6 : 0.803128077668
```py
clf = GradientBoostingClassifier(random_state=18,max_depth=9,n_estimators=150,min_samples_split=0.1,min_samples_leaf=0.01)
```