In [9]:
import pandas as pd
import numpy as np
import os


DATAPATH = 'F:/srikanth/data/k_data/AV/Mckinsay_Hackathon'

train = pd.read_csv(os.path.join(DATAPATH,'train.csv'))
test = pd.read_csv(os.path.join(DATAPATH,'test.csv'))
print('shape of train file:-', np.shape(train))
print('shape of test file:-', np.shape(test))

shape of train file:- (69713, 22)
shape of test file:- (30037, 21)


In [11]:
#train = pd.read_csv(os.path.join(DATAPATH,"train_preprocessed.csv"))
#test = pd.read_csv(os.path.join(DATAPATH,"test_preprocessed.csv"))
#labels = pd.read_csv(os.path.join(DATAPATH,"train_labels.csv"), header = None)
#test_ids = pd.read_csv(os.path.join(DATAPATH,"test_ids.csv"), header = None)
#labels = list(labels.iloc[:,0])
#train['Approved'] = labels
#train.to_csv(os.path.join(DATAPATH,"train_preprocessed_ftrl.csv"), index = False) #for faster loading

Index(['Monthly_Income', 'Existing_EMI', 'Loan_Amount', 'Loan_Period',
       'Interest_Rate', 'EMI', 'Var1', 'noofDays', 'dob_day', 'dob_dayofweek',
       ...
       'Source_S159', 'Source_S161', 'Source_Category_A', 'Source_Category_B',
       'Source_Category_C', 'Source_Category_D', 'Source_Category_E',
       'Source_Category_F', 'Source_Category_G', 'Approved'],
      dtype='object', length=237)


In [12]:
train= os.path.join(DATAPATH,"train_preprocessed_ftrl.csv")
test=os.path.join(DATAPATH,"test_preprocessed.csv")
submission = os.path.join(DATAPATH,"results","ftrl_final.csv")

In [17]:
from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
from random import random
import pickle

#configuring model parameters
alpha = .05 
beta = 1. 
L1 = 0.
L2 = 1.

# C, feature/hash trick
D = 2 ** 24            
interaction = False 
# D, training/validation
epoch = 4       # learn training data for N passes
holdafter = 9   # data after date N (exclusive) are used as validation
holdout = 200  # use every N training instance for holdout validation


In [23]:
class ftrl_proximal(object):
    ''' Our main algorithm: Follow the regularized leader - proximal
        In short,
        this is an adaptive-learning-rate sparse logistic-regression with
        efficient L1-L2-regularization
        Reference:
        http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf
    '''

    def __init__(self, alpha, beta, L1, L2, D, interaction):
        # parameters
        self.alpha = alpha
        self.beta = beta
        self.L1 = L1
        self.L2 = L2

        # feature related parameters
        self.D = D
        self.interaction = interaction

        # model
        # n: squared sum of past gradients
        # z: weights
        # w: lazy weights
        self.n = [0.] * D
        self.z = [random() for k in range(D)]#[0.] * D
        self.w = {}

    def _indices(self, x):
        ''' A helper generator that yields the indices in x
            The purpose of this generator is to make the following
            code a bit cleaner when doing feature interaction.
        '''

        # first yield index of the bias term
        yield 0

        # then yield the normal indices
        for index in x:
            yield index

        # now yield interactions (if applicable)
        if self.interaction:
            D = self.D
            L = len(x)

            x = sorted(x)
            for i in xrange(L):
                for j in xrange(i+1, L):
                    # one-hot encode interactions with hash trick
                    yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D

    def predict(self, x):
        ''' Get probability estimation on x
            INPUT:
                x: features
            OUTPUT:
                probability of p(y = 1 | x; w)
        '''

        # parameters
        alpha = self.alpha
        beta = self.beta
        L1 = self.L1
        L2 = self.L2

        # model
        n = self.n
        z = self.z
        w = {}

        # wTx is the inner product of w and x
        wTx = 0.
        for i in self._indices(x):
            sign = -1. if z[i] < 0 else 1.  # get sign of z[i]

            # build w on the fly using z and n, hence the name - lazy weights
            # we are doing this at prediction instead of update time is because
            # this allows us for not storing the complete w
            if sign * z[i] <= L1:
                # w[i] vanishes due to L1 regularization
                w[i] = 0.
            else:
                # apply prediction time L1, L2 regularization to z and get w
                w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2)

            wTx += w[i]

        # cache the current w for update stage
        self.w = w

        # bounded sigmoid function, this is the probability estimation
        return 1. / (1. + exp(-max(min(wTx, 35.), -35.)))

    def update(self, x, p, y):
        ''' Update model using x, p, y
            INPUT:
                x: feature, a list of indices
                p: click probability prediction of our model
                y: answer
            MODIFIES:
                self.n: increase by squared gradient
                self.z: weights
        '''

        # parameter
        alpha = self.alpha

        # model
        n = self.n
        z = self.z
        w = self.w

        # gradient under logloss
        g = p - y

        # update z and n
        for i in self._indices(x):
            sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha
            z[i] += g - sigma * w[i]
            n[i] += g * g


def logloss(p, y):
    ''' FUNCTION: Bounded logloss
        INPUT:
            p: our prediction
            y: real answer
        OUTPUT:
            logarithmic loss of p given y
    '''

    p = max(min(p, 1. - 10e-15), 10e-15)
    return -log(p) if y == 1. else -log(1. - p)


def data(path, D):
    ''' GENERATOR: Apply hash-trick to the original csv row
                   and for simplicity, we one-hot-encode everything
        INPUT:
            path: path to training or testing file
            D: the max index that we can hash to
        YIELDS:
            ID: id of the instance, mainly useless
            x: a list of hashed and one-hot-encoded 'indices'
               we only need the index since all values are either 0 or 1
            y: y = 1 if we have a click, else we have y = 0
    '''

    for t, row in enumerate(DictReader(open(path), delimiter=',')):

        try:
            ID= row['ID']
            del row['ID']
        except:
            ID = 0
            pass

        # process target.
        y = 0.
        target='Approved'
        #row['I1'] = str(row['Monthly_Income']) + str(row['Var5'])
        row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI'])
        row['I3'] = str(row['Var1']) + str(row['Existing_EMI'])
        row['I4'] = str(row['Var1']) + str(row['noofDays'])
        #row['I5'] = str(row['Var1']) + str(row['Loan_Amount_Submitted'])
        row['I6'] = str(row['Interest_Rate']) + str(row['dob_year'])
        row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day'])
        #row['I7'] = str(row['Loan_Amount']) + str(row['Processing_Fee'])
        #row['I8'] = str(row['Var5']) + str(row['Var4'])
        row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek'])
        #lcd_weekofyear


        if target in row:
            if row[target] == '1':
                y = 1.
            del row[target]

        # extract date

        # turn hour really into hour, it was originally YYMMDDHH


        # build x
        x = []
        for key in row:
            value = row[key]

            # one-hot encode everything with hash trick
            index = abs(hash(key + '_' + value)) % D
            x.append(index)

        yield t, ID, x, y


In [24]:
##############################################################################
# start training #############################################################
##############################################################################

start = datetime.now()
print("started at: %s" % datetime.now())

# initialize ourselves a learner
learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction)

# start training
for e in range(epoch):
    loss = 0.
    count = 0
    for t, ID, x, y in data(train, D):  # data is a generator

        p = learner.predict(x)

        # if (holdout and t % holdout == 0):
        # #     # Estimate progressive validation loss
        #     loss += logloss(p, y)
        #     count += 1
        # else:
        # #     # Use other samples to train the model
        #     learner.update(x, p, y)

        learner.update(x, p, y)
        # if t % 1000000 == 0:
        #     continue

    #print('epoch: %s\tval. logloss: %0.5f\telapsed time: %s' % (e + 1, loss/count, str(datetime.now() - start)))

#import pickle
#pickle.dump(learner,open('ftrl3.p','w'))

##############################################################################
# start testing, and build Kaggle's submission file ##########################
##############################################################################
print ('creating submission file')
with open(submission, 'w') as outfile:
    outfile.write('ID,Approved\n')
    for t, ID, x, y in data(test, D):
        p = learner.predict(x)
        outfile.write('%s,%s\n' % (ID, str(p)))

started at: 2018-01-21 00:58:21.015007
creating submission file


In [30]:
preds = pd.read_csv(os.path.join(DATAPATH,"results","ftrl_final.csv"))
preds['ID'] = test['ID']
preds.to_csv("ftrl_final2.csv", index  = False)

TypeError: string indices must be integers