In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import seaborn as sns
from scipy.sparse import vstack, hstack, csr_matrix, save_npz, load_npz
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F

from utils.schemas import *
from utils.functions import *

In [2]:
# Skip if sparse data has already been saved

In [3]:
df_imp = pd.read_csv('docs/ft_importances_20190811.csv')

In [4]:
df_imp.head(10)

Unnamed: 0,feature,importance
0,TransactionAmt,3.0
1,card1,2.204488
2,C13,2.070567
3,N1,1.881225
4,C1,1.691782
5,card2,1.664059
6,addr1,1.501876
7,N3,1.417283
8,D15,1.38475
9,C14,1.305814


In [5]:
X_cols = df_imp[df_imp.importance > 0.1].feature.to_list()

In [6]:
data_folder = 'input'

In [7]:
train = pd.read_csv(data_folder+'/train_ft_eng_0.csv', dtype = schema_generated_0, usecols=X_cols+['isFraud', 'TransactionDT'])
test = pd.read_csv(data_folder+'/test_ft_eng_0.csv', dtype = schema_generated_0, usecols=X_cols+['TransactionDT'])

In [8]:
train.shape, test.shape

((590540, 245), (506691, 244))

In [9]:
# Try pandas.cut in the future

In [10]:
m = 50000
train_sparse = vstack([csr_matrix(train[X_cols].iloc[i*m:(i+1)*m,:]) for i in range(train.shape[0] // m + 1)])
test_sparse = vstack([csr_matrix(test[X_cols].iloc[i*m:(i+1)*m,:]) for i in range(test.shape[0] // m + 1)])
print('Saving train sparse data...')
save_npz(data_folder+'/train.npz', train_sparse, compressed=True)
print('Saving test sparse data...')
save_npz(data_folder+'/test.npz',  test_sparse,  compressed=True)
print('Finished!')

Saving train sparse data...
Saving test sparse data...
Finished!


In [12]:
y_train = csr_matrix(train.isFraud)
save_npz(data_folder+'/y_train.npz',  y_train,  compressed=True)

In [33]:
y_train = torch.from_numpy(y_train.values)

In [17]:
ele_to_drop = ['train', 'test', 'train_sparse', 'test_sparse']

for ele in ele_to_drop:
    try:
        ele = eval(ele)
        del ele
        gc.collect()
    except:
        pass

In [63]:
X_train = load_npz(data_folder+'/train.npz')
X_test = load_npz(data_folder+'/train.npz')

In [64]:
Acoo = X_train.tocoo()
del X_train
gc.collect()

13413

In [65]:
X_train = torch.sparse.LongTensor(torch.LongTensor([Acoo.row.tolist(), Acoo.col.tolist()]),
                              torch.LongTensor(Acoo.data.astype(np.int32)))
del Acoo
gc.collect()

0

In [111]:
X_train = X_train.type(torch.FloatTensor)
y_train = y_train.type(torch.FloatTensor)

In [104]:
X_train.size()[1]

243

In [112]:
#our class must extend nn.Module
class MyClassifier(nn.Module):
#     def __init__(self):
#         super(MyClassifier,self).__init__()
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(X_train.size()[1], 50)
        self.relu1 = nn.ReLU()
        self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(50, 100)
        self.prelu = nn.PReLU(1)
        self.out = nn.Linear(100, 1)
        self.out_act = nn.Sigmoid()
        
    def forward(self, input_):
        a1 = self.fc1(input_)
        h1 = self.relu1(a1)
        dout = self.dout(h1)
        a2 = self.fc2(dout)
        h2 = self.prelu(a2)
        a3 = self.out(h2)
        y = self.out_act(a3)
        return y
        
    #This function takes an input and predicts the class, (0 or 1)        
    def predict(self,x):
        #Apply softmax to output. 
        pred = F.softmax(self.forward(x))
        ans = []
        #Pick the class with maximum weight
        for t in pred:
            if t[0]>t[1]:
                ans.append(0)
            else:
                ans.append(1)
        return torch.tensor(ans)

In [113]:
torch.set_num_threads(8)

In [114]:
#Initialize the model        
model = MyClassifier()
model = torch.nn.DataParallel(model)
#Define loss criterion
criterion = nn.BCELoss()
#Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [115]:
#Number of epochs
epochs = 2
#List to store losses
losses = []
for i in range(epochs):
    print('Training epoch {}'.format(i))
    #Precit the output for Given input
    y_pred = model.forward(X_train)
    #Compute Cross entropy loss
    loss = criterion(y_pred,y_train)
    #Add loss to the list
    losses.append(loss.item())
    #Clear the previous gradients
    optimizer.zero_grad()
    #Compute gradients
    loss.backward()
    #Adjust weights
    optimizer.step()

Training epoch 0
Training epoch 1


In [116]:
losses

[1.0685430765151978, 1.0662041902542114]

In [94]:
torch.get_num_threads()

4

In [29]:
class Net(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(50, 50)
        self.relu1 = nn.ReLU()
        self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(50, 100)
        self.prelu = nn.PReLU(1)
        self.out = nn.Linear(100, 1)
        self.out_act = nn.Sigmoid()
        
    def forward(self, input_):
        a1 = self.fc1(input_)
        h1 = self.relu1(a1)
        dout = self.dout(h1)
        a2 = self.fc2(dout)
        h2 = self.prelu(a2)
        a3 = self.out(h2)
        y = self.out_act(a3)
        return y
    
net = Net()
opt = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
criterion = nn.BCELoss()

In [52]:
def train_epoch(X, y, model, opt, criterion, batch_size=50):
    model.train()
    losses = []
    for beg_i in range(0, X_train.size(0), batch_size):
        x_batch = X.indices[beg_i:beg_i + batch_size,:]
        y_batch = Y.indices[beg_i:beg_i + batch_size,:]
        x_batch = Variable(x_batch)
        y_batch = Variable(y_batch)

        opt.zero_grad()
        # (1) Forward
        y_hat = model(x_batch)
        # (2) Compute diff
        loss = criterion(y_hat, y_batch)
        # (3) Compute gradients
        loss.backward()
        # (4) update weights
        opt.step()        
        losses.append(loss.data.numpy())
    return losses

In [53]:
e_losses = []
num_epochs = 20
for e in range(num_epochs):
    e_losses += train_epoch(X_train, y_train, net, opt, criterion)
plt.plot(e_losses)

TypeError: 'builtin_function_or_method' object is not subscriptable