In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.autograd import Variable
import numpy as np

In [2]:
train_df = pd.read_csv("train.csv", nrows=1000000)
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0


In [3]:
test_df = pd.read_csv("test.csv")
test_df.head()
test_df.shape[0]

18790469

In [4]:
from torch.utils.data.dataset import Dataset

class FraudDataset(Dataset):
    def __init__(self, dataset, train=True):
        self.dataset = dataset
        self.train = train
        if self.train:
            self.train_features = self.dataset.iloc[:, 1:5]
            self.train_label = self.dataset.iloc[:, 7]
        else:
            self.train_features = self.dataset.iloc[:, 2:6]
        
    def __getitem__(self, index):
        if self.train:
            # x_train=features and y_train=labels
            x_train, y_train = self.train_features.iloc[index],  self.train_label.iloc[index]
            x_train = x_train.to_dict()
            a = tuple(x_train.values())
            a = torch.Tensor(a)

            return a, y_train
        else:
            x_train = self.train_features.iloc[index]
            x_train = x_train.to_dict()
            a = tuple(x_train.values())
            a = torch.Tensor(a)
            
            return a

    def __len__(self):
        return len(self.dataset)

In [5]:
data = FraudDataset(test_df, train=False) 
a = data.__len__()
print(a)

18790469


In [6]:
# get the train data
train_data = FraudDataset(train_df)
test_data = FraudDataset(test_df, train=False)

In [7]:
# get the data loader
batch_size = 128
train_loader = torch.utils.data.DataLoader(dataset=train_data, 
                                           batch_size=batch_size, 
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_data, 
                                           batch_size=batch_size, 
                                           shuffle=False)

In [8]:
# check if loder is iterablle
import collections
isinstance(train_loader, collections.Iterable)

True

In [9]:
# # Model
class LogisticRegression(nn.Module):
    def __init__(self, input_features, output_labels):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_features, output_labels)
    
    def forward(self, x):
        out = self.linear(x)
        return out

In [10]:
learning_rate = 0.1
num_epochs = 20


model = LogisticRegression(4, 2)
criterion = nn.CrossEntropyLoss()  
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

losses = []
for epoch in range(1, num_epochs + 1):        
    for i, (x, y) in enumerate(train_loader): 

        x_train = Variable(x)
        y_train = Variable(y)

        optimizer.zero_grad()
        outputs = model(x_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        losses.append(loss.cpu().data[0])
        loss = np.mean(losses)

        if (i+1) % 1000 == 0:
            print ('Epoch: [%d/%d],  Loss: %.4f' 
                   % (epoch, num_epochs, loss))

Epoch: [1/20],  Loss: 7.5065
Epoch: [1/20],  Loss: 7.3989
Epoch: [1/20],  Loss: 7.9319
Epoch: [1/20],  Loss: 7.7962
Epoch: [1/20],  Loss: 8.2628
Epoch: [1/20],  Loss: 7.9650
Epoch: [1/20],  Loss: 7.7739
Epoch: [2/20],  Loss: 8.0245
Epoch: [2/20],  Loss: 7.6734
Epoch: [2/20],  Loss: 7.7821
Epoch: [2/20],  Loss: 7.6990
Epoch: [2/20],  Loss: 7.6815
Epoch: [2/20],  Loss: 7.7207
Epoch: [2/20],  Loss: 7.8538
Epoch: [3/20],  Loss: 7.8429
Epoch: [3/20],  Loss: 7.9278
Epoch: [3/20],  Loss: 7.9054
Epoch: [3/20],  Loss: 7.9489
Epoch: [3/20],  Loss: 7.8753
Epoch: [3/20],  Loss: 7.8539
Epoch: [3/20],  Loss: 7.7840
Epoch: [4/20],  Loss: 7.8197
Epoch: [4/20],  Loss: 7.8281
Epoch: [4/20],  Loss: 7.9183
Epoch: [4/20],  Loss: 7.8251
Epoch: [4/20],  Loss: 7.8128
Epoch: [4/20],  Loss: 7.7796
Epoch: [4/20],  Loss: 7.7492
Epoch: [5/20],  Loss: 7.7246
Epoch: [5/20],  Loss: 7.7664
Epoch: [5/20],  Loss: 7.7492
Epoch: [5/20],  Loss: 7.8194
Epoch: [5/20],  Loss: 7.7914
Epoch: [5/20],  Loss: 7.7662
Epoch: [5/20],

In [11]:
test_pred = pd.DataFrame()
test_pred['click_id'] = test_df['click_id'].astype('int')
test_pred.shape

(18790469, 1)

In [12]:

predicted = []

for x in test_loader:
    x = Variable(x)
    outputs = model(x)
    _, p = torch.max(outputs.data, 1)
    predicted.extend(p)
test_pred['is_attributed'] = predicted
test_pred.head()


Unnamed: 0,click_id,is_attributed
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [13]:
train_predicted = []

for x, y_train in train_loader:
    x = Variable(x)
    outputs = model(x)
    _, p = torch.max(outputs.data, 1)
    train_predicted.extend(p)
train_pred = pd.DataFrame()
train_pred['is_attributed'] = train_predicted
train_true = train_df['is_attributed']

# calculate the ROC AUC
# true postive vs false postitive

from sklearn.metrics import roc_auc_score
# numpy array as the inputs
roc_auc_score(train_true.values, train_pred.values)

0.4999969949123867

In [14]:
# calculate accuracy
# correct = 0
# total = 0
# def get_accuracy(data, predicted):
#     one_count = 0
#     five_count = 0
#     for y_train in data:
#         predictions = predicted[i]
#         label = y_train
#         one_count += label == predictions[0]
#         five_count += label in predictions
#     return float(one_count) / len(predicted), float(five_count) / len(predicted)


In [15]:
# get_accuracy(train_loader, predicted)

In [16]:
# calculate ROC AUC - Area under Receiver Operating characteristic curve.