In [None]:
import pandas as pd
import numpy as np
import pickle
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import scipy
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

In [None]:
# Mount Google Drive at /content/gdrive; every pickle/CSV path used below lives under this mount point.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read the (train, test) DataFrame pair that was pickled earlier.
# NOTE: pickle.load can execute arbitrary code stored in the file - only load pickles you produced yourself.
with open('/content/gdrive/My Drive/Notebooks Colab/kaggleDF.pickle', 'rb') as handle:
    loadedFrames = pickle.load(handle)
dfTrain, dfTest = loadedFrames

In [None]:
# float32 columns are treated as numerical features; every other dtype is treated as categorical.
numericalColumns = dfTrain.select_dtypes(include='float32').columns.tolist()
categoricalColumns = dfTrain.select_dtypes(exclude='float32').columns.tolist()

# Integer encode the categorical columns (mapping fitted on train, then applied to test)

In [None]:
def factorize(train, test, col):
    """Integer-encode column ``col`` of ``train`` and ``test`` in place.

    The mapping is fitted on ``train`` only and then applied to ``test``.
    With K distinct non-null values in ``train[col]`` the codes are:

    * ``0 .. K-1`` - the train values, in sorted order
    * ``K``        - missing value (NaN/None) in either frame
    * ``K + 1``    - value that occurs in ``test`` but never in ``train``

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``col``; the column is overwritten in each.
    col : str
        Name of the column to encode.

    Returns
    -------
    None
        ``train[col]`` becomes an integer column, ``test[col]`` a ``uint32`` column.
    """
    if hasattr(train[col], 'cat'):
        # Categorical columns are converted to plain objects before encoding.
        train[col] = train[col].astype('object')
        test[col] = test[col].astype('object')

    # factorize() marks missing values with -1 and numbers the rest 0..K-1.
    encodedTrain, uniques = train[col].factorize(sort=True)
    missingCode = len(uniques)
    unseenCode = missingCode + 1
    train[col] = np.where(encodedTrain == -1, missingCode, encodedTrain)

    # Look every test value up in the train vocabulary. Missing values are
    # detected with isna() rather than an in-band sentinel, so a legitimate raw
    # value (e.g. -1) can never be mistaken for "missing", and the code given to
    # unseen values is never fed back through the raw-value mapping.
    testValues = test[col]
    isMissing = testValues.isna().to_numpy()
    encodedTest = pd.Index(uniques).get_indexer(testValues)
    encodedTest = np.where(encodedTest == -1, unseenCode, encodedTest)
    encodedTest = np.where(isMissing, missingCode, encodedTest)
    test[col] = encodedTest.astype('uint32')

# Encode every categorical feature (the target column is left untouched) and
# store the resulting integer codes with the pandas 'category' dtype.
featureColumns = [name for name in categoricalColumns if name != "HasDetections"]
for col in featureColumns:
    factorize(dfTrain, dfTest, col)
    for frame in (dfTrain, dfTest):
        frame[col] = frame[col].astype('category')

# Fill NaN values with the mean

In [None]:
# Impute missing numerical values with the TRAIN mean of each column.
# The mean is computed once, before either frame is modified, and the filled
# column is assigned back explicitly: `df[col].fillna(..., inplace=True)` acts on
# a temporary Series and does not update the DataFrame under pandas Copy-on-Write.
for col in numericalColumns:
    trainMean = dfTrain[col].mean()
    dfTrain[col] = dfTrain[col].fillna(trainMean)
    dfTest[col] = dfTest[col].fillna(trainMean)

# Save clean dataset

In [None]:
# Persist the cleaned (train, test) pair so the preprocessing above does not have to be repeated.
cleanFrames = (dfTrain, dfTest)
with open('/content/gdrive/My Drive/Notebooks Colab/cleanKaggleDF.pickle', 'wb') as handle:
    pickle.dump(cleanFrames, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Split the train set into train and validation sets

In [None]:
# Separate features from the target. `drop` already returns a new frame, so no
# extra full-frame copy is needed; `columns=` is used because the positional
# axis argument of DataFrame.drop was removed in pandas 2.0.
X = dfTrain.drop(columns='HasDetections')
y = dfTrain['HasDetections'].copy()
del dfTrain  # free the original frame; X and y hold everything needed from here on
# Hold out 1% of the rows for validation, stratified on the target, with a fixed seed.
xTrain, xVal, yTrain, yVal = train_test_split(X, y, test_size=0.01, stratify=y, random_state=11)
xTrain.head()

# Define the embedding dimensions

In [None]:
# `dfTrain` is deleted right after the train/validation split, so everything
# here is derived from the feature frame `X` (dfTrain without 'HasDetections').

# Categorical features with more than two categories get an embedding;
# maps column name -> number of categories observed in the training data.
embeddedCols = {
    name: len(col.cat.categories)
    for name, col in X.items()
    if name in categoricalColumns and name != 'HasDetections' and len(col.cat.categories) > 2
}
# Materialised as a list so it can be used repeatedly as a pandas column indexer.
embeddedColNames = list(embeddedCols)
# Every remaining feature column is fed to the network as a plain number.
nCont = X.shape[1] - len(embeddedCols)
# (rows in the embedding table, embedding width) per embedded column.
# Width follows the observed category count, capped at 50. The table is sized
# from the largest train code plus 3: with train codes 0..M, the test set can
# additionally contain the "missing" and "unseen" codes (up to M + 2), and an
# index >= number of rows would make nn.Embedding raise at prediction time.
embeddingSizes = [
    (int(X[name].cat.categories.max()) + 3, min(50, (nCategories + 1) // 2))
    for name, nCategories in embeddedCols.items()
]

# Data handling utilities

In [None]:
class MalwareDataset(Dataset):
    """Map-style dataset that serves (categorical, numerical, target) triples.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is not modified.
    y : pandas.Series
        Target values, one per row of ``X``.
    embeddedColNames : iterable of str
        Columns of ``X`` that are fed to embedding layers; all remaining
        columns are treated as numerical inputs.
    """

    def __init__(self, X, y, embeddedColNames):
        # Materialise once so dict views / generators can be used as indexers twice.
        embeddedColNames = list(embeddedColNames)
        # astype() allocates fresh arrays, so the dataset never aliases X and
        # no additional DataFrame copies are required.
        self.x1 = X.loc[:, embeddedColNames].values.astype(np.int64)  # Categorical columns
        self.x2 = X.drop(columns=embeddedColNames).values.astype(np.float32)  # Numerical columns
        self.y = y.values.astype(np.float32)

    def __len__(self):
        """Number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(categorical codes, numerical features, target)`` for row ``idx``."""
        return self.x1[idx], self.x2[idx], self.y[idx]

In [None]:
# Wrap the train and validation splits as torch Datasets.
# NOTE: despite its name, `testDF` holds the validation split (xVal, yVal) here;
# the Submission cell later rebinds the same name to the real test set.
trainDF = MalwareDataset(xTrain, yTrain, embeddedColNames)
testDF = MalwareDataset(xVal, yVal, embeddedColNames)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def toDevice(data, device):
    """Move ``data`` to ``device``.

    A list or tuple is processed element by element (recursively) and returned
    as a list; anything else is moved with ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [toDevice(item, device) for item in data]

class DeviceDataLoader(DataLoader):
    """Wrapper around an existing loader that moves every batch to ``device``.

    Only the wrapped loader and the target device are stored; iteration and
    length are delegated to the wrapped loader.
    """

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __iter__(self):
        # Batches are transferred one at a time, as they are requested.
        yield from (toDevice(batch, self.device) for batch in self.dl)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

# Model definition

In [None]:
class MalwareModel(nn.Module):
    """Feed-forward binary classifier over embedded categorical and numerical inputs.

    Parameters
    ----------
    embbedingSizes : iterable of (int, int)
        One ``(number of embeddings, embedding width)`` pair per categorical input column.
    nCont : int
        Number of numerical input columns.
    """

    def __init__(self, embbedingSizes, nCont):
        super().__init__()
        # One embedding table per categorical column.
        self.embeddings = nn.ModuleList(
            [nn.Embedding(nRows, width) for nRows, width in embbedingSizes]
        )
        # Width of all embedding outputs concatenated together.
        self.nEmb = sum(table.embedding_dim for table in self.embeddings)
        self.nCont = nCont
        # Fully connected stack: (nEmb + nCont) -> 200 -> 70 -> 1
        self.lin1 = nn.Linear(self.nEmb + self.nCont, 200)
        self.lin2 = nn.Linear(200, 70)
        self.lin3 = nn.Linear(70, 1)
        # Batch norm for the raw numerical inputs and for each hidden layer.
        self.bn1 = nn.BatchNorm1d(self.nCont)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(70)
        # Dropout: 0.6 on the embeddings, 0.3 after each hidden layer.
        self.embDrop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)

    def forward(self, xCat, xCont):
        """Return sigmoid outputs of shape ``(batch, 1)``.

        Column ``i`` of ``xCat`` is looked up in the ``i``-th embedding table;
        ``xCont`` is batch-normalised and concatenated with the embeddings.
        """
        lookups = [table(xCat[:, i]) for i, table in enumerate(self.embeddings)]
        embedded = self.embDrop(torch.cat(lookups, 1))
        features = torch.cat([embedded, self.bn1(xCont)], 1)
        hidden = self.drops(torch.relu(self.bn2(self.lin1(features))))
        hidden = self.drops(torch.relu(self.bn3(self.lin2(hidden))))
        return torch.sigmoid(self.lin3(hidden))

In [None]:
# Build the network and place it on the same device the batches are moved to;
# a model left on the CPU fails with a device-mismatch error once inputs are on the GPU.
model = MalwareModel(embeddingSizes, nCont).to(device)

# Training loop functions:

In [None]:
def getOptimizer(model, lr = 1e-4, wd=0.0):
    """Create an Adam optimizer over the parameters of ``model`` that require gradients.

    ``lr`` is the learning rate and ``wd`` the weight decay passed to Adam.
    """
    trainable = (param for param in model.parameters() if param.requires_grad)
    return torch.optim.Adam(trainable, lr=lr, weight_decay=wd)

In [None]:
def trainModel(model, criterion, optim, trainDL):
    """Run one training epoch and return the mean loss per sample.

    Each batch yielded by ``trainDL`` is ``(categorical, numerical, target)``.
    The per-batch loss is weighted by the batch size before averaging.
    """
    model.train()
    samplesSeen, weightedLoss = 0, 0
    for xCat, xCont, target in trainDL:
        nRows = target.shape[0]
        prediction = model(xCat, xCont).view(-1)
        optim.zero_grad()
        batchLoss = criterion(prediction, target)
        batchLoss.backward()
        optim.step()
        samplesSeen += nRows
        weightedLoss += nRows * batchLoss.item()
    return weightedLoss / samplesSeen

def valLoss(model, criterion, validDL):
    """Evaluate ``model`` on ``validDL`` and return ``(mean loss, accuracy)``.

    The model is switched to eval mode and the pass runs under
    ``torch.no_grad()`` so no autograd graph is built during validation.
    Each batch is ``(categorical, numerical, target)``; outputs are rounded to
    0/1 to obtain the predicted class. The result is also printed.
    """
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    with torch.no_grad():
        for x1, x2, y in validDL:
            current_batch_size = y.shape[0]
            out = model(x1, x2).view(-1)
            loss = criterion(out, y)
            # Weight by batch size so a smaller final batch does not skew the mean.
            sum_loss += current_batch_size*(loss.item())
            total += current_batch_size
            pred = torch.round(out)
            correct += (pred == y).float().sum().item()
    print(f"Valid loss: {sum_loss/total}, Accuracy: {correct/total}")
    return sum_loss/total, correct/total

def trainLoop(model, epochs, lr=1e-4, wd=0.0):
    """Train ``model`` for ``epochs`` epochs, checkpointing and validating after each one.

    Uses the notebook-level ``trainDL`` and ``validDL`` loaders. After every
    epoch the whole model is pickled to Google Drive; the file name carries the
    epoch number offset by 30 (``DNNModel4_e31``, ``DNNModel4_e32``, ...).

    Parameters
    ----------
    model : nn.Module
        Network whose forward pass returns probabilities.
    epochs : int
        Number of epochs to run.
    lr : float
        Adam learning rate.
    wd : float
        Adam weight decay.
    """
    # MalwareModel.forward already ends in a sigmoid, so the outputs are
    # probabilities and BCELoss is the matching criterion. BCEWithLogitsLoss
    # would apply a second sigmoid on top of them.
    criterion = nn.BCELoss()
    optim = getOptimizer(model, lr=lr, wd=wd)
    for i in range(epochs):
        loss = trainModel(model, criterion, optim, trainDL)
        print("Training loss: ", loss)
        with open(f'/content/gdrive/My Drive/Notebooks Colab/DNNModel4_e{i + 30 + 1}.pickle', 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
            print("Model saved")
        valLoss(model, criterion, validDL)

# Training execution

In [None]:
# Batch the train/validation datasets (both shuffled) and wrap each loader so
# that its batches are moved to `device` as they are produced.
batchSize = 1024
trainDL = DeviceDataLoader(DataLoader(trainDF, batch_size=batchSize, shuffle=True), device)
validDL = DeviceDataLoader(DataLoader(testDF, batch_size=batchSize, shuffle=True), device)

In [None]:
# Train for 30 epochs with Adam (lr=1e-4, weight decay=1e-6).
# trainLoop pickles the model and reports validation loss/accuracy after every epoch.
trainLoop(model, epochs=30, lr=1e-4, wd=1e-6)

In [None]:
# Pickle the final trained model (the whole nn.Module object) to Google Drive.
finalModelPath = '/content/gdrive/My Drive/Notebooks Colab/DNNModel.pickle'
with open(finalModelPath, 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Model saved")

# Submission

In [None]:
batchSize = 1024
# The dataset class needs a target, so the (unlabelled) test set gets an all-zero placeholder.
testDF = MalwareDataset(dfTest, pd.Series(np.zeros(dfTest.shape[0], dtype=np.uint8)), embeddedColNames)
# No shuffling: predictions must stay aligned with the row order of the test set.
testDL = DeviceDataLoader(DataLoader(testDF, batch_size=batchSize), device)

# Switch to inference mode explicitly instead of relying on whichever cell ran
# last: in train mode dropout and batch-norm batch statistics distort predictions.
model.eval()
preds = []
with torch.no_grad():
    for x1, x2, _placeholder in testDL:
        prob = model(x1, x2).view(-1)
        preds.extend(prob.cpu().numpy().tolist())
yRes = np.array(preds).reshape(-1)

# Write the predicted probabilities into the sample submission file layout.
submission = pd.read_csv('/content/gdrive/My Drive/Notebooks Colab/sample_submission.csv')
submission['HasDetections'] = yRes
submission.to_csv('/content/gdrive/My Drive/Notebooks Colab/DNNsubmission.csv', index=False)
print(submission.shape)
submission.head()