In [None]:
import pandas as pd
import torch 
from torch import nn, optim
import random
from sklearn.metrics import roc_auc_score

# Load Data

In [None]:
# Draw a reproducible random subset of the training file: every line number
# placed in `skip` is ignored by read_csv. The range starts at 1, so line 0
# (the header row) is never skipped.
random.seed(5566)
num_lines = 40428968   # presumably the total line count of train.gz -- verify against the file
sample_size = 100000
skip = sorted(random.sample(range(1, num_lines), num_lines-sample_size))
df = pd.read_csv("../input/avazu-ctr-prediction/train.gz", header=0, skiprows=skip)
df = df.dropna()

# Shuffle the row positions with a fixed seed; the first `eval_ratio` fraction
# of the shuffled positions becomes the evaluation split, the rest is training.
eval_ratio = 0.1
idx = list(range(len(df)))
random.seed(1)
random.shuffle(idx)
train_idx = idx[int(len(df)*eval_ratio):]
eval_idx = idx[:int(len(df)*eval_ratio)]
# Take explicit copies: new columns are assigned onto both frames later in the
# notebook, and assigning onto an iloc selection of `df` raises pandas'
# SettingWithCopyWarning.
train_df = df.iloc[train_idx].copy()
eval_df = df.iloc[eval_idx].copy()

#test_df = pd.read_csv("../input/avazu-ctr-prediction/test.gz", header=0, skiprows=skip)
#test_df = test_df.dropna()

In [None]:
# Preview the first rows of the training split (rendered as the cell output).
train_df.head()

In [None]:
# Print one "<column>: <number of distinct values>" line per training column.
for name in train_df.columns:
    distinct_values = train_df[name].unique()
    print("%s: %s" % (name, len(distinct_values)))

# Feature Engineering

In [None]:
# Raw columns that are turned into integer-indexed model inputs below.
keep_features = [
    'hour',
    'C1',
    'banner_pos',
    'site_category',
    'app_domain',
    'app_category',
    'device_type',
    'device_conn_type',
    'C15',
    'C16',
    'C17',
    'C18',
    'C19',
    'C20',
    'C21',
]
# NOTE(review): neither of the two names below is referenced again in the
# visible part of the notebook -- confirm before removing them.
C1_map = {}
C1_values = train_df.C1.unique()
def get_maps(names, df=None):
    """Build a value -> integer index lookup table for each requested column.

    Args:
        names: iterable of column names to index.
        df: DataFrame whose columns are scanned for distinct values. When
            omitted, the module-level ``train_df`` is used, which is the
            behaviour of the original one-argument call.

    Returns:
        dict mapping each column name to a dict ``{value: index}``. Indices
        start at 0 and follow the order in which ``df[name].unique()`` yields
        the values.
    """
    if df is None:
        df = train_df
    return {
        name: {value: index for index, value in enumerate(df[name].unique())}
        for name in names
    }

# Value -> index lookup tables for every kept feature; trans_feature_by_name
# reads this module-level dict.
maps = get_maps(keep_features)

def trans_feature_by_name(x, name):
    """Convert one raw feature value into the integer fed to the model.

    For ``"hour"`` the last two characters of ``str(x)`` are parsed as an int
    (presumably the hour of day from a timestamp-like value -- confirm against
    the data). For any other feature the value is looked up in the
    module-level ``maps[name]`` table; values missing from the table all
    receive the single index ``len(maps[name])``.
    """
    if name == "hour":
        return int(str(x)[-2:])
    value_to_index = maps[name]
    # One shared fallback index, one past the largest known index.
    return value_to_index.get(x, len(value_to_index))

# Add a "<feature>_trans" column holding the integer-encoded value to both
# splits, remembering the new column names in the order they are created.
trans_features = []
for name in keep_features:
    trans_column = name + "_trans"
    for frame in (train_df, eval_df):
        frame[trans_column] = frame[name].map(lambda x: trans_feature_by_name(x, name))
    trans_features.append(trans_column)

# Model inputs (encoded columns only) and click labels for each split.
train_features_matrix, train_labels = train_df[trans_features], train_df["click"]
eval_features_matrix, eval_labels = eval_df[trans_features], eval_df["click"]

# Build Model

In [None]:
class CustomModel(nn.Module):
    """Embedding-based binary classifier over integer-encoded features.

    Every feature gets its own ``nn.Embedding``; the embeddings are
    concatenated and passed through a 125-unit hidden layer and a single
    sigmoid output.

    Args:
        embed_info: iterable of ``(feature_name, num_embeddings,
            embedding_dim)`` tuples. The embedding for a feature is registered
            as the attribute ``"layer_" + feature_name``.
    """
    def __init__(self, embed_info):
        super().__init__()
        # Materialize once: embed_info is walked twice below, which would
        # silently yield nothing on the second pass for a one-shot iterator.
        embed_info = list(embed_info)
        self.feature_names = []
        for name, n, dim in embed_info:
            # setattr on an nn.Module registers the embedding as a submodule.
            setattr(self, "layer_"+name, nn.Embedding(n, dim))
            self.feature_names.append(name)
        self.linear1 = nn.Linear(sum(dim for _, _, dim in embed_info), 125)
        # Without a non-linearity, linear1 followed by linear2 is equivalent
        # to a single affine map and the hidden layer adds no capacity.
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(125, 1)
        self.sigmoid = nn.Sigmoid()
    def forward(self, features):
        """Return a 1-D tensor of click probabilities.

        Args:
            features: mapping from feature name to a sequence (or tensor) of
                integer indices; it must contain every name in
                ``self.feature_names``.

        Returns:
            Flattened tensor of sigmoid outputs.
        """
        embeds = []
        for name in self.feature_names:
            layer = getattr(self, "layer_" + name)
            # Build the index tensor on the embedding's own device so the
            # model keeps working after being moved with .to(device).
            indices = torch.as_tensor(features[name], dtype=torch.long,
                                      device=layer.weight.device)
            embeds.append(layer(indices))
        embeds_cat = torch.cat(embeds, dim=-1)
        hidden = self.activation(self.linear1(embeds_cat))
        probs = self.sigmoid(self.linear2(hidden))
        return torch.flatten(probs)

# Train Model

In [None]:
BS=256
lr=1e-3
epochs=5
# One (encoded column, embedding rows, embedding dim) entry per feature.
# The row count must be larger than the largest index produced for that
# column; trans_feature_by_name assigns unseen values the index
# len(maps[name]), so each table needs at least len(maps[name]) + 1 rows.
embed_info = [('hour_trans', 24, 6), ('C1_trans', 8, 3), ('banner_pos_trans', 8, 3), ('site_category_trans', 20, 6),
              ('app_domain_trans', 124, 30), ('app_category_trans', 25, 7),
              ('device_type_trans', 6, 3), ('device_conn_type_trans', 5, 3), ('C15_trans', 9, 4),
              ('C16_trans', 10, 4), ('C17_trans', 409, 50), ('C18_trans', 5, 3), ('C19_trans', 67, 15),
              ('C20_trans', 158, 35), ('C21_trans', 61, 20)]
model = CustomModel(embed_info)
loss_fn = nn.BCELoss()
opt = optim.Adam(model.parameters(), lr=lr)
best = 0
n_train = len(train_features_matrix)
for e in range(1, epochs+1):
    print("epoch " + str(e) + ": ")
    model.train()
    # Step through the rows in chunks of BS; the final chunk may be shorter,
    # so no training row is skipped.
    for start in range(0, n_train, BS):
        batch_features = train_features_matrix.iloc[start:start+BS]
        target = train_labels.iloc[start:start+BS]
        # Clear gradients left over from the previous step; otherwise
        # backward() keeps adding to them across batches and epochs.
        opt.zero_grad()
        output = model(batch_features.to_dict('list'))
        loss = loss_fn(output, torch.tensor(target.values, dtype=torch.float32))
        loss.backward()
        opt.step()
    model.eval()
    # Evaluation needs no gradients; skipping the autograd graph saves memory
    # on the full evaluation set.
    with torch.no_grad():
        probs = model(eval_features_matrix.to_dict('list'))
    score = roc_auc_score(eval_labels, probs.detach().cpu().numpy())
    score = round(score, 4)
    if score > best:
        print("find better score: %s, old score: %s" % (score, best))
        best = score
    else:
        print("current score: %s, best score: %s" % (score, best))