In [1]:
from models import gru_att_age_v2 as age_model,  gru_att_gender_v2 as gender_model
import torch
from tqdm import tqdm
import time
from datetime import timedelta
import random
import numpy as np
%load_ext autoreload
%autoreload 2

In [2]:
def load_dataset(path, pad_size):
    """Read a whitespace-separated sequence file into user ids and fixed-width rows.

    Every non-blank line is split on whitespace.  The first token supplies the
    user id (the integer in front of its first ``_``); each remaining token is
    an ``_``-joined list of integers and becomes one row of a 2-D array.  The
    rows are right-padded with zeros, or truncated, to exactly ``pad_size``
    columns.

    Args:
        path: Path of the text file to read.
        pad_size: Number of columns every feature row ends up with.

    Returns:
        ``(uids, datas)``: ``uids`` is a 1-D integer ndarray with one entry per
        parsed line, ``datas`` is a list with one 6-tuple per line holding the
        first six feature rows.

    Raises:
        IndexError: If a line carries fewer than six feature tokens.
    """
    user_ids = []
    samples = []
    with open(path, 'r') as handle:
        for raw_line in tqdm(handle):
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines carry no user; skip them.
                continue
            head, *feature_tokens = stripped.split()
            user_ids.append(int(head.split('_')[0]))
            matrix = np.array(
                [[int(value) for value in token.split('_')] for token in feature_tokens],
                dtype=int,
            )
            n_cols = matrix.shape[1]
            if n_cols < pad_size:
                # Too short: append zero columns on the right.
                matrix = np.pad(matrix, ((0, 0), (0, pad_size - n_cols)), mode='constant')
            else:
                # Long enough: keep only the leading pad_size columns.
                matrix = matrix[:, :pad_size]
            samples.append(tuple(matrix[row] for row in range(6)))
    return np.array(user_ids, dtype=int), samples

In [4]:
class DatasetIterater(object):
    """Re-usable mini-batch iterator that turns dataset samples into LongTensors.

    ``dataset`` is a sliceable sequence of samples; each sample is indexable and
    positions 1..5 hold equal-length integer sequences (ad_id, product_id,
    product_category, advertiser_id, industry).  Position 0 is not used.

    ``config`` must expose ``batch_size``, ``device`` and ``pad_size``.

    Iterating yields ``ceil(len(dataset) / batch_size)`` batches in dataset
    order.  After the last batch the cursor rewinds, so the same object can be
    iterated again.
    """

    def __init__(self, dataset, config):
        self.dataset = dataset
        self.batch_size = config.batch_size
        if self.batch_size <= 0:
            # A non-positive batch size would make __next__ loop forever and
            # __len__ divide by zero, so fail early with a clear message.
            raise ValueError(
                "config.batch_size must be a positive integer, got %r" % (self.batch_size,)
            )
        self.index = 0
        self.dataset_len = len(dataset)
        self.device = config.device
        self.pad_size = config.pad_size
        self.config = config

    def _to_tensor(self, batch_data):
        """Stack one batch into five LongTensors located on ``self.device``.

        Returns:
            ``(ad_id, product_id, product_category, advertiser_id, industry)``,
            each with one row per sample in ``batch_data``.
        """

        def feature(position):
            # Build one contiguous int64 array first: handing torch a Python
            # list of ndarrays converts element by element and is much slower.
            stacked = np.asarray([sample[position] for sample in batch_data], dtype=np.int64)
            return torch.from_numpy(stacked).to(self.device)

        # Embedding features.  Position 0 (creative_id) is intentionally not converted.
        ad_id = feature(1)
        product_id = feature(2)
        product_category = feature(3)
        advertiser_id = feature(4)
        industry = feature(5)

        return ad_id, product_id, product_category, advertiser_id, industry

    def __next__(self):
        if self.index < self.dataset_len:
            batch_data = self.dataset[self.index: self.index + self.batch_size]
            self.index += self.batch_size
            return self._to_tensor(batch_data)
        # Exhausted: rewind so the iterator can be consumed again.  No shuffling
        # happens here, so batches always follow dataset order.
        self.index = 0
        raise StopIteration

    def __iter__(self):
        return self

    def __len__(self):
        """Number of batches per pass, counting a final partial batch."""
        return (len(self.dataset) + self.batch_size - 1) // self.batch_size

In [5]:
# Build the age classifier from its own Config and restore the trained weights.
config = age_model.Config()
model = age_model.Model(config)
# map_location='cpu' deserialises the checkpoint onto the CPU regardless of the
# device it was saved from, so loading also works on a machine without that GPU.
# The freshly built model's parameters are still on the CPU at this point.
model.load_state_dict(torch.load(config.save_path, map_location='cpu'))
# Last expression of the cell: moves the model to the configured device and
# displays its layer summary.
model.to(config.device)

100%|██████████| 579845/579845 [00:01<00:00, 323684.89it/s]
100%|██████████| 11106/11106 [00:00<00:00, 311287.13it/s]
100%|██████████| 24364/24364 [00:00<00:00, 286545.95it/s]


Model(
  (ad_id_embedding): Embedding(579847, 256, padding_idx=0)
  (product_id_embedding): Embedding(11108, 256, padding_idx=0)
  (product_category_embedding): Embedding(20, 8, padding_idx=0)
  (advertiser_id_embedding): Embedding(24366, 256, padding_idx=0)
  (industry_embedding): Embedding(302, 32, padding_idx=0)
  (gru): GRU(808, 128, batch_first=True, bidirectional=True)
  (fc): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.01, inplace)
    (3): Linear(in_features=128, out_features=10, bias=True)
  )
)

In [6]:
def predict(model, test_iter):
    """Run ``model`` over every batch of ``test_iter`` and return 1-based class labels.

    Args:
        model: Callable module; ``model(batch)`` must return a 2-D score tensor
            of shape (batch, n_classes).  It is switched to eval mode first.
        test_iter: Iterable of batches that are passed to ``model`` unchanged.

    Returns:
        1-D integer ndarray with one label per sample, in iteration order.
        Each label is the arg-max class index plus one.  An empty iterable
        gives an empty integer array.
    """
    model.eval()
    batch_labels = []
    with torch.no_grad():
        for x in tqdm(test_iter):
            out = model(x)
            batch_labels.append(out.argmax(dim=1).cpu().numpy())
    if not batch_labels:
        return np.array([], dtype=int)
    # Concatenate once at the end: growing the result with np.append inside the
    # loop re-copies everything collected so far on every batch.
    return np.concatenate(batch_labels) + 1

In [7]:
# Parse the test file once; the age pass and the gender pass both reuse `dataset`.
# The rows are padded/truncated to the pad_size of the age model's config.
uids, dataset = load_dataset('./datas/test.csv', config.pad_size)

999999it [01:50, 9048.54it/s] 


In [8]:
# Batch the parsed test set with the age model's config and predict an age label per user.
# `model` and `config` must still be the age model here; a later cell rebinds both names.
test_iter = DatasetIterater(dataset, config)
age_predict = predict(model, test_iter)

100%|██████████| 7813/7813 [01:56<00:00, 67.33it/s]


In [9]:
# Preview the first ten predicted age labels.
age_predict[:10]

array([ 3,  7,  3,  3,  4,  4, 10,  3,  2, 10])

In [8]:
# Same preview as the cell before it, but its execution counter (8) is lower
# than that cell's (9), so the saved output comes from a different run.
# NOTE(review): the saved outputs of the repeated preview cells differ, so the
# predictions were not identical across runs — confirm which run the submission uses.
age_predict[:10]

array([ 3,  7,  4,  3,  4,  3, 10,  3,  2, 10])

In [42]:
# Third copy of the age preview; its execution counter (42) is out of sequence
# with the surrounding cells, so the saved output belongs to another session state.
# NOTE(review): leftover scratch cell — its output should not be trusted on a fresh run.
age_predict[:10]

array([ 3,  7,  2,  3,  4,  4,  9,  3,  2, 10])

In [10]:
# Gender pass.  NOTE: this rebinds `config`, `model` and `test_iter`, so the age
# model is no longer reachable afterwards; re-run the age model cell before
# re-running the age inference cell.
config = gender_model.Config()
model = gender_model.Model(config)
# map_location='cpu' deserialises the checkpoint onto the CPU regardless of the
# device it was saved from, so loading also works on a machine without that GPU.
model.load_state_dict(torch.load(config.save_path, map_location='cpu'))
model.to(config.device)
# The parsed test set is reused; only the batching config and the model change.
test_iter = DatasetIterater(dataset, config)
gender_predict = predict(model, test_iter)

100%|██████████| 579845/579845 [00:01<00:00, 331424.44it/s]
100%|██████████| 11106/11106 [00:00<00:00, 321858.52it/s]
100%|██████████| 24364/24364 [00:00<00:00, 317532.89it/s]
100%|██████████| 7813/7813 [01:52<00:00, 69.27it/s]


In [11]:
# The three arrays are joined column-wise afterwards, so they must line up
# one-to-one.  Fail here with a readable message instead of relying on a visual check.
assert len(uids) == len(age_predict) == len(gender_predict), (
    "uids, age_predict and gender_predict must have the same length"
)
len(uids), len(age_predict), len(gender_predict)

(999999, 999999, 999999)

In [12]:
# Preview the first ten age labels next to the first ten gender labels.
age_predict[:10], gender_predict[:10]

(array([ 3,  7,  3,  3,  4,  4, 10,  3,  2, 10]),
 array([1, 2, 2, 1, 1, 1, 1, 1, 1, 2]))

In [13]:
# Assemble an (n_users, 3) table: one column each for user id, age label and gender label.
result = np.column_stack((uids, age_predict, gender_predict))

In [14]:
import pandas as pd

In [15]:
# Column names of the submission table; the same list is reused for the rule-based rows.
columns = ['user_id', 'predicted_age', 'predicted_gender']
# Wrap the (n_users, 3) prediction array in a DataFrame with those column names.
result_df = pd.DataFrame(result, columns=columns)
result_df.head()

Unnamed: 0,user_id,predicted_age,predicted_gender
0,3000001,3,1
1,3000002,7,2
2,3000003,3,2
3,3000004,3,1
4,3000005,4,1


In [16]:
# Users that receive a fixed, hand-written prediction instead of a model output.
# NOTE(review): presumably this id has no row in test.csv (999,999 users were
# loaded, 1,000,000 rows are submitted) — confirm.
rule_uids = {3086425}
# Every rule user is assigned age label 3 and gender label 1.
tmp = [[uid, 3, 1] for uid in rule_uids]
rule_df = pd.DataFrame(tmp, columns=columns)
rule_df.head()

Unnamed: 0,user_id,predicted_age,predicted_gender
0,3086425,3,1


In [17]:
# Rebuild the model-prediction frame from `result` instead of extending
# `result_df` in place: re-running this cell then yields the same table rather
# than appending the rule rows a second time.  ignore_index=True gives the
# combined frame a clean 0..n-1 index instead of repeating label 0.
result_df = pd.concat(
    [pd.DataFrame(result, columns=columns), rule_df],
    axis=0,
    ignore_index=True,
)
result_df.shape

(1000000, 3)

In [20]:
# Preview of the combined table.  Its execution counter (20) is higher than the
# astype cell below it (19), i.e. it was last run after that cell.
result_df.head()

Unnamed: 0,user_id,predicted_age,predicted_gender
0,3000001,3,1
1,3000002,7,2
2,3000003,3,2
3,3000004,3,1
4,3000005,4,1


In [19]:
# Force every column to an integer dtype before the table is written out.
result_df = result_df.astype(int)

In [21]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV so only the three named columns are emitted.
# NOTE(review): assumes the ./submission/ directory already exists — confirm.
result_df.to_csv('./submission/submission_gru_0525.csv', index=False)