The MIT License (MIT)

Copyright (c) 2021 NVIDIA CORPORATION

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Ensemble MLP-SMF, GRU-MS-SMF and XLNet-SMF

In [None]:
import os
import sys

# Restrict CUDA to device 0 for anything that initialises the GPU later on.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# ENSEMBLE VERSION NUMBER
# Used as the suffix of the submission file name written at the end.
VER_E = 22

# ENSEMBLE
# Directories holding each model's prediction files.  The loaders build file
# names by plain string concatenation (e.g. f'{MLP_PATH}cpmp_train_oof.csv'),
# so every path must end with a path separator.
TRANSFORMER_PATH = '</path/to/xlnet-smf-output>'
MLP_PATH = '../02_Models/MLP_SMF/'
GRU_PATH = '</path/to/gru-sm-smf-output>'

In [None]:
# Third-party libraries used by the notebook.
# NOTE(review): `plt` and `GroupKFold` are not referenced in the visible cells.
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Show the installed pandas version as the cell output.
pd.__version__

# Metric Calculation

In [None]:
def top4_metric(val, istest=0, pos=0, target='city_id'):
    """Return the top-4 hit rate of the recommendations stored in `val`.

    A row counts as a hit when its `target` value equals any of the four
    recommendation columns `rec0`, `rec1`, `rec2` or `rec3`.  Only rows with
    `submission == 0` are ever scored.

    Parameters
    ----------
    val : DataFrame
        Must contain `submission`, `istest`, `icount`, the `target` column
        and the four recommendation columns `rec0` .. `rec3`.
    istest : int
        Which split to score: 0 = train, 1 = test, negative = both.
    pos : int
        Which city of the trip to score, matched against `icount`:
        0 = last, 1 = last-1, 2 = last-2, negative = all rows.
    target : str
        Name of the ground-truth column.

    Returns
    -------
    float
        Fraction of the selected rows that are hits (NaN if none selected).
    """
    # Build the row filter once instead of re-evaluating it per column.
    keep = val.submission == 0
    if istest >= 0:
        keep = keep & (val.istest == istest)
    rows = val.loc[keep]
    if pos >= 0:
        rows = rows.loc[rows.icount == pos]

    truth = rows[target]
    hit = truth == rows['rec0']
    for rec_col in ('rec1', 'rec2', 'rec3'):
        hit = hit | (truth == rows[rec_col])
    return hit.mean()

# Load Data

In [None]:
%%time
# Single table holding both train and test rows (told apart by `istest`
# in the cells below); print its (rows, columns) shape after loading.
DATA_CSV = '../00_Data/train_and_test.csv'
raw = pd.read_csv(DATA_CSV)
print(raw.shape)

In [None]:
# Width of the dense per-city score matrices built later: city ids are used
# directly as column indices, so one column more than the largest id is needed.
t_ct = raw['city_id'].max() + 1
t_ct

# Predict Full OOF

In [None]:
# XLNet-SMF MODEL SOFTMAX OOF
# Number of (city id, score) candidates kept per trip in every prediction file.
TOP_K = 50

# Ids first, scores second: the ensemble code splits each row at TOP_K.
xlnet_cols = [f'city_id_{rank}' for rank in range(1, TOP_K + 1)]
xlnet_cols += [f'city_score_{rank}' for rank in range(1, TOP_K + 1)]

# One out-of-fold prediction frame per bag, indexed by trip id.
dfs = [
    pd.read_csv(f'{TRANSFORMER_PATH}bag_{bag}/valid_preds_oof.csv')
    .set_index('utrip_id')[xlnet_cols]
    for bag in range(5)
]
dfs[0].head()

In [None]:
# MLP-SMF MODEL SOFTMAX OOF
# This file names its candidates rec0..rec{TOP_K-1} / prob0..prob{TOP_K-1}
# (0-based).  Ids first, scores second, so rows can be split at TOP_K.
mlp_cols = [f'rec{rank}' for rank in range(TOP_K)]
mlp_cols += [f'prob{rank}' for rank in range(TOP_K)]

df2 = pd.read_csv(f'{MLP_PATH}cpmp_train_oof.csv').set_index('utrip_id')[mlp_cols]
df2.head()

In [None]:
# GRU-MS-SMF MODEL SOFTMAX OOF
# Ids first, scores second, so rows can be split at TOP_K.
gru_cols = [f'city_id_{rank}' for rank in range(1, TOP_K + 1)]
gru_cols += [f'city_score_{rank}' for rank in range(1, TOP_K + 1)]

df3 = pd.read_csv(f'{GRU_PATH}MLP-RNN_train_and_test_7runs_v0.csv').set_index('utrip_id')[gru_cols]
df3.head()

In [None]:
# PREDICT IN CHUNKS
# Blend the three models' top-K softmax scores on the out-of-fold rows and
# keep the four best cities per trip as rec0..rec3.
CHUNK = 1024*4

# Blend weights of the three models (they sum to 1.0).
W_GRU, W_XLNET, W_MLP = 0.32, 0.34, 0.34

valid = []

# 5 FOLD VALIDATION
for fold in range(5):
    print('#'*25)
    print('### FOLD %i'%(fold+1))

    # ONLY VALIDATE ON LAST CITY TRAIN
    test = raw.loc[ (raw.fold==fold)&(raw.istest==0)&(raw.icount==0)&(raw.N>=4) ].copy()

    print('### valid shape', test.shape )
    print('#'*25)

    test.reset_index(drop=True,inplace=True)
    n_rows = test.shape[0]
    # Top-4 recommended city ids per validation row, filled chunk by chunk.
    TOP4 = np.zeros((n_rows,4))

    print( test.shape )
    # Walk the rows CHUNK at a time so the dense (rows, t_ct) score matrices
    # stay small.  range() with a step never yields an empty trailing chunk.
    for a in range(0, n_rows, CHUNK):
        b = min(n_rows, a+CHUNK)
        print('Fold %i Chunk %i to %i'%(fold+1,a,b))

        # Trip ids of this chunk, shared by the three model lookups.
        idx = test['utrip_id'].iloc[a:b].values
        row_ids = np.arange(b-a)
        # Column vector of row positions; broadcasts against (rows, TOP_K) ids.
        rows = row_ids[:,None]

        # Each prediction row is [TOP_K city ids | TOP_K scores]; scatter the
        # scores into a dense matrix whose column index is the city id.
        # NOTE(review): assumes the TOP_K ids within a row are distinct.

        # ENSEMBLE WITH GRU-MS-SMF
        gru = df3.loc[idx].values
        preds_gru = np.zeros((b-a,t_ct))
        preds_gru[rows, gru[:,:TOP_K].astype('int')] = gru[:,TOP_K:]

        # ENSEMBLE WITH XLNet-SMF (mean over every loaded OOF frame)
        preds_xlnet = np.zeros((b-a,t_ct))
        for bag_df in dfs:
            xlnet = bag_df.loc[idx].values
            preds_xlnet[rows, xlnet[:,:TOP_K].astype('int')] += xlnet[:,TOP_K:]
        preds_xlnet /= len(dfs)

        # ENSEMBLE WITH MLP-SMF
        mlp = df2.loc[idx].values
        preds_mlp = np.zeros((b-a,t_ct))
        preds_mlp[rows, mlp[:,:TOP_K].astype('int')] = mlp[:,TOP_K:]

        preds = W_GRU*preds_gru + W_XLNET*preds_xlnet + W_MLP*preds_mlp

        # Greedy top-4: take the best city, mask it with -1, repeat.
        for i in range(4):
            x = np.argmax(preds,axis=1)
            TOP4[a:b,i] = x
            preds[row_ids,x] = -1

    for k in range(4):
        test['rec%i'%k] = TOP4[:,k].astype('int')
    valid.append(test)

In [None]:
# Stack the per-fold validation frames row-wise into one DataFrame with a
# fresh 0..n-1 index.
valid = pd.concat(valid, ignore_index=True)

In [None]:
# VALIDATION LAST CITY - FULL OOF
# Top-4 hit rate on train rows (istest=0) at the last city of each trip (pos=0).
top4_metric(valid, istest=0, pos=0, target='city_id')

# Predict Test Last City Data - 5 Folds - Submission.csv

In [None]:
# Rows to predict: the last city (icount == 0) of every test trip.
is_test_last_city = (raw['istest'] == 1) & (raw['icount'] == 0)
test = raw.loc[is_test_last_city].copy()
print(test.shape)
test.head()

In [None]:
# XLNet-SMF MODEL SOFTMAX PREDS
# Ids first, scores second: the ensemble code splits each row at TOP_K.
xlnet_cols = [f'city_id_{rank}' for rank in range(1, TOP_K + 1)]
xlnet_cols += [f'city_score_{rank}' for rank in range(1, TOP_K + 1)]

# 5 bags x 5 folds = 25 test prediction frames, ordered bag-major and indexed
# by trip id.
dfs = [
    pd.read_csv(f'{TRANSFORMER_PATH}bag_{bag}/test_preds_fold_{fold_no}.csv')
    .set_index('utrip_id')[xlnet_cols]
    for bag in range(5)
    for fold_no in range(5)
]
dfs[0].head()

In [None]:
# MLP-SMF MODEL SOFTMAX PREDS
# Candidates are named rec0..rec{TOP_K-1} / prob0..prob{TOP_K-1} (0-based).
# Ids first, scores second, so rows can be split at TOP_K.
mlp_cols = [f'rec{rank}' for rank in range(TOP_K)]
mlp_cols += [f'prob{rank}' for rank in range(TOP_K)]

df2 = pd.read_csv(f'{MLP_PATH}cpmp_test_final_fixed.csv').set_index('utrip_id')[mlp_cols]
df2.head()

In [None]:
# GRU-MS-SMF MODEL SOFTMAX PREDS
# Ids first, scores second, so rows can be split at TOP_K.
gru_cols = [f'city_id_{rank}' for rank in range(1, TOP_K + 1)]
gru_cols += [f'city_score_{rank}' for rank in range(1, TOP_K + 1)]

df3 = pd.read_csv(f'{GRU_PATH}MLP-RNN_train_and_test_7runs_v0.csv').set_index('utrip_id')[gru_cols]
df3.head()

In [None]:
# PREDICT IN CHUNKS
# Blend the three models' top-K softmax scores on the test rows and keep the
# four best cities per trip in TOP4.
CHUNK = 1024*8

# Blend weights of the three models (they sum to 1.0).
W_GRU, W_XLNET, W_MLP = 0.32, 0.34, 0.34

test.reset_index(drop=True,inplace=True)
n_rows = test.shape[0]

# Top-4 recommended city ids per test row, filled chunk by chunk.
TOP4 = np.zeros((n_rows,4))

print( test.shape )
# Walk the rows CHUNK at a time so the dense (rows, t_ct) score matrices stay
# small.  range() with a step never yields an empty trailing chunk.
for a in range(0, n_rows, CHUNK):
    b = min(n_rows, a+CHUNK)

    # Trip ids of this chunk, shared by the three model lookups.
    idx = test['utrip_id'].iloc[a:b].values
    row_ids = np.arange(b-a)
    # Column vector of row positions; broadcasts against (rows, TOP_K) ids.
    rows = row_ids[:,None]

    # Each prediction row is [TOP_K city ids | TOP_K scores]; scatter the
    # scores into a dense matrix whose column index is the city id.
    # NOTE(review): assumes the TOP_K ids within a row are distinct.

    # ENSEMBLE WITH GRU-MS-SMF
    print('GRU Chunk %i to %i'%(a,b))
    gru = df3.loc[idx].values
    preds_gru = np.zeros((b-a,t_ct))
    preds_gru[rows, gru[:,:TOP_K].astype('int')] = gru[:,TOP_K:]

    # ENSEMBLE WITH XLNet-SMF (mean over every loaded bag/fold frame)
    print('XLNet Chunk %i to %i'%(a,b))
    preds_xlnet = np.zeros((b-a,t_ct))
    for model_df in dfs:
        xlnet = model_df.loc[idx].values
        preds_xlnet[rows, xlnet[:,:TOP_K].astype('int')] += xlnet[:,TOP_K:]
    preds_xlnet /= len(dfs)

    # ENSEMBLE WITH MLP-SMF
    print('MLP Chunk %i to %i'%(a,b))
    mlp = df2.loc[idx].values
    preds_mlp = np.zeros((b-a,t_ct))
    preds_mlp[rows, mlp[:,:TOP_K].astype('int')] = mlp[:,TOP_K:]

    preds = W_GRU*preds_gru + W_XLNET*preds_xlnet + W_MLP*preds_mlp

    # Greedy top-4: take the best city, mask it with -1, repeat.
    for i in range(4):
        x = np.argmax(preds,axis=1)
        TOP4[a:b,i] = x
        preds[row_ids,x] = -1

In [None]:
# Attach the four recommendations to `test` as city_id_1 .. city_id_4 and
# collect the columns that make up the submission file.
rec_names = ['city_id_%i'%(rank+1) for rank in range(4)]
for rank, rec_name in enumerate(rec_names):
    test[rec_name] = TOP4[:,rank].astype('int')
COLS = ['utrip_id'] + rec_names

In [None]:
# Preview the first rows of the submission columns.
test.loc[:, COLS].head()

In [None]:
# Write the submission columns to a CSV tagged with the ensemble version,
# without the DataFrame index.
submission = test[COLS]
submission.to_csv('final-sub-ensemble-v%i.csv' % VER_E, index=False)