In [1]:
import ast
import gc
import time

import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #suppressing GPU warnings

import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
# Plot styling for the learning curves.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed the
# old names, so use whichever spelling this installation actually ships.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc('axes', labelweight='bold', labelsize='medium',
       titleweight='bold', titlesize=12, titlepad=10)

# Seed passed to train_test_split so the train/validation split is repeatable.
SEED = 2311

In [2]:
# Locations of the three CSV files, all in the same public GitHub repository.
# train_url uses the 'blob/...?raw=true' form while the other two use raw.githubusercontent.com.
train_url = 'https://github.com/sidt-ai/AV_hackathons/blob/main/AmExpert_2021/train.csv?raw=true'
test_url = 'https://raw.githubusercontent.com/sidt-ai/AV_hackathons/main/AmExpert_2021/test.csv'
submission_url = 'https://raw.githubusercontent.com/sidt-ai/AV_hackathons/main/AmExpert_2021/sample_submission.csv'

In [3]:
# Download the three CSVs in one pass; order matches the unpacking targets.
train, test, submission = (
    pd.read_csv(csv_url)
    for csv_url in (train_url, test_url, submission_url)
)

In [4]:
# Turn the holdings columns from their string form into real Python objects
# (later cells call .explode() and len() on them, so they are expected to be lists).
# ast.literal_eval only accepts Python literals, so unlike eval() nothing in the
# downloaded CSV can be executed as code.
train['Product_Holding_B1'] = train['Product_Holding_B1'].apply(ast.literal_eval)
train['Product_Holding_B2'] = train['Product_Holding_B2'].apply(ast.literal_eval)
test['Product_Holding_B1'] = test['Product_Holding_B1'].apply(ast.literal_eval)

In [5]:
def expand_list_cols(df, col):
    """Expand a list-valued product column into one count column per product code.

    Adds the columns ``P00`` and ``P1`` .. ``P21`` to ``df``; each is prefixed with
    ``t_`` when ``col`` is ``'Product_Holding_B2'`` (the target column). Each new column
    holds how many times that product code appears in the row's list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is modified in place.
    col : str
        Name of the column whose cells are lists of product codes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added.
    """
    prefix = 't_' if col == 'Product_Holding_B2' else ''  # 't_' marks target columns
    product_codes = ['P00'] + ['P' + str(i) for i in range(1, 22)]

    exploded = df[col].explode()
    # crosstab only yields rows/columns for values it actually saw (an empty list explodes
    # to NaN and is dropped), so reindex to every row of df and every known product code,
    # filling the gaps with 0 instead of leaving NaN or raising KeyError.
    counts = pd.crosstab(exploded.index, exploded).reindex(
        index=df.index, columns=product_codes, fill_value=0
    )

    for code in product_codes:
        df[prefix + code] = counts[code]

    return df

# Train carries both the current holdings (B1) and the target holdings (B2);
# test only has B1.
for _list_col in ('Product_Holding_B1', 'Product_Holding_B2'):
    train = expand_list_cols(train, _list_col)
test = expand_list_cols(test, 'Product_Holding_B1')

In [6]:
# Age -> AgeRange: bucket ages into fixed intervals, applied identically to both frames.
bins = pd.IntervalIndex.from_tuples([(0, 23), (23, 32), (32,43), (43, 52), (52, 60), (60,100)])
for _frame in (train, test):
    _frame['AgeRange'] = pd.cut(_frame['Age'], bins, ordered=True)

# Vintage -> VintageRange: same idea with its own interval edges.
bins = pd.IntervalIndex.from_tuples([(0, 9), (9, 14), (14,20), (20, 25), (25, 40), (40,100)])
for _frame in (train, test):
    _frame['VintageRange'] = pd.cut(_frame['Vintage'], bins, ordered=True)

# Number of current holdings: length of each row's B1 list.
for _frame in (train, test):
    _frame['Num_Current_Holdings'] = _frame['Product_Holding_B1'].apply(len)

In [7]:
# Run a garbage-collection pass; as the last expression, its return value is shown as the cell output.
gc.collect()

In [8]:
# One-hot encode the binned / categorical columns of the training frame.
# The dtype is pinned to uint8 (what pandas < 2.0 produced by default): pandas >= 2.0 emits
# bool dummies, and a feature frame mixing bool and integer columns turns into an object
# array that can fail to convert to a tensor when passed to Keras.
enc_train = pd.get_dummies(
    train,
    columns=['AgeRange', 'VintageRange', 'Customer_Category'],
    dtype=np.uint8,
)

# Gender and City_Category are encoded separately with drop_first=True (one level dropped
# each) and appended; the original string columns stay in the frame at this point.
enc_train = pd.concat(
    [enc_train,
     pd.get_dummies(train.Gender, prefix='Gender', drop_first=True, dtype=np.uint8),
     pd.get_dummies(train.City_Category, prefix='City_Category', drop_first=True, dtype=np.uint8)
    ],
    axis=1
)

In [9]:
# One-hot encode the test frame the same way as the training frame.
# The dtype is pinned to uint8 (what pandas < 2.0 produced by default): pandas >= 2.0 emits
# bool dummies, and a feature frame mixing bool and integer columns turns into an object
# array that can fail to convert to a tensor when passed to Keras.
enc_test = pd.get_dummies(
    test,
    columns=['AgeRange', 'VintageRange', 'Customer_Category'],
    dtype=np.uint8,
)

# Gender and City_Category are encoded separately with drop_first=True (one level dropped
# each) and appended; the original string columns stay in the frame.
enc_test = pd.concat(
    [enc_test,
     pd.get_dummies(test.Gender, prefix='Gender', drop_first=True, dtype=np.uint8),
     pd.get_dummies(test.City_Category, prefix='City_Category', drop_first=True, dtype=np.uint8)
    ],
    axis=1
)

In [10]:
# Preview the first rows of the encoded training frame.
enc_train.head()

In [11]:
# Remove the identifier and the raw columns that have already been encoded or expanded,
# leaving only model inputs and the t_* target columns.
enc_train = enc_train.drop(
    columns=[
        'Customer_ID', 'Gender', 'Age', 'Vintage',
        'City_Category', 'Product_Holding_B1', 'Product_Holding_B2',
    ]
)

enc_train.shape

In [12]:
# Display the column labels of the encoded training frame.
enc_train.columns

In [13]:
# Columns prefixed with 't_' are the targets; every other column is a model input.
targets = [name for name in enc_train.columns if name.startswith('t_')]

features = [name for name in enc_train.columns if name not in targets]

In [14]:
# Show both column lists, separated by a blank line.
print(f'Targets: {targets}\n', f'Features: {features}', sep='\n')

In [17]:
# Hold out 20% of the rows for validation; SEED fixes which rows those are.
xtrain, xval, ytrain, yval = train_test_split(
    enc_train[features],
    enc_train[targets],
    test_size=0.2,
    random_state=SEED,
)

xtrain.shape, xval.shape

In [35]:
# Network dimensions come from the split data so the model stays in sync with the
# feature / target column lists instead of relying on a hard-coded output width.
input_shape = xtrain.shape[1]
n_outputs = ytrain.shape[1]

# Training hyper-parameters.
PATIENCE = 10        # epochs without improvement before early stopping triggers
MIN_DELTA = 0.0002   # smallest change in the monitored value that counts as improvement
BATCH_SIZE = 64
EPOCHS = 100         # upper bound; early stopping may end training sooner

# Fully connected network with batch normalisation before every dense layer.
# The output layer has one sigmoid unit per target column, trained with binary cross-entropy.
model = keras.Sequential([
    layers.BatchNormalization(input_shape=[input_shape]),
    layers.Dense(units=64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(units=64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(units=32, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(units=n_outputs, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy']
)

# 'val_loss' is EarlyStopping's default monitor; spelled out here so the stopping
# criterion is visible. The best weights seen are restored when training stops.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    min_delta=MIN_DELTA,
    restore_best_weights=True,
)

In [36]:
# Train against the held-out validation split; early stopping decides when to halt.
# verbose=2 logs one line per epoch.
history = model.fit(
    xtrain,
    ytrain,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(xval, yval),
    callbacks=[early_stopping],
    verbose=2,
)

In [37]:
# Learning curves: loss on the left, binary accuracy on the right.
# Each line is labelled and a legend is drawn so the training and validation curves
# can be told apart; the x axis is the epoch index of the history frame.
history_df = pd.DataFrame(history.history)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

for _ax, _metric, _title in ((ax1, 'loss', 'Loss'), (ax2, 'binary_accuracy', 'Accuracy')):
    _ax.plot(history_df[_metric], label='train')
    _ax.plot(history_df['val_' + _metric], label='validation')
    _ax.set_title(_title)
    _ax.set_xlabel('Epoch')
    _ax.legend()
plt.show()

In [38]:
# Score the test set using exactly the training feature columns, in training order.
# reindex (rather than plain column selection) keeps this working when a one-hot level seen
# in train never occurs in test: the missing indicator column is created as all zeros instead
# of raising KeyError. When every column is present the result equals enc_test[features].
predictions = model.predict(enc_test.reindex(columns=features, fill_value=0))

In [39]:
# Spot check: output indices of the three highest scores for test row 53, highest first.
(np.argsort(predictions[53])[-3:])[::-1]

In [40]:
# Current holdings (Product_Holding_B1) of the same test row, row 53.
enc_test.iloc[53]['Product_Holding_B1']

In [41]:
# Build one recommendation list per test row: the three highest-scoring products,
# best first, minus anything the customer already holds.
result = []
for row_scores, priors in zip(predictions, enc_test['Product_Holding_B1']):
    top3 = (np.argsort(row_scores)[-3:])[::-1]
    # Output index 0 corresponds to 'P00'; index k (k >= 1) corresponds to 'P<k>'.
    candidates = ['P00' if x == 0 else ('P' + str(x)) for x in top3]
    # Filter into a new list. Calling remove() on the list being iterated skips the element
    # that follows each removal, which let already-held products slip through.
    result.append([product for product in candidates if product not in priors])

In [42]:
# Store the per-row recommendation lists in the submission frame (one list per row).
submission['Product_Holding_B2'] = result

In [43]:
# Write the submission to the working directory, without the index column.
submission.to_csv('sub6_nn_v4.csv', index=False)

In [44]:
# Shell command (IPython "!" syntax): show the first lines of the file just written.
!head sub6_nn_v4.csv