In [1]:
import ast
import gc
import time

import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #suppressing GPU warnings

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
# Dark-grid seaborn look for the learning-curve plots.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were removed afterwards, so use whichever name this install offers
# (and keep the default style if neither is available).
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc('axes', labelweight='bold', labelsize='medium',
       titleweight='bold', titlesize=12, titlepad=10)

# Random seed shared by the stochastic steps that accept one (e.g. the train/validation split).
SEED = 2311

In [2]:
# Remote locations of the three CSV files (train, test, sample submission),
# all hosted in the same GitHub repository.
train_url, test_url, submission_url = (
    'https://github.com/sidt-ai/AV_hackathons/blob/main/AmExpert_2021/train.csv?raw=true',
    'https://raw.githubusercontent.com/sidt-ai/AV_hackathons/main/AmExpert_2021/test.csv',
    'https://raw.githubusercontent.com/sidt-ai/AV_hackathons/main/AmExpert_2021/sample_submission.csv',
)

In [3]:
# Read the three CSVs directly from their URLs, in the order train, test, submission.
train, test, submission = (
    pd.read_csv(url) for url in (train_url, test_url, submission_url)
)

In [4]:
# The holdings columns arrive as text and are parsed here into Python lists
# (presumably strings such as "['P1', 'P2']" — confirm against the CSV).
# `ast.literal_eval` only accepts Python literals, so unlike `eval` it cannot
# execute arbitrary code embedded in a downloaded file.
train['Product_Holding_B1'] = train['Product_Holding_B1'].apply(ast.literal_eval)
train['Product_Holding_B2'] = train['Product_Holding_B2'].apply(ast.literal_eval)
# Only the B1 column is parsed for the test frame.
test['Product_Holding_B1'] = test['Product_Holding_B1'].apply(ast.literal_eval)

In [5]:
def expand_list_cols(df, col):
    """Expand a column of product-code lists into one count column per product.

    For each of the 22 product codes ('P00', 'P1', ..., 'P21') a column is
    added to `df` holding how many times that code occurs in the row's list
    (0 when it does not occur, including for rows whose list is empty).
    Columns derived from 'Product_Holding_B2' are prefixed with 't_' to mark
    them as targets; any other column gets no prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `col`; it is modified in place.
    col : str
        Name of the column whose cells are lists of product codes.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the product columns added.
    """
    prefix = 't_' if col == 'Product_Holding_B2' else ''  # 't_' signifies target
    product_codes = ['P00'] + ['P' + str(i) for i in range(1, 22)]

    # One row per (original row, product) pair; empty lists explode to NaN.
    exploded = df[col].explode()
    counts = pd.crosstab(exploded.index, exploded)

    # crosstab omits rows that only contained NaN and products that never
    # occur, so put the table on the full row/product grid and fill the gaps
    # with 0. This replaces the former blanket `except:` fallback and also
    # covers a missing 'P00'.
    counts = counts.reindex(index=df.index, columns=product_codes, fill_value=0)

    for code in product_codes:
        df[prefix + code] = counts[code]

    return df

# Expand current holdings (B1) for both frames; the B2 column is expanded for train only.
for _list_col in ('Product_Holding_B1', 'Product_Holding_B2'):
    train = expand_list_cols(train, _list_col)
test = expand_list_cols(test, 'Product_Holding_B1')

In [6]:
# Age -> AgeRange: bucket ages with the explicit interval edges below.
bins = pd.IntervalIndex.from_tuples([(0, 23), (23, 32), (32,43), (43, 52), (52, 60), (60,100)])
for frame in (train, test):
    frame['AgeRange'] = pd.cut(frame['Age'], bins, ordered=True)

# Vintage -> VintageRange: same approach with vintage-specific edges.
bins = pd.IntervalIndex.from_tuples([(0, 9), (9, 14), (14,20), (20, 25), (25, 40), (40,100)])
for frame in (train, test):
    frame['VintageRange'] = pd.cut(frame['Vintage'], bins, ordered=True)

# Number of current holdings = length of each row's B1 product list.
for frame in (train, test):
    frame['Num_Current_Holdings'] = frame['Product_Holding_B1'].apply(len)

In [7]:
def encode(df, cols, categories=None):
    """Replace each column in `cols` with integer category codes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the same object is returned.
    cols : iterable of str
        Names of the columns to encode.
    categories : dict, optional
        Maps a column name to the complete sequence of category values to use
        for it. Passing the same mapping for several frames guarantees that a
        given value receives the same code in each of them. Columns missing
        from the mapping (or every column when this is None) fall back to the
        categories inferred from `df` alone.

    Returns
    -------
    pandas.DataFrame
        `df` with the listed columns replaced by their codes. Missing values,
        and values not present in the supplied categories, are coded as -1.
    """
    for col in cols:
        if categories is not None and col in categories:
            target_dtype = pd.CategoricalDtype(categories=categories[col])
        else:
            target_dtype = 'category'
        df[col] = df[col].astype(target_dtype).cat.codes
    return df

cols_to_encode = ['Gender', 'AgeRange', 'VintageRange', 'Is_Active', 'Customer_Category', 'City_Category']

# A code is the position of a value in its column's category list. Inferring
# that list separately for each frame would shift the codes whenever a value
# occurs in only one of train/test, so build each list once from both frames.
shared_categories = {
    col: pd.concat([train[col], test[col]], ignore_index=True).astype('category').cat.categories
    for col in cols_to_encode
}
train = encode(train, cols_to_encode, shared_categories)
test = encode(test, cols_to_encode, shared_categories)

In [8]:
# Force a garbage-collection pass; as the cell's last expression, the value
# returned by gc.collect() is shown as the cell output.
gc.collect()

In [9]:
# Target columns are the ones carrying the 't_' prefix.
targets = [name for name in train.columns if name.startswith('t_')]

# Model inputs: every remaining column except the id/fold columns and the raw list columns.
non_feature_cols = {'Customer_ID', 'fold', 'Product_Holding_B1', 'Product_Holding_B2'}
features = [
    name for name in train.columns
    if not (name in non_feature_cols or name in targets)
]

# Features other than the three numeric ones are treated as categorical.
numeric_cols = {'Age', 'Vintage', 'Num_Current_Holdings'}
cat_features = [name for name in features if name not in numeric_cols]

In [10]:
# Two-stage scaling: standardise each feature, then rescale with MinMaxScaler.
preprocessor = Pipeline(steps=[
    ('standard', StandardScaler()),
    ('minmax', MinMaxScaler()),
])

# Hold out 20% of the training rows for validation, stratified on the age bucket.
xtrain, xval, ytrain, yval = train_test_split(
    train[features],
    train[targets],
    test_size=0.2,
    stratify=train['AgeRange'],
    random_state=SEED,
)

# Fit the scalers on the training split only, then reuse them for validation and test.
xtrain = preprocessor.fit_transform(xtrain)
xval = preprocessor.transform(xval)
xtest = preprocessor.transform(test[features])

gc.collect()

# Displayed as the cell output: shapes of the three scaled arrays.
(xtrain.shape, xval.shape, xtest.shape)

In [11]:
# Network input width and training hyper-parameters.
input_shape = xtrain.shape[1]
PATIENCE, MIN_DELTA = 20, 0.0002
BATCH_SIZE, EPOCHS = 64, 100

# Input batch-norm, two identical hidden blocks (Dense -> BatchNorm -> Dropout),
# then one sigmoid unit per target column.
model = keras.Sequential()
model.add(layers.BatchNormalization(input_shape=[input_shape]))
for _hidden_block in range(2):
    model.add(layers.Dense(units=32, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.5))
model.add(layers.Dense(units=22, activation='sigmoid'))

# Each output is an independent yes/no prediction, hence binary cross-entropy.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

# Stop once the monitored metric has not improved by MIN_DELTA for PATIENCE
# epochs, and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    min_delta=MIN_DELTA,
    patience=PATIENCE,
    restore_best_weights=True,
)

In [12]:
# Train on the training split, evaluating on the validation split each epoch;
# the returned History object is kept in `history`.
history = model.fit(
    x=xtrain,
    y=ytrain,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(xval, yval),
    callbacks=[early_stopping],
    verbose=1,
)

In [18]:
# Build the per-epoch metrics table here: no visible cell defines `history_df`,
# so on a fresh kernel the plot below would otherwise raise NameError.
# `history.history` maps each metric name to its list of per-epoch values.
history_df = pd.DataFrame(history.history)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: training vs validation loss.
for metric in ('loss', 'val_loss'):
    ax1.plot(history_df[metric], label=metric)
ax1.set_title('Loss')
ax1.set_xlabel('Epoch')
ax1.legend()

# Right panel: training vs validation binary accuracy.
for metric in ('binary_accuracy', 'val_binary_accuracy'):
    ax2.plot(history_df[metric], label=metric)
ax2.set_title('Accuracy')
ax2.set_xlabel('Epoch')
ax2.legend()

plt.show()

In [14]:
# Score the scaled test features; each row of `predictions` holds the model's
# 22 sigmoid outputs for one test row.
predictions = model.predict(xtest)

In [15]:
# For every test row keep the three highest-scoring output positions, best
# first, and translate each position into its product code
# (position 0 -> 'P00', position k -> 'P<k>').
result = []
for row_idx in range(test.shape[0]):
    best_first = np.argsort(predictions[row_idx])[::-1]
    top_positions = best_first[:3]
    codes = []
    for pos in top_positions:
        codes.append('P00' if pos == 0 else 'P' + str(pos))
    result.append(codes)

In [16]:
# Store the top-3 product lists in the submission frame, one list per row.
# NOTE(review): assumes the sample submission rows are in the same order as `test` — confirm.
submission['Product_Holding_B2'] = result

In [17]:
# Write the submission to the working directory, omitting the DataFrame index column.
submission.to_csv('sub3_nn_v1.csv', index=False)

In [19]:
# Sanity check: print the first lines of the file just written (shell `head`).
!head sub3_nn_v1.csv