In [1]:
# Remote locations of the AmExpert 2021 hackathon files.
# NOTE(review): train_url goes through the github.com "blob ... ?raw=true" form while the
# other two use raw.githubusercontent.com directly; the styles are inconsistent.
train_url = 'https://github.com/sidt-ai/AV_hackathons/blob/main/AmExpert_2021/train.csv?raw=true'
test_url = 'https://raw.githubusercontent.com/sidt-ai/AV_hackathons/main/AmExpert_2021/test.csv'
submission_url = 'https://raw.githubusercontent.com/sidt-ai/AV_hackathons/main/AmExpert_2021/sample_submission.csv'

In [2]:
# Random seed passed as random_state to the train/validation split below.
SEED = 2311

In [61]:
import ast
import time

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import ClassifierChain

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
# from xgboost import XGBClassifier
from lightgbm import LGBMRanker

In [4]:
# Read the three CSVs straight from the URLs defined earlier (requires network access).
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)
# Template frame; its Product_Holding_B2 column is overwritten with predictions later.
submission = pd.read_csv(submission_url)

In [7]:
# The holdings columns arrive as text and are turned into Python objects here
# (presumably list literals such as "['P8', 'P9']" -- confirm against the raw CSV).
# ast.literal_eval only accepts Python literals, so a crafted cell in the downloaded
# file cannot execute arbitrary code the way the built-in eval() would.
train['Product_Holding_B1'] = train['Product_Holding_B1'].apply(ast.literal_eval)
train['Product_Holding_B2'] = train['Product_Holding_B2'].apply(ast.literal_eval)

# The test set only has the current holdings (B1); B2 is what gets predicted.
test['Product_Holding_B1'] = test['Product_Holding_B1'].apply(ast.literal_eval)

In [9]:
# Indicator columns P00, P1..P21 for the products each training customer currently holds.
product_codes = ['P00'] + ['P' + str(i) for i in range(1, 22)]

# explode() yields one row per (customer, product); crosstab counts them per customer.
s1 = train['Product_Holding_B1'].explode()
# reindex guarantees every expected product column and every customer row exists:
# a code that never occurs (or a customer with no holdings) becomes 0 instead of
# raising KeyError / leaving NaN after index alignment.
s1_df = pd.crosstab(s1.index, s1).reindex(index=train.index, columns=product_codes, fill_value=0)

for code in product_codes:
  train[code] = s1_df[code]

In [11]:
# Target indicator columns tP00, tP1..tP21 built from Product_Holding_B2.
product_codes = ['P00'] + ['P' + str(i) for i in range(1, 22)]

s2 = train['Product_Holding_B2'].explode()
# Products that never appear in B2 get an all-zero column via fill_value, replacing the
# previous bare `except:` fallback (which would also have hidden unrelated errors).
# Customers missing from the crosstab are filled with 0 rather than NaN.
s2_df = pd.crosstab(s2.index, s2).reindex(index=train.index, columns=product_codes, fill_value=0)

for code in product_codes:
  train['t' + code] = s2_df[code]  # t -> target variables

In [13]:
# Indicator columns P00, P1..P21 for the products each test customer currently holds.
product_codes = ['P00'] + ['P' + str(i) for i in range(1, 22)]

s3 = test['Product_Holding_B1'].explode()
# The test set is not guaranteed to contain every product code; reindex adds any missing
# column (and any customer row dropped by crosstab) as 0 so the feature matrix always has
# the same columns as the training frame.
s3_df = pd.crosstab(s3.index, s3).reindex(index=test.index, columns=product_codes, fill_value=0)

for code in product_codes:
  test[code] = s3_df[code]

In [15]:
# Bucket Age into six fixed intervals; the same bin edges are applied to both frames.
# Ages that fall outside every interval come back as NaN from pd.cut.
bins = pd.IntervalIndex.from_tuples(
  [(0, 23), (23, 32), (32, 43), (43, 52), (52, 60), (60, 100)]
)
for frame in (train, test):
  frame['AgeRange'] = pd.cut(frame['Age'], bins, ordered=True)

In [16]:
# Bucket Vintage into six fixed intervals, shared by train and test.
# NOTE: this rebinds `bins`, replacing the age intervals defined in the previous cell.
bins = pd.IntervalIndex.from_tuples(
  [(0, 9), (9, 14), (14, 20), (20, 25), (25, 40), (40, 100)]
)
for frame in (train, test):
  frame['VintageRange'] = pd.cut(frame['Vintage'], bins, ordered=True)

In [17]:
# Count how many products each customer currently holds (length of the B1 list).
for frame in (train, test):
  frame['Num_Current_Holdings'] = frame['Product_Holding_B1'].apply(len)

In [18]:
# Integer-encode the categorical columns.
# Train and test must share ONE category list per column: encoding each frame on its own
# assigns codes by the values present in that frame, so a level missing from one frame
# would shift every later code and the same value would mean different things to the model.
categorical_cols = ['Gender', 'AgeRange', 'VintageRange', 'Is_Active', 'City_Category', 'Customer_Category']

for col in categorical_cols:
  # Categories come from the union of both frames. Columns that are already categorical
  # with identical categories (the pd.cut outputs) keep their full category list.
  shared_categories = (
    pd.concat([train[col], test[col]], ignore_index=True)
    .astype('category')
    .cat.categories
  )
  shared_dtype = pd.CategoricalDtype(categories=shared_categories)
  # Missing values are encoded as -1 by .cat.codes.
  train[col] = train[col].astype(shared_dtype).cat.codes
  test[col] = test[col].astype(shared_dtype).cat.codes

In [20]:
# NOTE: disabled experiment -- stratified 5-fold assignment. Nothing in this cell runs;
# a single hold-out split via train_test_split is used further down instead.
# train['fold'] = -1

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# #stratifying based on AgeRange
# for fold, (train_idx, val_idx) in enumerate(skf.split(X=train, y=train.AgeRange)):
#     train.loc[val_idx, 'fold'] = fold

In [21]:
# Target columns are the 'tP..' indicators created from Product_Holding_B2. Matching the
# full 'tP' prefix (rather than just a leading 't') keeps any future column that merely
# starts with 't' from being silently treated as a target.
targets = [f for f in train.columns if str(f).startswith('tP')]

# Everything that is neither an identifier, a raw holdings list, a fold marker nor a target.
non_feature_cols = {'Customer_ID', 'fold', 'Product_Holding_B1', 'Product_Holding_B2', *targets}
features = [f for f in train.columns if f not in non_feature_cols]

# Features treated as categorical: all features except the raw numeric ones.
cat_features = [f for f in features if f not in ('Age', 'Vintage', 'Num_Current_Holdings')]

In [22]:
# Show the three column groups; the blank line between groups comes from the trailing
# '\n' inside the first two strings plus the '\n' separator.
print(
  f'Features: {features}\n',
  f'Categorical features: {cat_features}\n',
  f'Targets: {targets}',
  sep='\n',
)

Features: ['Gender', 'Age', 'Vintage', 'Is_Active', 'City_Category', 'Customer_Category', 'P00', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'AgeRange', 'VintageRange', 'Num_Current_Holdings']

Categorical features: ['Gender', 'Is_Active', 'City_Category', 'Customer_Category', 'P00', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'AgeRange', 'VintageRange']

Targets: ['tP00', 'tP1', 'tP2', 'tP3', 'tP4', 'tP5', 'tP6', 'tP7', 'tP8', 'tP9', 'tP10', 'tP11', 'tP12', 'tP13', 'tP14', 'tP15', 'tP16', 'tP17', 'tP18', 'tP19', 'tP20', 'tP21']


In [23]:
# Hold out 20% of the training rows. Only cat_features are used as model inputs, so
# Age, Vintage and Num_Current_Holdings are not passed to the model.
xtrain, xval, ytrain, yval = train_test_split(
  train[cat_features],
  train[targets],
  test_size=0.2,
  shuffle=True,
  random_state=SEED,
)

# Test design matrix with the same columns as the training inputs.
xtest = test[cat_features]

In [27]:
# Base learner: 5-nearest-neighbours with distance-weighted votes, using all cores.
estimator1 = KNeighborsClassifier(
  n_neighbors=5,
  weights='distance',
  n_jobs=-1,
)

In [28]:
# Wrap the base learner so that one independent classifier is fitted per target column.
moc1 = MultiOutputClassifier(
  estimator1,
  n_jobs=-1,
)

In [29]:
%%time
# Fit one KNN classifier per target on the training portion of the hold-out split.
moc1.fit(xtrain, ytrain)

MultiOutputClassifier(estimator=KNeighborsClassifier(algorithm='auto',
                                                     leaf_size=30,
                                                     metric='minkowski',
                                                     metric_params=None,
                                                     n_jobs=-1, n_neighbors=5,
                                                     p=2, weights='distance'),
                      n_jobs=-1)

In [83]:
%%time
# Hard labels and class probabilities for the test set. The recorded wall time for this
# cell is close to six minutes; NOTE(review): each call presumably repeats the neighbour
# search, and only the probabilities feed the submission logic further down.
moc1_preds = moc1.predict(xtest)
# Indexed below as moc1_probs[i][:, 0], i.e. one 2-D array per target.
moc1_probs = moc1.predict_proba(xtest)

CPU times: user 5min 10s, sys: 750 ms, total: 5min 11s
Wall time: 5min 48s


In [88]:
# Hard 0/1 predictions as a frame with one column per target.
# NOTE(review): preds_df is not referenced again in the cells visible here.
preds_df = pd.DataFrame(moc1_preds, columns=targets)

In [99]:
# For every target keep column 0 of its probability array -- presumably the probability
# of the "not held" class (label 0); the selection step below treats LOW values as
# "likely to be held". Iterating over moc1_probs itself avoids the hard-coded 22 and
# keeps probs_df from being a list first and a DataFrame afterwards.
first_class_probs = [target_probs[:, 0] for target_probs in moc1_probs]

# Transpose so rows are test customers and columns are targets.
probs_df = pd.DataFrame(np.asarray(first_class_probs).T, columns=targets)

In [106]:
# Turn per-target probabilities into a ranked product list per test customer.
# Rule (unchanged): take every product whose value in probs_df is <= 0.5, ordered from
# lowest to highest value; if none qualifies, take the single lowest-valued product so
# that every customer gets at least one recommendation.
#
# The previous version called subs_df.idxmin(axis=1) -- a scan of the WHOLE frame -- for
# every row and every pick. Ranking each row once gives the same lists at a fraction of
# the cost. sorted() is stable, so ties keep column order exactly as idxmin did.
# The scratch copy `subs_df` is no longer needed and is not created.
product_labels = [str(col)[1:] for col in probs_df.columns]  # 'tP8' -> 'P8'

final_preds = []
for row in probs_df.values:
  ranked = sorted(range(len(product_labels)), key=lambda j: row[j])
  pred = [product_labels[j] for j in ranked if not row[j] > 0.5]
  if not pred:
    # Nothing under the threshold: fall back to the best-ranked product.
    pred = [product_labels[ranked[0]]]
  final_preds.append(pred)

In [108]:
# Overwrite the template column with the predicted product lists.
# NOTE(review): assumes submission rows are in the same order as the test rows -- confirm.
submission['Product_Holding_B2'] = final_preds

In [109]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('sub1_knn.csv', index=False)

In [110]:
# Quick look at the first lines of the written file (shell command; needs a POSIX `head`).
!head sub1_knn.csv

Customer_ID,Product_Holding_B2
CC372708,['P1']
CC216072,['P8']
CC387629,['P16']
CC389228,"['P8', 'P9', 'P12']"
CC394445,['P00']
CC241088,['P8']
CC381551,['P16']
CC238627,['P6']
CC287515,['P8']
