# AmExpert Hackathon 2021

## Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# For reading columns with list
from ast import literal_eval

# For Encoding
from sklearn.preprocessing import LabelEncoder

# For Machine Learning Models
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# For Cross Validation
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

## Importing the Datasets

In [2]:
train_original = pd.read_csv('train_go05W65.csv',converters={'Product_Holding_B1': literal_eval,'Product_Holding_B2': literal_eval})
test_original = pd.read_csv('test_VkM91FT.csv',converters={'Product_Holding_B1': literal_eval})

# Making a copy of the datasets
train = train_original.copy()
test = test_original.copy()

In [3]:
train['Product_Holding_B1'][0]

['P16']

## Combining Train and Test Datasets

In [4]:
df = pd.concat([train, test],axis = 0).reset_index(drop = True)
df = pd.concat([train.assign(ind='train'),test.assign(ind='test'),], axis = 0)

## Data Exploration

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58075 entries, 0 to 20326
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Customer_ID         58075 non-null  object
 1   Gender              58075 non-null  object
 2   Age                 58075 non-null  int64 
 3   Vintage             58075 non-null  int64 
 4   Is_Active           58075 non-null  int64 
 5   City_Category       58075 non-null  object
 6   Customer_Category   58075 non-null  object
 7   Product_Holding_B1  58075 non-null  object
 8   Product_Holding_B2  37748 non-null  object
 9   ind                 58075 non-null  object
dtypes: int64(3), object(7)
memory usage: 4.9+ MB


In [6]:
df.describe()

Unnamed: 0,Age,Vintage,Is_Active
count,58075.0,58075.0,58075.0
mean,38.460146,19.585674,0.264572
std,10.11105,10.252426,0.441109
min,24.0,2.0,0.0
25%,29.0,13.0,0.0
50%,37.0,16.0,0.0
75%,47.0,23.0,1.0
max,59.0,80.0,1.0


In [7]:
df.describe(include = 'object')

Unnamed: 0,Customer_ID,Gender,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2,ind
count,58075,58075,58075,58075,58075,37748,58075
unique,58075,2,2,3,738,495,2
top,CC264719,Male,C1,S3,[P13],[P00],train
freq,1,39757,29239,27474,11457,5908,37748


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58075 entries, 0 to 20326
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Customer_ID         58075 non-null  object
 1   Gender              58075 non-null  object
 2   Age                 58075 non-null  int64 
 3   Vintage             58075 non-null  int64 
 4   Is_Active           58075 non-null  int64 
 5   City_Category       58075 non-null  object
 6   Customer_Category   58075 non-null  object
 7   Product_Holding_B1  58075 non-null  object
 8   Product_Holding_B2  37748 non-null  object
 9   ind                 58075 non-null  object
dtypes: int64(3), object(7)
memory usage: 4.9+ MB


In [9]:
## Missing Value Imputation
df['Product_Holding_B2'].fillna('N', inplace=True)

### Creating separate columns for each element of the list in the product holding columns and concatenating the dataframes

The list in the column Product_Holding_B1 and Product_Holding_B1 is split into separate columns into a new dataframe. 
These two dataframes are then merged to the original dataframe. 

In [10]:
df_phb1 = pd.DataFrame([{x: y for x, y in enumerate(item)} for item in df['Product_Holding_B1'].values.tolist()], index=df.index)
df_phb1 = df_phb1.add_prefix('HB1_Product')

df_phb2 = pd.DataFrame([{x: y for x, y in enumerate(item)} for item in df['Product_Holding_B2'].values.tolist()], index=df.index)
df_phb2 = df_phb2.add_prefix('HB2_Product')

In [11]:
df_phb2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58075 entries, 0 to 20326
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   HB2_Product0  58075 non-null  object
 1   HB2_Product1  10171 non-null  object
 2   HB2_Product2  3108 non-null   object
 3   HB2_Product3  801 non-null    object
 4   HB2_Product4  197 non-null    object
 5   HB2_Product5  29 non-null     object
 6   HB2_Product6  3 non-null      object
dtypes: object(7)
memory usage: 3.5+ MB


In [12]:
## Joining  the dataframes to the train dataset
concat_df = pd.concat([df, df_phb1,df_phb2], axis=1)

In [13]:
# Dropping columns
concat_df.drop(['Product_Holding_B1','Product_Holding_B2'],inplace = True, axis = 1)

## Missing Value Imputation

In [16]:
concat_df.isnull().sum()

Customer_ID              0
Gender                   0
Age                      0
Vintage                  0
Is_Active                0
City_Category            0
Customer_Category        0
ind                      0
HB1_Product0             0
HB1_Product1         30204
HB1_Product2         49388
HB1_Product3         55541
HB1_Product4         57466
HB1_Product5         57956
HB1_Product6         58057
HB1_Product7         58071
HB2_Product0             0
HB2_Product1         47904
HB2_Product2         54967
HB2_Product3         57274
HB2_Product4         57878
HB2_Product5         58046
HB2_Product6         58072
dtype: int64

In [17]:
# Imputing null values for the entire dataset
concat_df.fillna('N', inplace=True)


## Encoding Categorical Variables

In [18]:
concat_df.select_dtypes(include=['object']).T.apply(lambda x: x.nunique(), axis=1)

Customer_ID          58075
Gender                   2
City_Category            2
Customer_Category        3
ind                      2
HB1_Product0            22
HB1_Product1            18
HB1_Product2            15
HB1_Product3            12
HB1_Product4             9
HB1_Product5             4
HB1_Product6             3
HB1_Product7             2
HB2_Product0            20
HB2_Product1            18
HB2_Product2            16
HB2_Product3            13
HB2_Product4             9
HB2_Product5             6
HB2_Product6             3
dtype: int64

### Encoding nominal variables with few categories

In [19]:
## Encoding nominal variables with few categories
df_dm = pd.get_dummies(concat_df, columns= ['Gender','City_Category','Customer_Category'], prefix_sep='_', drop_first=True)

### Encoding columns with high cardinality

Although LabelEncoder is used only for target variables, it is used here for User_ID column as other target encoders cannot be used. Target encoding will fail here as the target variable is multilabel. 

In [20]:
## 1. Encoding the User_ID column
# Initialising the encoder
le1 = LabelEncoder()

## 1. Encoding Customer_ID Column
df_le = df_dm

df_le['Customer_ID'] = le1.fit_transform(df_le['Customer_ID'])

In [21]:
df_le.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58075 entries, 0 to 20326
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Customer_ID           58075 non-null  int32 
 1   Age                   58075 non-null  int64 
 2   Vintage               58075 non-null  int64 
 3   Is_Active             58075 non-null  int64 
 4   ind                   58075 non-null  object
 5   HB1_Product0          58075 non-null  object
 6   HB1_Product1          58075 non-null  object
 7   HB1_Product2          58075 non-null  object
 8   HB1_Product3          58075 non-null  object
 9   HB1_Product4          58075 non-null  object
 10  HB1_Product5          58075 non-null  object
 11  HB1_Product6          58075 non-null  object
 12  HB1_Product7          58075 non-null  object
 13  HB2_Product0          58075 non-null  object
 14  HB2_Product1          58075 non-null  object
 15  HB2_Product2          58075 non-null

In [22]:
df_le['HB2_Product1'].value_counts()

N      47904
P8      2560
P12     2533
P7      1051
P10      936
P6       784
P4       528
P5       448
P9       434
P3       344
P13      261
P16      178
P11       68
P14       13
P2        11
P15       11
P17       10
P18        1
Name: HB2_Product1, dtype: int64

In [23]:
## 2. Encoding the categorical columns derived from Product_Holding_B1 and Product_Holding_B2 columns

# Initialising the encoder
le2 = LabelEncoder()

# Fitting the encoder to the column with highest number of classes
df_le['HB1_Product0'] = le2.fit_transform(df_le['HB1_Product0'])

# Creating a new dataframe to select list of object columns excluding 'ind'
df_le2 = df_le
df_le2 = df_le.drop('ind',axis = 1)
object_cols = df_le2.select_dtypes(include=['object'])

# Mapping unknown levels to the encoder
df_le['HB1_Product1'] = df_le['HB1_Product1'].map(lambda s: 'N' if s not in le2.classes_ else s)
le2.classes_ = np.append(le2.classes_, 'N')

# Creating a new dataframe to select list of object columns excluding 'ind'
df_le2 = df_le
df_le2 = df_le.drop('ind',axis = 1)
object_cols = df_le2.select_dtypes(include=['object'])

# Transforming the rest of the object columns with the encoder
for col in object_cols:
    df_le[col] = le2.transform(df_le[col])

In [24]:
df_le['HB2_Product0'].value_counts()

22    20327
20     7580
0      5908
4      4101
18     3649
1      3310
8      3270
16     2354
15     2142
2      1322
5      1124
21     1028
19     1013
17      616
12      195
3       111
7        19
13        4
6         1
9         1
Name: HB2_Product0, dtype: int64

## Splitting into train and test datasets

In [25]:
X_test, X_train = df_le[df_le['ind'].eq('test')], df_le[df_le['ind'].eq('train')]

In [26]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37748 entries, 0 to 37747
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Customer_ID           37748 non-null  int32 
 1   Age                   37748 non-null  int64 
 2   Vintage               37748 non-null  int64 
 3   Is_Active             37748 non-null  int64 
 4   ind                   37748 non-null  object
 5   HB1_Product0          37748 non-null  int32 
 6   HB1_Product1          37748 non-null  int32 
 7   HB1_Product2          37748 non-null  int32 
 8   HB1_Product3          37748 non-null  int32 
 9   HB1_Product4          37748 non-null  int32 
 10  HB1_Product5          37748 non-null  int32 
 11  HB1_Product6          37748 non-null  int32 
 12  HB1_Product7          37748 non-null  int32 
 13  HB2_Product0          37748 non-null  int32 
 14  HB2_Product1          37748 non-null  int32 
 15  HB2_Product2          37748 non-null

In both columns Product_Holding_B1 and Product_Holding_B2, we can see instances of customers having upto 6 to 7 products. However, we will be predicting only the top 3 products for each customer. Hence, only the top 3 product columns for Holding_B1 and Holding_B2 are retained. 

In [27]:
# Retaining the top 3 products for Holding B1 and B2 and dropping the rest (Train dataset)
X_train.drop(['HB1_Product3','HB1_Product4','HB1_Product5','HB1_Product6','HB1_Product7','HB2_Product3','HB2_Product4','HB2_Product5',
'HB2_Product6','ind'], axis = 1, inplace = True)

# Retaining the top 3 products for Holding B1 and dropping the rest (Test dataset)
X_test.drop(['HB1_Product3','HB1_Product4','HB1_Product5','HB1_Product6','HB1_Product7','HB2_Product3','HB2_Product4','HB2_Product5',
'HB2_Product6','HB2_Product0','HB2_Product1','HB2_Product2','ind'], axis = 1, inplace = True)

### Splitting Train dataset into X and y

In [28]:
X = X_train.drop(columns = ['HB2_Product0','HB2_Product1','HB2_Product2'], axis=1)
y= X_train[['HB2_Product0','HB2_Product1','HB2_Product2']]

In [29]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37748 entries, 0 to 37747
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   HB2_Product0  37748 non-null  int32
 1   HB2_Product1  37748 non-null  int32
 2   HB2_Product2  37748 non-null  int32
dtypes: int32(3)
memory usage: 737.3 KB


## Fitting and Evaluating the Machine Learning Models on the Train Dataset

To predict values in a multilabel classification, MultiOutputClassifier is used. This will result in a 2D array as an output instead of 1D array in the usual classifier. 

In [32]:
# 3. Random Forest Classifier
rf_model = MultiOutputClassifier(RandomForestClassifier())
rf_score = cross_val_score(rf_model, X, y, cv=5)
acc_rf = rf_score.mean()
print(" Accuracy for Random Forest Classifier:", acc_rf)


 Accuracy for Random Forest Classifier: 0.6184699977805052


In [33]:
# 4. XGBoost Classifier
xgb_model = MultiOutputClassifier(XGBClassifier(eval_metric='mlogloss'))
xgb_score = cross_val_score(xgb_model, X, y, cv=5)
acc_xgb = xgb_score.mean()
print(" Accuracy for XGBoost Classifier:", acc_xgb)


 Accuracy for XGBoost Classifier: 0.6304176773556254


XGBoost Classifier has the highest accuracy of 0.63%. It is chosen as the final model to predict values.

In [37]:
# Fitting the model
xgb_model.fit(X, y)

MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              eval_metric='mlogloss',
                                              gamma=None, gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraints=None,
                                              learning_rate=None,
                                              max_delta_step=None,
                                              max_depth=None,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                                

In [38]:
# Predicting the value for the test dataset
y_pred = xgb_model.predict(X_test)

# Inverse transforming the predicted values to get the original categories
y_pred = le2.inverse_transform(y_pred.reshape(-1,1)).reshape(y_pred.shape)
  

In [39]:
y_pred

array([['P10', 'N', 'N'],
       ['P8', 'N', 'N'],
       ['P16', 'N', 'N'],
       ...,
       ['P00', 'N', 'N'],
       ['P1', 'P7', 'N'],
       ['P1', 'P8', 'P8']], dtype=object)

In [40]:
# Converting the 2d array into a dataframe with three columns
predictions = pd.DataFrame(y_pred,columns = ['HB2_Product0','HB2_Product1','HB2_Product2'])

# Converting 'N' (none) values into null values
predictions = predictions.replace('N',np.NaN)

In [41]:
# Converting the dataframe columns into a list

def strip_strlist(list_to_strip):
    return [item for item in list_to_strip if type(item) == str]

# Converting the dataframe into a list
pred = predictions.values.tolist()

# Removing nulls from the list
pred_dropna = [strip_strlist(list_entry) for list_entry in pred]

# Map the values to a set
pred_set = list(map(set, pred_dropna))

# Convert the set to a list
pred_list = list(map(list, pred_set))

### Adding Predicted Values to the Submission File

In [42]:
submission = pd.read_csv('sample_submission1.csv')
submission['Product_Holding_B2'] = pred_list
submission.to_csv('sample_submission1.csv', index=False)