In [None]:
import pandas as pd 
import numpy as np
from random import seed
import seaborn as sns
import copy
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale, LabelEncoder
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool
from tqdm import tqdm_notebook as tqdm
import time

### Loading the datasets and setting relevant parameters

In [None]:
# Fix the state of both global random number generators (NumPy's and the
# standard library's) so repeated runs draw the same random values.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
seed(RANDOM_SEED)

In [None]:
# Read the training and test tables from the shared input folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/Train.csv", "../input/Test.csv")
)

In [None]:
train.head()

In [None]:
train.tail()

In [None]:
test.tail()

In [None]:
# Load the sample submission and preview its last rows to see the expected layout.
ss = pd.read_csv(filepath_or_buffer="../input/SampleSubmission.csv")
ss.tail(n=5)

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() raises on
# object columns instead of silently dropping them. The matrix is computed once
# and reused for both the mask and the plot.
train_corr = train.select_dtypes(include="number").corr()

# Hide the diagonal and the upper triangle, which mirror the lower triangle.
# The mask is built from the matrix shape rather than from its values, so a
# correlation of exactly 0 above the diagonal is hidden as well.
matrix = np.triu(np.ones(train_corr.shape, dtype=bool))

cmap = sns.color_palette("Blues", desat=0.45)
plt.figure(figsize=(20, 12))
sns.heatmap(train_corr, annot=True, mask=matrix, cmap=cmap, cbar=False)
plt.title("Train set correlation matrix", fontdict={'fontsize':20})
plt.show()

### Creating a list of all the products labeled 1 in the test set, in a format similar to the submission file's <br>"ID X PCODE" column

In [None]:
# The first 8 columns are treated as customer metadata (the identifier sits in
# the first one); every column from position 8 onwards is a product flag.
product_columns = test.columns[8:]

# Build one "<ID> X <product code>" string for every product flagged 1 in the
# test set, in row order and then column order. The identifier is read by
# position with .iloc because integer keys on a label-indexed Series (row[0])
# are deprecated in recent pandas releases.
test_true_values = [
    customer_id + ' X ' + product_code
    for customer_id, flags in zip(test.iloc[:, 0], test.iloc[:, 8:].to_numpy())
    for product_code in product_columns[flags == 1]
]
print(len(test_true_values))

In [None]:
display(test_true_values[:10])

### Converting the multilabel dataset into a single label dataset using the copy method.

In [None]:
def multilabel_to_singlelabel(dataset, first_multilabel_col):
    """Expand a multilabel frame into one row per (record, positive label).

    For every input row and every label column whose value equals 1, one
    output row is produced. It holds the row's metadata, a copy of the label
    values with that one label reset to 0, and the name of that label in a
    trailing ``new_prediction`` column. Rows with no label equal to 1 produce
    no output.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; columns before ``first_multilabel_col`` are metadata and
        the remaining columns are label flags.
    first_multilabel_col : int
        Position of the first label column.

    Returns
    -------
    pandas.DataFrame
        Columns are the input columns, in order, followed by
        ``new_prediction``. The frame is empty (but still has these columns)
        when no label equals 1 anywhere.
    """
    # Output names are derived from the input so the function works for any
    # metadata/label layout instead of one hard-coded list of product codes.
    data_columns = list(dataset.columns)
    label_names = data_columns[first_multilabel_col:]
    single_label = []

    # Plain tuples keep each column's own values (no row-wise dtype upcasting).
    for record in dataset.itertuples(index=False, name=None):
        customer_metadata = list(record[:first_multilabel_col])
        product_labels = list(record[first_multilabel_col:])

        for position, flag in enumerate(product_labels):
            if flag != 1:
                continue
            # Hide the label that becomes the target; the other flags stay as features.
            transformed_labels = list(product_labels)
            transformed_labels[position] = 0
            single_label.append(customer_metadata + transformed_labels + [label_names[position]])

    # Passing columns to the constructor also covers the empty case, where
    # assigning names to a zero-column frame would raise.
    return pd.DataFrame(single_label, columns=data_columns + ['new_prediction'])

### Train Dataset transformed to single labels using the copy-weight method

In [None]:
# Label columns start at position 8; the columns before that are passed through as metadata.
train = multilabel_to_singlelabel(dataset=train, first_multilabel_col=8)
display(train.head())

### Test Dataset transformed to single labels using the copy-weight method

In [None]:
display(test.head())

## Feature engineering<br>
### Converting the date into separate day, month and year columns.

In [None]:
def transform_date(data_frame):
    """Replace ``join_date`` with ``year_joined``, ``month_joined`` and ``day_joined``.

    The frame is modified in place: ``join_date`` is parsed as day/month/year,
    its three components are stored as new columns, and ``join_date`` itself is
    dropped. Missing dates yield missing components.

    Calling the function again on a frame that was already transformed (no
    ``join_date`` column, all three derived columns present) is a no-op, so the
    calling cell can be re-run safely.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with a ``join_date`` column formatted as ``%d/%m/%Y``.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame, returned for convenience.
    """
    derived_columns = ['year_joined', 'month_joined', 'day_joined']
    already_done = ('join_date' not in data_frame.columns
                    and set(derived_columns) <= set(data_frame.columns))
    if already_done:
        return data_frame

    join_date = pd.to_datetime(data_frame['join_date'], format="%d/%m/%Y")
    data_frame['year_joined'] = join_date.dt.year
    data_frame['month_joined'] = join_date.dt.month
    data_frame['day_joined'] = join_date.dt.day

    # An in-place drop returns None, so its result must not be assigned to anything.
    data_frame.drop(columns=['join_date'], inplace=True)
    return data_frame

In [None]:
# Split the join date into year/month/day columns on both frames and preview each result.
for banner, frame in (("Transformed train dataset", train),
                      ("Transformed test dataset", test)):
    print(banner)
    transform_date(frame)
    display(frame.head())

### Fixing missing values

In [None]:
# Count missing values per column, first for the train frame and then for the test frame.
for heading, frame in (('Train null values\n', train), ('Test null values\n', test)):
    print(heading)
    print(frame.isnull().sum())

In [None]:
# Columns derived from the join date; gaps are filled with the mean of the
# same column within the same frame (train uses train means, test uses test means).
miss_val_list = ['year_joined', 'month_joined', 'day_joined']
for frame in (train, test):
    for val in miss_val_list:
        column_mean = frame[val].mean()
        frame[val] = frame[val].fillna(value=column_mean)

# Re-check the per-column null counts after imputation.
for heading, frame in (("Train set\n", train), ("Test set\n", test)):
    print(heading)
    print(frame.isnull().sum())

In [None]:
train.dtypes

In [None]:
test.dtypes

### Adding the age in which the customer joined

In [None]:
# Reference year for the customer's current age, taken once so both frames agree.
current_year = pd.to_datetime('now').year

for frame in (train, test):
    # Age of the customer in the year they joined.
    frame['age_joined'] = frame['year_joined'] - frame['birth_year']
    # Current age. This was previously computed with the same expression as
    # age_joined, which made the two columns exact duplicates.
    frame['age'] = current_year - frame['birth_year']

### Adding the number of years as a customer

In [None]:
# Evaluate "now" a single time so the train and test frames are guaranteed to
# use the same reference year. Note that this feature depends on the year in
# which the notebook is executed.
current_year = pd.to_datetime('now').year
train['years_as_customer'] = current_year - train['year_joined']
test['years_as_customer'] = current_year - test['year_joined']

In [None]:
# Preview both frames after the feature-engineering steps.
for heading, frame in (('Training set head', train), ('Testing set head', test)):
    print(heading)
    display(frame.head())

### Encoding categorical data
#### Finding differences in values between the training set and the testing set

In [None]:
list_to_encode = ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code']
dataframes = [train, test]
print("Differences in categorical values between the train set and the test set.\n")

# For each categorical column, list the values that occur in the test set but
# not in the train set; print 0 when there are none.
for column in list_to_encode:
    test_only_values = set(test[column]).difference(train[column])
    print(column, ":", list(test_only_values) if test_only_values else 0)
    print("-------------------------------------------------------------------------------------------\n")

The categories sex, marital_status, branch_code, occupation_code and occupation_category_code have values in the testing set that are absent from the train set.

#### Combining the two datasets and then encoding the combined dataset

In [None]:
# Tag each frame with its origin (1 = train, 0 = test) so the stacked frame
# can be split back into the two sets after encoding.
train['train'], test['train'] = 1, 0

combined = pd.concat(objs=[train, test])

In [None]:
list_to_encode = ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code']

# One-hot encode only the categorical columns listed above. An earlier call
# that encoded the whole frame was removed: its result was overwritten straight
# away, and it also expanded every other object column (ID included) into
# dummy columns for nothing.
# dtype is pinned to uint8 so the dummy columns are numeric on every pandas
# version (pandas >= 2.0 would otherwise produce bool columns).
combined_one_hot = pd.get_dummies(combined[list_to_encode], dtype=np.uint8)

# Swap the raw categorical columns for their dummy columns.
combined = combined.drop(columns=list_to_encode)
combined = pd.concat([combined, combined_one_hot], axis=1)

In [None]:
# Split the encoded frame back into train and test using the origin flag, then
# discard the flag. drop() returns a new frame here; dropping in place on a
# filtered slice of `combined` would raise SettingWithCopyWarning.
train = combined[combined['train'] == 1].drop(columns=['train'])
test = combined[combined['train'] == 0].drop(columns=['train'])

display(train.tail(10))
display(test.tail(10))

In [None]:
# The target is the single-label column; the model inputs are every other
# train column except the identifier.
target = pd.DataFrame(train.loc[:, ['new_prediction']])
features = train.drop(columns=['ID', 'new_prediction'])

# The test frame keeps its ID column and only loses the label column.
test = test.drop(columns='new_prediction')

for preview in (features.tail(), target.head(), test.tail()):
    display(preview)

In [None]:
# Map each label in new_prediction to an integer class id; `le` is kept so the
# ids can be turned back into the original labels after prediction.
le = LabelEncoder()

# Assign the whole column rather than writing through .loc[:, ...]: on recent
# pandas versions the .loc form writes into the existing object-dtype column,
# leaving integer ids stored as objects instead of an integer column.
target['new_prediction'] = le.fit_transform(target['new_prediction'])

### Creating the training and validation sets 

In [None]:
def create_train_valid_pool(features, target, test_size=0.15):
    """Split the data and wrap both parts in CatBoost ``Pool`` objects.

    Parameters
    ----------
    features : pandas.DataFrame
        Model inputs.
    target : pandas.DataFrame
        Labels aligned with ``features``.
    test_size : float, default 0.15
        Fraction of rows held out for validation.

    Returns
    -------
    tuple
        ``(train_pool, val_pool)``.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=test_size, random_state=42
    )
    # The training pool must hold only the training part of the split. It was
    # previously built from the full data, so every validation row was also a
    # training row and the validation metric was measured on seen data.
    train_pool = Pool(X_train, y_train)
    val_pool = Pool(X_val, y_val)
    return train_pool, val_pool

def create_train_valid_set(features, target):
    """Return ``(X_train, X_val, y_train, y_val)`` from a fixed-seed 75/25 split."""
    split_parts = train_test_split(features, target, test_size=0.25, random_state=42)
    return tuple(split_parts)

In [None]:
train_pool, val_pool = create_train_valid_pool(features, target)
X_train, X_val, y_train, y_val = create_train_valid_set(features, target)

start_time = time.time()

# Settings for the first model, trained on the full feature set.
# An earlier iteration count of 10000 is noted in the original cell.
catboost_params = dict(
    iterations=5000,
    random_state=42,
    learning_rate=0.0245,
    task_type='GPU',
    devices='0',
    verbose=True,
)
clf1 = CatBoostClassifier(**catboost_params)
model = clf1.fit(train_pool, eval_set=val_pool, plot=False)

elapsed_minutes = (time.time() - start_time) / 60
print('Total training time:', elapsed_minutes, 'minutes.')

In [None]:
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns

# Rank features from highest to lowest score (ties ordered by name, descending).
ranked_pairs = sorted(zip(feature_importances, feature_names), reverse=True)
fi_names = [[name, score] for score, name in ranked_pairs]

fi = pd.DataFrame(fi_names)
fi.columns = ['Feature', 'Score']

# Names of the features whose importance score is zero or below.
low_fi_list = fi.loc[fi['Score'] <= 0.000, 'Feature'].tolist()

In [None]:
low_fi_list

In [None]:
#low_fi_list.reset_index(drop=True, inplace=True)
fi = pd.DataFrame(fi)
fi.to_csv('fi.csv', index=False)

In [None]:
# Remove the low-importance features from both the training inputs and the test frame.
features_2, test_2 = (
    frame.drop(columns=low_fi_list) for frame in (features, test)
)

train_pool, val_pool = create_train_valid_pool(features_2, target)

In [None]:
start_time = time.time()

# Same settings as the first run; only the pools (reduced feature set) differ.
# An earlier iteration count of 10000 is noted in the original cell.
catboost_params = dict(
    iterations=5000,
    random_state=42,
    learning_rate=0.0245,
    task_type='GPU',
    devices='0',
    verbose=True,
)
clf1 = CatBoostClassifier(**catboost_params)
model = clf1.fit(train_pool, eval_set=val_pool, plot=False)

In [None]:
X_test = test_2

# The identifier is not a model feature, so it is removed before predicting.
proba = model.predict_proba(X_test.drop(columns=['ID']))
y_test = pd.DataFrame(proba)

# Name each probability column after the class the model reports for that
# position, instead of assuming that column i holds encoded class i. The
# encoded ids are then translated back to the original labels.
y_test.columns = le.inverse_transform(np.asarray(model.classes_, dtype=int))

In [None]:
display(y_test.tail())

In [None]:
X_test.tail()

### Creating the submission file

In [None]:
# A set gives constant-time membership tests. Checking every submission row
# against the original list rescanned the list once per row.
owned_pairs = set(test_true_values)

test = test_2

# Pull the values out as arrays once instead of doing a pandas .iloc lookup
# for every single (row, column) cell.
customer_ids = test['ID'].to_numpy()
probabilities = y_test.to_numpy()
product_codes = list(y_test.columns)

# One [key, probability] pair per (test row, predicted class), where the key
# is "<ID> X <class label>".
prediction_list = []
for row in tqdm(range(y_test.shape[0])):
    ID = customer_ids[row]
    prediction_list.extend(
        [ID + ' X ' + column, probability]
        for column, probability in zip(product_codes, probabilities[row])
    )

prediction_df = pd.DataFrame(prediction_list, columns=['ID X PCODE', 'Label'])

# Pairs that are already flagged 1 in the test data get a label of exactly 1.0;
# every other pair keeps the model's probability.
cleaned_prediction_df = prediction_df.copy()
is_owned = cleaned_prediction_df['ID X PCODE'].isin(owned_pairs)
cleaned_prediction_df.loc[is_owned, 'Label'] = 1.0

In [None]:
cleaned_prediction_df.tail()

In [None]:
# Renumber the rows from 0 and write the submission file without the index column.
cleaned_prediction_df = cleaned_prediction_df.reset_index(drop=True)
cleaned_prediction_df.to_csv('submission_final_84.csv', index=False)

# Minutes elapsed since start_time was last set.
elapsed_minutes = (time.time() - start_time) / 60
print('Total training time:', elapsed_minutes, 'minutes.')

### References <br>
1. Gibaja, Eva & Ventura, Sebastian. (2015). A Tutorial on Multi-Label Learning. ACM Computing Surveys, 47. doi:10.1145/2716262. <br>
https://www.researchgate.net/publication/270337594_A_Tutorial_on_Multi-Label_Learning
2. Modelling tabular data with CatBoost and NODE<br>
https://towardsdatascience.com/modelling-tabular-data-with-catboost-and-node-929bfbaaeb08
3. Zimnat Recommendation Challenge<br>
https://github.com/Tixonmavrin/Zindi-Zimnat-Insurance-Recommendation-Challenge/blob/master/Baseline1.ipynb
4. Exploring Embeddings for Categorical Variables with Keras<br>
http://flovv.github.io/Embeddings_with_keras/