# Data Submission to Kaggle

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [2]:
# Load the training frame; the first csv column becomes the index.
train = pd.read_csv('./data/train_df.csv', index_col=0)

In [3]:
# Load the test frame; the first csv column becomes the index, which
# submit_data reads as the submission ids.
test = pd.read_csv('./data/test_df.csv', index_col=0)

In [4]:
# Encode the country labels as integer class ids; `le` is reused by
# submit_data to decode predicted class indices back into country labels.
le = LabelEncoder()
train['target'] = le.fit_transform(train['country_destination'])
# Fit a label binarizer on the encoded target.
# NOTE(review): `lb` is not used anywhere else in the visible code --
# confirm it is still needed.
lb = LabelBinarizer()
lb.fit(train['target'])

LabelBinarizer()

In [5]:
# Model inputs are every column except the raw and the encoded label;
# the encoded label alone is the training target.
feature = train.drop(columns=['target', 'country_destination'])
target = train['target']

In [10]:
def submit_data(name, pred):
    """Write the five most likely countries per test user as a kaggle csv.

    Each row of ``pred`` is ranked from highest to lowest score, the first
    five class indices are decoded with the module-level ``le`` encoder, and
    one ``(id, country)`` line is emitted per pick. Ids are taken, in order,
    from the index of the module-level ``test`` frame.

    Args:
        name: name of the output csv file, written under ``./data/``
        pred: 2-D array-like of class scores with one row per row of
            ``test`` and one column per encoded class

    Return:
        None. The csv file is written in the required kaggle format with
        the columns ``id`` and ``country``.

    Raises:
        ValueError: if ``pred`` is not 2-D or its row count differs from
            the number of rows in ``test``.
    """
    top_n = 5
    scores = np.asarray(pred)
    n_rows = len(test.index)
    if scores.ndim != 2 or scores.shape[0] != n_rows:
        raise ValueError(
            'pred must be 2-D with one row per test row; got shape {} '
            'for {} test rows'.format(scores.shape, n_rows)
        )

    # A row cannot yield more picks than there are classes.
    picks_per_row = min(top_n, scores.shape[1])

    # Rank all rows at once: ascending argsort, reversed to descending,
    # then only the leading picks are kept and decoded.
    ranked = np.argsort(scores, axis=1)[:, ::-1][:, :picks_per_row]

    # Read the ids straight from the index so the lookup is always
    # positional; integer-keyed Series indexing is label-based (or a
    # deprecated positional fallback) and can return the wrong id.
    ids = np.repeat(test.index.to_numpy(), picks_per_row)
    countries = le.inverse_transform(ranked.ravel())

    sub = pd.DataFrame({'id': ids, 'country': countries})
    sub.to_csv('./data/' + name, index=False)

In [6]:
# Baseline random forest built with the constructor defaults; keep the
# per-class probabilities predicted for the test frame.
rfc = RandomForestClassifier().fit(feature, target)
rf_pred = rfc.predict_proba(test)

In [51]:
# Write the baseline random forest submission to ./data/rfc_base.csv.
submit_data('rfc_base.csv', rf_pred)

In [15]:
# Baseline extra-trees model built with the constructor defaults; keep the
# per-class probabilities predicted for the test frame.
et = ExtraTreesClassifier().fit(feature, target)
et_pred = et.predict_proba(test)

In [16]:
# Write the baseline extra-trees submission to ./data/et_base.csv.
submit_data('et_base.csv', et_pred)

In [17]:
# Baseline LightGBM model built with the constructor defaults; keep the
# per-class probabilities predicted for the test frame.
lgb = LGBMClassifier().fit(feature, target)
lgb_pred = lgb.predict_proba(test)

In [18]:
# Write the baseline LightGBM submission to ./data/lgb_base.csv.
submit_data('lgb_base.csv', lgb_pred)

In [19]:
# Load the tuned random forest saved earlier and score the test frame.
# NOTE: unpickling can execute arbitrary code -- only load model files
# that were produced by this project.
# The context manager closes the file handle once the model is loaded.
with open('./data/rf_tuned.sav', 'rb') as model_file:
    rf_tuned = pickle.load(model_file)
rf_tuned_pred = rf_tuned.predict_proba(test)

In [20]:
# Display the loaded estimator so its hyper-parameters can be checked.
# NOTE(review): the recorded repr lists min_impurity_split, which newer
# scikit-learn releases no longer accept -- presumably this pickle needs
# the scikit-learn version it was saved with; confirm.
rf_tuned

RandomForestClassifier(max_depth=16, min_impurity_split=10, min_samples_leaf=5,
                       n_estimators=1000)

In [21]:
# Write the tuned random forest submission to ./data/rf_tuned.csv.
submit_data('rf_tuned.csv', rf_tuned_pred)

In [22]:
# Load the tuned extra-trees model saved earlier and score the test frame.
# NOTE: unpickling can execute arbitrary code -- only load model files
# that were produced by this project.
# The context manager closes the file handle once the model is loaded.
with open('./data/et_tuned.sav', 'rb') as model_file:
    et_tuned = pickle.load(model_file)
et_tuned_pred = et_tuned.predict_proba(test)

In [23]:
# Display the loaded estimator so its hyper-parameters can be checked.
et_tuned

ExtraTreesClassifier(max_depth=24, min_samples_leaf=8, min_samples_split=8,
                     n_estimators=1990)

In [24]:
# Write the tuned extra-trees submission to ./data/et_tuned.csv.
submit_data('et_tuned.csv', et_tuned_pred)