In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.cross_validation import train_test_split

#### Importing Data

In [2]:
train = pd.read_csv('train.csv')
test =pd.read_csv('test.csv')

#### Visualizing data

In [3]:
train.head()

Unnamed: 0,transaction_id,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,cat_var_1,cat_var_2,...,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42,target
0,id_11,2.302632e-08,0.040182,0.0,1.8e-07,2.302632e-08,2.368421e-08,1.115205e-08,,ce,...,0,0,0,0,0,0,0,0,0,0
1,id_33,7.965789e-06,0.157872,0.0,2.105e-06,2.769737e-07,7.965789e-06,2.433058e-06,da,tn,...,0,0,0,0,0,0,0,0,0,0
2,id_51,7.828947e-08,0.08914,0.0,3.55e-07,4.671053e-08,1.052632e-07,4.276014e-07,gf,ce,...,0,0,0,0,0,0,0,0,0,0
3,id_54,7.894737e-08,0.227239,0.0,1.05e-06,1.381579e-07,2.190789e-07,1.848054e-08,,ce,...,0,0,0,0,0,0,0,0,0,0
4,id_62,3.321053e-06,0.16041,0.0,2.105e-06,2.769737e-07,3.340789e-06,2.152983e-06,da,tn,...,0,0,0,0,0,0,0,0,0,0


In [4]:
train.columns

Index([u'transaction_id', u'num_var_1', u'num_var_2', u'num_var_3',
       u'num_var_4', u'num_var_5', u'num_var_6', u'num_var_7', u'cat_var_1',
       u'cat_var_2', u'cat_var_3', u'cat_var_4', u'cat_var_5', u'cat_var_6',
       u'cat_var_7', u'cat_var_8', u'cat_var_9', u'cat_var_10', u'cat_var_11',
       u'cat_var_12', u'cat_var_13', u'cat_var_14', u'cat_var_15',
       u'cat_var_16', u'cat_var_17', u'cat_var_18', u'cat_var_19',
       u'cat_var_20', u'cat_var_21', u'cat_var_22', u'cat_var_23',
       u'cat_var_24', u'cat_var_25', u'cat_var_26', u'cat_var_27',
       u'cat_var_28', u'cat_var_29', u'cat_var_30', u'cat_var_31',
       u'cat_var_32', u'cat_var_33', u'cat_var_34', u'cat_var_35',
       u'cat_var_36', u'cat_var_37', u'cat_var_38', u'cat_var_39',
       u'cat_var_40', u'cat_var_41', u'cat_var_42', u'target'],
      dtype='object')

In [5]:
print 'Shape of the Train data:', train.shape
print 'Shape of the Testing Data', test.shape

Shape of the Train data: (348978, 51)
Shape of the Testing Data (523466, 50)


In [6]:
train.dtypes

transaction_id     object
num_var_1         float64
num_var_2         float64
num_var_3         float64
num_var_4         float64
num_var_5         float64
num_var_6         float64
num_var_7         float64
cat_var_1          object
cat_var_2          object
cat_var_3          object
cat_var_4          object
cat_var_5          object
cat_var_6          object
cat_var_7          object
cat_var_8          object
cat_var_9          object
cat_var_10         object
cat_var_11         object
cat_var_12         object
cat_var_13         object
cat_var_14         object
cat_var_15         object
cat_var_16         object
cat_var_17         object
cat_var_18         object
cat_var_19          int64
cat_var_20          int64
cat_var_21          int64
cat_var_22          int64
cat_var_23          int64
cat_var_24          int64
cat_var_25          int64
cat_var_26          int64
cat_var_27          int64
cat_var_28          int64
cat_var_29          int64
cat_var_30          int64
cat_var_31  

#### Preprocess the data

In [7]:
train.fillna(-999, inplace=True)
test.fillna(-999, inplace=True)

In [8]:
cols = list(train.columns)
cols.remove('target')
cols.remove('transaction_id')

In [9]:
# One-hot encoding the data

from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook

for col in tqdm_notebook(cols):
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

A Jupyter Widget




#### Creating Train and Test sets

In [50]:
X = train.drop(['target', 'transaction_id'], axis=1)
y = train.target.values

In [51]:
X_test = test.drop('transaction_id', axis=1)
ids = test.transaction_id

In [52]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.01, random_state=47)

In [53]:
d_train = lgb.Dataset(data=X_train, label=y_train)
d_valid = lgb.Dataset(data=X_test, label=y_valid)

In [54]:
print X_train.shape, y_train.shape
print X_valid.shape, y_valid.shape

(345488, 49) (345488,)
(3490, 49) (3490,)


In [55]:
watchlist = [d_train, d_valid]

#### Training model

In [58]:
params = {
    'learning_rate': 0.0001,
    'application': 'binary',
    'max_depth': 8,
    'num_leaves': 2**8,
    'verbosity':0,
    'metric': 'auc',
    'boosting': 'gbdt',
    'max_bin':256,
    'bagging_fraction': 0.90,
    'bagging_freq': 1,
    'bagging_seed':1,
    'feature_fraction':0.9,
    'feature_fraction_seed':1
}

In [59]:
model = lgb.train(params, train_set=d_train, num_boost_round=5000, valid_sets=watchlist, early_stopping_rounds=50, verbose_eval=10)

Training until validation scores don't improve for 50 rounds.
[10]	training's auc: 0.729435	valid_1's auc: 1
[20]	training's auc: 0.72945	valid_1's auc: 1
[30]	training's auc: 0.729519	valid_1's auc: 1
[40]	training's auc: 0.729788	valid_1's auc: 1
[50]	training's auc: 0.729826	valid_1's auc: 1
Early stopping, best iteration is:
[1]	training's auc: 0.721713	valid_1's auc: 1


#### Predict Labels

In [60]:
p_test = model.predict(X_test)

#### Creating submission file

In [61]:
subm = pd.DataFrame()
subm['transaction_id'] = ids
subm['target'] = p_test

#### Saving file

In [62]:
subm.to_csv('submit.csv', index=False)

In [63]:
pd.read_csv('submit.csv')

Unnamed: 0,transaction_id,target
0,id_1,0.499956
1,id_6,0.499956
2,id_9,0.499958
3,id_14,0.499956
4,id_15,0.499956
5,id_19,0.499957
6,id_20,0.499956
7,id_24,0.499958
8,id_25,0.499960
9,id_28,0.499956
