In [1]:
import os
import time
from datetime import datetime

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

from preprocessing import preproc
from tools import log_loss_lgbm

# Directory holding training.csv and testing.csv. The original location stays
# the default; set the CHURN_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
path_to_data = os.environ.get("CHURN_DATA_DIR", "/home/raph/Downloads/")

print("loading data")
# os.path.join works whether or not the directory ends with a separator.
training = pd.read_csv(os.path.join(path_to_data, "training.csv"))
testing = pd.read_csv(os.path.join(path_to_data, "testing.csv"))

loading data


In [2]:
print("changing dates to time stamps")

# Date columns stored as "%Y-%m-%d" strings, converted in this order.
_DATE_COLUMNS = ["membership_expire_date", "transaction_date", "registration_init_time"]


def _date_to_timestamp(value):
    """Convert one "%Y-%m-%d" date value to a POSIX timestamp.

    Null values (as judged by ``pd.isnull``) map to 0.0. Any other value is
    stringified, parsed as a calendar date, and converted with ``time.mktime``,
    which interprets the date at midnight in the machine's local time zone.

    Raises:
        ValueError: if a non-null value does not match "%Y-%m-%d".
    """
    if pd.isnull(value):
        return 0.0
    parsed = datetime.strptime(str(value), "%Y-%m-%d").date()
    return time.mktime(parsed.timetuple())


# Same conversion for every date column of both frames (training first, then
# testing), replacing six copy-pasted pairs of lambdas.
for _frame in (training, testing):
    for _column in _DATE_COLUMNS:
        _frame[_column] = _frame[_column].apply(_date_to_timestamp)

changing dates to time stamps


In [15]:
# Remove the three date columns from both frames before preprocessing.
# NOTE(review): these are the columns converted to timestamps in the earlier
# cell, so that conversion does not appear to reach the model in this run —
# confirm whether dropping them is intended.
dropped_date_columns = ["registration_init_time", "transaction_date", "membership_expire_date"]
testing = testing.drop(dropped_date_columns, axis=1)
training = training.drop(dropped_date_columns, axis=1)

In [16]:
# Preview the first rows of the training frame after the date columns were dropped.
training.head()

Unnamed: 0,msno,actual_amount_paid,bd,city,gender,is_auto_renew,is_cancel,is_churn,payment_method_id,payment_plan_days,plan_list_price,registered_via,total_number_of_transactions,usual_price_per_day,price_per_day,price_per_day_diff
0,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,99.0,-1,1,-1,1.0,0.0,0,41.0,30.0,99.0,7,3,2.990033,3.2989,0.308867
1,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,149.0,26,15,0,1.0,0.0,0,39.0,30.0,149.0,9,19,788.37454,4.965012,-783.409528
2,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,149.0,-1,1,-1,1.0,0.0,0,41.0,30.0,149.0,7,26,577.585544,4.965012,-572.620532
3,++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,149.0,21,18,0,1.0,0.0,0,41.0,30.0,149.0,7,12,4.746045,4.965012,0.218967
4,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,149.0,32,22,0,1.0,0.0,0,39.0,30.0,149.0,9,19,788.37454,4.965012,-783.409528


In [17]:
print("preprocessing")
# preproc is a project helper; its two return values are used below as the
# feature matrix and the label vector. y_test is not used by the cells shown.
X_train, y_train = preproc(training, mode='train', oneHot=False)
X_test, y_test = preproc(testing, mode="test", oneHot=False)

# LightGBM settings: gradient-boosted trees on a binary objective, evaluated
# with binary log loss.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'binary_logloss'},
    num_leaves=31,
    learning_rate=0.1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

# i is a fold counter incremented by the training loop; K is the number of folds.
i = 0
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# Per-sample training weights, chosen by label. Both classes currently get
# weight 1; rows whose label is neither 0 nor 1 keep weight 0.
weights = np.zeros(len(y_train))
for label, label_weight in ((0, 1), (1, 1)):
    weights[y_train == label] = label_weight

preprocessing


In [19]:
from tools import log_loss  # NOTE(review): belongs with the imports in the first cell

# One array of test-set predictions per fold; reset here so re-running the
# cell does not accumulate predictions from a previous run.
results = []
print('Start training...')
for train_index, test_index in kf.split(X_train):
    # Fit on the training part of the fold and monitor the held-out part.
    lgb_train = lgb.Dataset(X_train[train_index], y_train[train_index], weight=weights[train_index])
    lgb_eval = lgb.Dataset(X_train[test_index], y_train[test_index], reference=lgb_train)
    gbm = lgb.train(params,
        train_set=lgb_train,
        num_boost_round=200,
        valid_sets=lgb_eval,
        early_stopping_rounds=30,
        verbose_eval=5,
        feval=log_loss_lgbm)

    # Early stopping is enabled, so predict with the best iteration explicitly:
    # LightGBM releases whose predict() defaults to "all iterations" would
    # otherwise score with the rounds trained after the validation optimum.
    # A non-positive best_iteration (early stopping never triggered) still
    # means "use every iteration".
    best_iteration = gbm.best_iteration
    res = gbm.predict(X_test, num_iteration=best_iteration)
    i += 1
    results.append(res)

    # Log loss of the same model on both parts of the fold.
    print("my log loss train")
    print(log_loss(y_train[train_index], gbm.predict(X_train[train_index], num_iteration=best_iteration)))
    print("my log loss test")
    print(log_loss(y_train[test_index], gbm.predict(X_train[test_index], num_iteration=best_iteration)))

Start training...
Training until validation scores don't improve for 30 rounds.
[5]	valid_0's binary_logloss: 0.422479	valid_0's log loss: 0.422479
[10]	valid_0's binary_logloss: 0.300725	valid_0's log loss: 0.300725
[15]	valid_0's binary_logloss: 0.240247	valid_0's log loss: 0.240247
[20]	valid_0's binary_logloss: 0.208838	valid_0's log loss: 0.208838
[25]	valid_0's binary_logloss: 0.191959	valid_0's log loss: 0.191959
[30]	valid_0's binary_logloss: 0.183268	valid_0's log loss: 0.183268
[35]	valid_0's binary_logloss: 0.178246	valid_0's log loss: 0.178246
[40]	valid_0's binary_logloss: 0.175447	valid_0's log loss: 0.175447
[45]	valid_0's binary_logloss: 0.173783	valid_0's log loss: 0.173783
[50]	valid_0's binary_logloss: 0.172726	valid_0's log loss: 0.172726
[55]	valid_0's binary_logloss: 0.172049	valid_0's log loss: 0.172049
[60]	valid_0's binary_logloss: 0.171591	valid_0's log loss: 0.171591
[65]	valid_0's binary_logloss: 0.171245	valid_0's log loss: 0.171245
[70]	valid_0's binary_lo

[175]	valid_0's binary_logloss: 0.170999	valid_0's log loss: 0.170999
[180]	valid_0's binary_logloss: 0.170981	valid_0's log loss: 0.170981
[185]	valid_0's binary_logloss: 0.170952	valid_0's log loss: 0.170952
[190]	valid_0's binary_logloss: 0.170932	valid_0's log loss: 0.170932
[195]	valid_0's binary_logloss: 0.170924	valid_0's log loss: 0.170924
[200]	valid_0's binary_logloss: 0.170921	valid_0's log loss: 0.170921
my log loss train
0.166658897017
my log loss test
0.170921095865
Training until validation scores don't improve for 30 rounds.
[5]	valid_0's binary_logloss: 0.422005	valid_0's log loss: 0.422005
[10]	valid_0's binary_logloss: 0.300064	valid_0's log loss: 0.300064
[15]	valid_0's binary_logloss: 0.239342	valid_0's log loss: 0.239342
[20]	valid_0's binary_logloss: 0.207811	valid_0's log loss: 0.207811
[25]	valid_0's binary_logloss: 0.190767	valid_0's log loss: 0.190767
[30]	valid_0's binary_logloss: 0.181917	valid_0's log loss: 0.181917
[35]	valid_0's binary_logloss: 0.176796	

In [20]:
# Average the per-fold test predictions element-wise. Using every entry of
# `results` (instead of hard-coded indices 0..4 and a divisor of 5) keeps this
# correct when the number of folds changes; for five folds the arithmetic is
# the same left-to-right sum divided by 5.
submission = pd.DataFrame(sum(results) / len(results))

In [21]:
# Name the prediction column. A frame built from a 1-D array has the single
# column label 0; renaming that label (rather than assigning `.columns`) keeps
# this cell re-runnable after further columns have been added to `submission`.
submission = submission.rename(columns={0: "is_churn"})
assert "is_churn" in submission.columns, "prediction column was not found under label 0"

In [22]:
# Summary statistics of the averaged predictions, as a sanity check on their range.
submission.describe()

Unnamed: 0,is_churn
count,907471.0
mean,0.077824
std,0.172544
min,0.001102
25%,0.011251
50%,0.0271
75%,0.04717
max,0.998154


In [23]:
# Attach the member ids positionally. Assigning the Series itself would align
# on index labels and silently produce NaN or shifted ids if `testing` does not
# carry the default 0..n-1 index; `.values` raises instead when the lengths differ.
# NOTE(review): assumes the predictions are in the same row order as `testing`
# — confirm that preproc neither reorders nor drops rows.
submission["msno"] = testing["msno"].values

In [24]:
# Summary after adding msno; describe() still reports only the numeric is_churn column.
submission.describe()

Unnamed: 0,is_churn
count,907471.0
mean,0.077824
std,0.172544
min,0.001102
25%,0.011251
50%,0.0271
75%,0.04717
max,0.998154


In [25]:
# Preview the first rows of the submission (is_churn alongside msno).
submission.head()

Unnamed: 0,is_churn,msno
0,0.003117,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=
1,0.044958,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=
2,0.034091,++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=
3,0.028496,++0/NopttBsaAn6qHZA2AWWrDg7Me7UOMs1vsyo4tSI=
4,0.204801,++0BJXY8tpirgIhJR14LDM1pnaRosjD1mdO1mIKxlJA=


In [26]:
# Write the submission to the working directory with a header row and without
# the index column.
submission_path = '5Kfold_lgbm_drop_dates.csv'
submission.to_csv(submission_path, index=False, header=True)
print("created submission file")

created submission file


In [None]:
# NOTE(review): this cell has no execution count and repeats the summary shown earlier — likely a leftover.
submission.describe()