In [1]:
from multiprocessing import Pool, cpu_count
import gc; gc.enable()
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn import *
import sklearn



In [2]:
# Read the labelled training rows and the submission template, then left-join
# per-`msno` features from the log file and the members file onto both frames.
train = pd.read_csv('../../data/raw/train.csv')
test = pd.read_csv('../../data/raw/sample_submission_zero.csv')

# Log feature: how many rows each `msno` has within the first 10M lines of user_logs.csv.
user_logs = pd.read_csv('../../data/raw/user_logs.csv', usecols=['msno'], nrows=10000000)
user_logs = pd.DataFrame(user_logs['msno'].value_counts().reset_index())
user_logs.columns = ['msno', 'logs_count']
train = train.merge(user_logs, on='msno', how='left')
test = test.merge(user_logs, on='msno', how='left')
user_logs = []  # rebind the name so the frame is no longer referenced
print('user logs merge...')

# Member attributes: every column of members.csv, joined on `msno`.
members = pd.read_csv('../../data/raw/members.csv')
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')
members = []  # rebind the name so the frame is no longer referenced
print('members merge...')

user logs merge...
members merge...


In [3]:
# Transactions (first 10M rows of the file). Each split only uses rows whose
# `transaction_date` is below its own cutoff: 20170201 for train, 20170301 for
# test (the column holds YYYYMMDD-style numbers).
transactions = pd.read_csv('../../data/raw/transactions.csv', nrows=10000000)
transactions_train = transactions.loc[transactions['transaction_date'] < 20170201.]
transactions_test = transactions.loc[transactions['transaction_date'] < 20170301.]

# Number of transactions per `msno` inside the training window.
transactions_train_c = pd.DataFrame(transactions_train['msno'].value_counts().reset_index())
transactions_train_c.columns = ['msno', 'trans_count']
train = train.merge(transactions_train_c, on='msno', how='left')

# Same count for the (longer) test window.
transactions_test_c = pd.DataFrame(transactions_test['msno'].value_counts().reset_index())
transactions_test_c.columns = ['msno', 'trans_count']
test = test.merge(transactions_test_c, on='msno', how='left')

print('transaction merge...')

transaction merge...


In [4]:
# Encode gender numerically (male -> 1, female -> 2). Any value that is not a
# key of the mapping becomes NaN and is then set to 0 by the fill below,
# together with every other missing value in the two frames.
gender = {'male': 1, 'female': 2}
for frame in (train, test):
    frame['gender'] = frame['gender'].map(gender)

train = train.fillna(0)
test = test.fillna(0)

In [5]:
# Keep one transaction per `msno` -- the one with the highest `transaction_date`
# in each window -- and attach its columns to the train / test frames.
transactions_train = (
    transactions_train
    .sort_values(by=['transaction_date'], ascending=[False])
    .reset_index(drop=True)
    .drop_duplicates(subset=['msno'], keep='first')
)
transactions_test = (
    transactions_test
    .sort_values(by=['transaction_date'], ascending=[False])
    .reset_index(drop=True)
    .drop_duplicates(subset=['msno'], keep='first')
)

train = train.merge(transactions_train, on='msno', how='left')
test = test.merge(transactions_test, on='msno', how='left')

# Rebind the names so the large transaction frames are no longer referenced.
transactions = []
transactions_train = []
transactions_test = []

In [6]:
# Preview the first three rows of the training frame after all merges.
train.head(3)

Unnamed: 0,msno,is_churn,logs_count,city,bd,gender,registered_via,registration_init_time,expiration_date,trans_count,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,2.0,18.0,36.0,2.0,9.0,20050406.0,20170907.0,2.0,38.0,30.0,149.0,149.0,0.0,20170107.0,20170206.0,0.0
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,20.0,10.0,38.0,1.0,9.0,20050407.0,20170321.0,10.0,39.0,30.0,149.0,149.0,1.0,20161130.0,20170121.0,0.0
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,5.0,11.0,27.0,2.0,9.0,20051016.0,20170203.0,5.0,39.0,30.0,149.0,149.0,1.0,20170112.0,20170203.0,1.0


In [7]:
# Remove the transaction / expiration date columns from both frames so they
# are not part of the feature set built later.
date_feats = ['transaction_date', 'membership_expire_date', 'expiration_date']
train.drop(date_feats, axis=1, inplace=True)
test.drop(date_feats, axis=1, inplace=True)

In [8]:
# Preview the first three rows again to confirm the date columns are gone.
train.head(3)

Unnamed: 0,msno,is_churn,logs_count,city,bd,gender,registered_via,registration_init_time,trans_count,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,2.0,18.0,36.0,2.0,9.0,20050406.0,2.0,38.0,30.0,149.0,149.0,0.0,0.0
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,20.0,10.0,38.0,1.0,9.0,20050407.0,10.0,39.0,30.0,149.0,149.0,1.0,0.0
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,5.0,11.0,27.0,2.0,9.0,20051016.0,5.0,39.0,30.0,149.0,149.0,1.0,1.0


## XGBOOST

In [11]:
# Set any value that is still missing (e.g. rows with no match in a left join) to 0.
train = train.fillna(0)
test = test.fillna(0)

# Model inputs: every training column except the target and the `msno` key.
non_features = ['is_churn', 'msno']
cols = [name for name in train.columns if name not in non_features]

In [12]:
# Show column dtypes, non-null counts and memory usage of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 992931 entries, 0 to 992930
Data columns (total 15 columns):
msno                      992931 non-null object
is_churn                  992931 non-null int64
logs_count                992931 non-null float64
city                      992931 non-null float64
bd                        992931 non-null float64
gender                    992931 non-null float64
registered_via            992931 non-null float64
registration_init_time    992931 non-null float64
trans_count               992931 non-null float64
payment_method_id         992931 non-null float64
payment_plan_days         992931 non-null float64
plan_list_price           992931 non-null float64
actual_amount_paid        992931 non-null float64
is_auto_renew             992931 non-null float64
is_cancel                 992931 non-null float64
dtypes: float64(13), int64(1), object(1)
memory usage: 121.2+ MB


In [9]:
def xgb_score(preds, dtrain):
    """Evaluation callback for ``xgb.train`` that reports log loss.

    Args:
        preds: predictions for the rows held by ``dtrain``.
        dtrain: data container whose ``get_label()`` returns the true labels.

    Returns:
        A ``(name, value)`` pair: the string ``'log_loss'`` and the result of
        ``metrics.log_loss(labels, preds)``.
    """
    score = metrics.log_loss(dtrain.get_label(), preds)
    return 'log_loss', score

In [13]:
# Train `fold` XGBoost models, each on a different random 70/30 split of the
# training frame, and average their predictions for the test frame.
fold = 1

# The test matrix does not depend on the fold, so build it once up front.
dtest = xgb.DMatrix(test[cols])

fold_preds = []
for i in range(fold):
    params = {
        'eta': 0.02,  # use 0.002
        'max_depth': 7,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': i,
        'silent': True
    }
    # The fold index doubles as the seed for both the split and the booster.
    x1, x2, y1, y2 = model_selection.train_test_split(
        train[cols], train['is_churn'], test_size=0.3, random_state=i)

    # Build each matrix exactly once; the training matrix is shared between
    # the watchlist and the `xgb.train` call instead of being built twice.
    dtrain = xgb.DMatrix(x1, y1)
    dvalid = xgb.DMatrix(x2, y2)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # Early stopping watches the last watchlist entry ('valid'), as the log
    # output of this cell reports.
    model = xgb.train(params, dtrain, 150, watchlist, feval=xgb_score, maximize=False,
                      verbose_eval=50, early_stopping_rounds=50)  # use 1500

    fold_preds.append(model.predict(dtest, ntree_limit=model.best_ntree_limit))

# Average over folds; summing in fold order matches a running `+=` accumulator.
pred = sum(fold_preds) / fold

# Keep predictions strictly inside (0, 1) before writing them to the frame.
# NOTE(review): the bounds are asymmetric (1e-7 vs 1 - 1e-6) -- confirm intended.
test['is_churn'] = pred.clip(0.0000001, 0.999999)

[0]	train-log_loss:0.67718	valid-log_loss:0.677205
Multiple eval metrics have been passed: 'valid-log_loss' will be used for early stopping.

Will train until valid-log_loss hasn't improved in 50 rounds.
[50]	train-log_loss:0.29988	valid-log_loss:0.300796
[100]	train-log_loss:0.207834	valid-log_loss:0.209418


In [15]:
# Write the `msno` key and the predicted `is_churn` value to the submission file.
submission = test[['msno', 'is_churn']]
submission.to_csv('submission3.csv', index=False)