In [None]:
"""Hacked on top of theowl1's excellent kernel
https://www.kaggle.com/the1owl/regressing-during-insomnia-0-21496.

Removed the transaction data beyond 20170201 for the training set - 
this was (part of) the reason the validation score (0.07699) was so 
much better than the leaderboard (0.21496).  This also gave the
misleading impression that the expiration date features were really 
good - those won't carry over well to the test set.

"""

from multiprocessing import Pool, cpu_count
import gc; gc.enable()
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn import *
import sklearn

# Base frames: the labelled training rows and the submission template.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/sample_submission_zero.csv')

# Only the `msno` column is read from user_logs; it is reduced to one row per
# user holding the number of log rows for that user.
user_logs = pd.read_csv('../input/user_logs.csv', usecols=['msno'])
user_logs = user_logs['msno'].value_counts().reset_index()
user_logs.columns = ['msno', 'logs_count']
train = train.merge(user_logs, on='msno', how='left')
test = test.merge(user_logs, on='msno', how='left')
# Rebind the name so the large frame is no longer referenced.
user_logs = []
print('user logs merge...')

# Attach the member attributes to both frames, keyed on `msno`.
members = pd.read_csv('../input/members.csv')
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')
members = []
print('members merge...')

In [None]:

transactions = pd.read_csv('../input/transactions.csv')

# Train features only see transactions strictly before the 20170201 cutoff,
# test features strictly before 20170301.
transactions_train = transactions[transactions['transaction_date'] < 20170201.]
transactions_test = transactions[transactions['transaction_date'] < 20170301.]

# Reduce each window to one row per user: (msno, number of transactions).
transactions_train = transactions_train['msno'].value_counts().reset_index()
transactions_train.columns = ['msno', 'trans_count']
transactions_test = transactions_test['msno'].value_counts().reset_index()
transactions_test.columns = ['msno', 'trans_count']

train = train.merge(transactions_train, on='msno', how='left')
test = test.merge(transactions_test, on='msno', how='left')
print('transaction merge...')


In [None]:
# Encode gender numerically; values missing from the mapping become NaN and
# are then turned into 0 by the fillna together with every other missing value.
gender = {'male': 1, 'female': 2}
train = train.assign(gender=train['gender'].map(gender)).fillna(0)
test = test.assign(gender=test['gender'].map(gender)).fillna(0)

In [None]:
transactions = pd.read_csv('../input/transactions.csv')

# Keep a single transaction row per user: order each window from the newest
# transaction_date to the oldest and retain the first row seen for every msno.
# Same cutoffs as for the transaction counts: 20170201 for train, 20170301 for test.
transactions_train = (
    transactions[transactions['transaction_date'] < 20170201.]
    .sort_values(by='transaction_date', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['msno'], keep='first')
)
transactions_test = (
    transactions[transactions['transaction_date'] < 20170301.]
    .sort_values(by='transaction_date', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['msno'], keep='first')
)



In [None]:
# Attach each user's most recent transaction row to train and test.
train = train.merge(transactions_train, on='msno', how='left')
test = test.merge(transactions_test, on='msno', how='left')

# Rebind the transaction frames to empty lists so they are no longer referenced.
transactions = []
transactions_train = []
transactions_test = []

In [None]:
# Remove the date/expiration columns from both frames (in place).
for frame in (train, test):
    for feat in ('transaction_date', 'membership_expire_date', 'expiration_date'):
        frame.drop(feat, axis=1, inplace=True)

In [None]:
def transform_df(df):
    """Return the newest row per ``msno`` from ``df``.

    The input is wrapped in a DataFrame, ordered by ``date`` descending,
    re-indexed from 0, and then reduced to the first row seen for each
    ``msno``. The returned index holds the positions the kept rows had in
    the sorted frame, so it may contain gaps.
    """
    newest_first = pd.DataFrame(df).sort_values(by='date', ascending=False)
    newest_first = newest_first.reset_index(drop=True)
    return newest_first.drop_duplicates(subset='msno', keep='first')

def transform_df2(df):
    """Return the newest row per ``msno`` from the DataFrame ``df``.

    Rows are ordered by ``date`` descending and re-indexed from 0 before the
    first occurrence of each ``msno`` is kept, so the returned index holds
    positions in the sorted frame and may contain gaps.
    """
    ordered = df.sort_values(by='date', ascending=False).reset_index(drop=True)
    return ordered.drop_duplicates(subset='msno', keep='first')

# Stream user_logs.csv in chunks of 10,000,000 rows and keep, per user, only
# the row with the largest `date`.
df_iter = pd.read_csv('../input/user_logs.csv', low_memory=False, iterator=True, chunksize=10000000)
last_user_logs = []

# ~400 Million Records - starting at the end but remove locally if needed:
# chunks with index 0..35 are read and discarded, only later ones are used.
# One worker pool serves every chunk; the context manager guarantees the
# workers are released even if a chunk fails part-way through.
with Pool(cpu_count()) as p:
    for i, df in enumerate(df_iter):
        if i > 35 and len(df) > 0:
            print(df.shape)
            # Reduce each slice of the chunk in parallel, then reduce the
            # combined result once more because the same user can appear in
            # several slices.
            df = p.map(transform_df, np.array_split(df, cpu_count()))
            df = pd.concat(df, axis=0, ignore_index=True).reset_index(drop=True)
            df = transform_df2(df)
            last_user_logs.append(df)
            print('...', df.shape)
            df = []

# A user can also appear in several chunks, so reduce across chunks as well.
last_user_logs = pd.concat(last_user_logs, axis=0, ignore_index=True).reset_index(drop=True)
last_user_logs = transform_df2(last_user_logs)

train = pd.merge(train, last_user_logs, how='left', on='msno')
test = pd.merge(test, last_user_logs, how='left', on='msno')
last_user_logs=[]

In [None]:
# Users without a matching user_logs row have NaN in the merged columns; use 0.
train = train.fillna(0)
test = test.fillna(0)

# Model inputs: every column of `train` except the label and the user id,
# in their existing column order.
excluded = ('is_churn', 'msno')
cols = [name for name in train.columns if name not in excluded]

In [None]:
def xgb_score(preds, dtrain):
    """Evaluation callback handed to ``xgb.train`` through ``feval``.

    Reads the labels from ``dtrain`` and returns the pair
    ``('log_loss', value)`` where ``value`` is ``metrics.log_loss`` of the
    labels against ``preds``.
    """
    y_true = dtrain.get_label()
    loss = metrics.log_loss(y_true, preds)
    return 'log_loss', loss

# Train `fold` models on different random 70/30 splits, average their test
# predictions and write the submission file.
fold = 1
# The test matrix does not depend on the fold, so build it once.
dtest = xgb.DMatrix(test[cols])
pred = None
for i in range(fold):
    params = {
        'eta': 0.02, #use 0.002
        'max_depth': 7,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': i,
        'silent': True
    }
    x1, x2, y1, y2 = model_selection.train_test_split(train[cols], train['is_churn'], test_size=0.3, random_state=i)
    # Build each matrix once and share the training matrix between the
    # watchlist and the trainer instead of constructing it twice.
    dtrain = xgb.DMatrix(x1, y1)
    dvalid = xgb.DMatrix(x2, y2)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(params, dtrain, 150,  watchlist, feval=xgb_score, maximize=False, verbose_eval=50, early_stopping_rounds=50) #use 1500
    fold_pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)
    # Running sum of the per-fold predictions.
    pred = fold_pred if pred is None else pred + fold_pred
pred /= fold
# Keep probabilities away from exactly 0 and 1 (the two bounds are not symmetric).
test['is_churn'] = pred.clip(0.0000001, 0.999999)
test[['msno','is_churn']].to_csv('submission3.csv.gz', index=False, compression='gzip')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Use a square 7x7 figure and plot the feature importances of `model`
# (the booster from the last training fold).
plt.rcParams.update({'figure.figsize': (7.0, 7.0)})
xgb.plot_importance(model)
plt.show()