In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

In [None]:
# members.csv feeds both frames, so parse it once instead of once per merge.
members = pd.read_csv("../input/members.csv")

# Default (inner) join: labelled users without a members record are dropped.
train = pd.read_csv("../input/train.csv").merge(members, on="msno")

# Left join: every row of the submission template is kept, with NaN member
# attributes where no members record exists.
test = pd.read_csv("../input/sample_submission_zero.csv").merge(
    members,
    on="msno",
    how="left"
)

# The merged frames hold everything needed from here on; release the raw table.
del members

# Data

Let's explore the data, including summary statistics and correlations.

In [None]:
# Peek at the first five merged rows.
train.head(5)

In [None]:
# include="all" summarises every column, not only the numeric ones.
train.describe(include="all")

In [None]:
# The frame also holds string columns (msno, gender). Recent pandas versions
# raise when asked to correlate those, so restrict to numeric columns first.
train.select_dtypes(include="number").corr()

So there are no features with a strong correlation with `is_churn`. Let's continue anyway.

# Crossvalidation

Let's define our own cross-validation function. I'll use a custom one because `DummyRegressor` has no `predict_proba` method and because the predictions need some adjustment before they are passed to `log_loss`.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

def _positional_array(data):
    """Return ``data`` in a form where ``data[int_array]`` selects rows by position."""
    if isinstance(data, (pd.DataFrame, pd.Series)):
        # Indexing a pandas object with an integer array is label-based (or a
        # column lookup for a DataFrame); the underlying array is positional.
        return data.values
    if isinstance(data, (list, tuple)):
        return np.asarray(data)
    return data


def cv(model, X, y, predictor, random_state=None):
    """Cross-validate ``model`` on shuffled stratified folds and return the log losses.

    Parameters
    ----------
    model : estimator exposing ``get_params``, ``set_params`` and ``fit``.
        Its parameters are reset to their initial values before each fold.
        The object itself is reused, so it is left fitted on the last fold.
    X : feature matrix, indexable by an integer array along the first axis.
        pandas objects, lists and tuples are converted to arrays first.
    y : labels aligned with ``X``; converted the same way as ``X``.
    predictor : callable ``(fitted_model, X_test) -> predictions``.
        The predictions are passed to ``log_loss`` after clipping.
    random_state : forwarded to ``StratifiedKFold``.

    Returns
    -------
    numpy.ndarray
        One log-loss value per fold. The number of folds is the
        ``StratifiedKFold`` default.
    """
    X = _positional_array(X)
    y = _positional_array(y)

    kfold = StratifiedKFold(shuffle=True,
                            random_state=random_state)
    initial_params = model.get_params()
    eps = 10**(-15)
    losses = []
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        print("Fold {0}".format(i + 1))
        model.set_params(**initial_params)
        model.fit(X[train_index], y[train_index])
        # Clip on a float copy. This keeps the predictor's own output untouched
        # and avoids the bounds being truncated to 0 on integer predictions.
        p_predicted = np.clip(
            np.asarray(predictor(model, X[test_index]), dtype=float),
            eps,
            1 - eps,
        )
        losses.append(log_loss(y[test_index], p_predicted))
    return np.array(losses)

# Dummy model

In [None]:
from sklearn.dummy import DummyRegressor

SEED = 42

# Baseline: always predict the overall churn rate.
# `strategy` must be passed by keyword because current scikit-learn releases
# reject positional constructor arguments for DummyRegressor.
# The feature matrix is a dummy column of zeros, since the constant strategy
# ignores the features.
cv(DummyRegressor(strategy="constant", constant=train["is_churn"].mean()),
   np.zeros([len(train), 1]),
   train["is_churn"],
   random_state=SEED,
   predictor=lambda estimator, X: estimator.predict(X))

# Features

Let's try to build an XGBClassifier based on our features.

## registered_via

I'll start with the feature that has the largest absolute correlation value.

In [None]:
# Single-feature model: registered_via only.
cv(
    model=xgb.XGBClassifier(),
    X=train[["registered_via"]].values,
    y=train["is_churn"],
    predictor=lambda fitted, rows: fitted.predict_proba(rows)[:, 1],
    random_state=SEED,
)

## city
Let's add the `city` feature.

In [None]:
# registered_via plus the raw city code as a plain numeric feature.
cv(
    model=xgb.XGBClassifier(),
    X=train[["registered_via", "city"]].values,
    y=train["is_churn"],
    predictor=lambda fitted, rows: fitted.predict_proba(rows)[:, 1],
    random_state=SEED,
)

It seems like a good idea to turn `city` into a set of indicator features city0 - cityN. Let's check it:

In [None]:
# One indicator column per city code seen in the training data.
city_values = sorted(set(train["city"]))
city_features = ["city{0}".format(city) for city in city_values]

for city, column in zip(city_values, city_features):
    # Store the indicators as small integers rather than booleans.
    # A frame mixing bool and integer columns converts to an object-dtype
    # array, which is slow and memory-hungry at this size.
    train[column] = (train["city"] == city).astype(np.uint8)
    # Rows with a missing city compare unequal to every code, so they get 0.
    test[column] = (test["city"] == city).astype(np.uint8)

cv(xgb.XGBClassifier(),
   np.array(train[["registered_via"] + city_features]),
   train["is_churn"],
   random_state=SEED,
   predictor=lambda estimator, X: estimator.predict_proba(X)[:, 1])

## bd

`bd` is the age feature. As written in the data description, it has some outliers, so let's look at the histogram.

In [None]:
# Raw distribution of bd, labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(train["bd"])
ax.set(title="Distribution of bd", xlabel="bd", ylabel="Count");

In [None]:
# Same feature on a log scale of its absolute value.
# The 0.001 offset keeps log() finite for bd == 0.
fig, ax = plt.subplots()
ax.hist(np.log(np.abs(train["bd"]) + 0.001))
ax.set(title="Distribution of log(|bd| + 0.001)",
       xlabel="log(|bd| + 0.001)",
       ylabel="Count");

In [None]:
# (min, max, mean, std) of bd, in that order.
tuple(getattr(train["bd"], stat)() for stat in ("min", "max", "mean", "std"))

In [None]:
# Add the raw bd column to registered_via and the city indicators.
cv(
    model=xgb.XGBClassifier(),
    X=train[["registered_via", "bd"] + city_features].values,
    y=train["is_churn"],
    predictor=lambda fitted, rows: fitted.predict_proba(rows)[:, 1],
    random_state=SEED,
)

I didn't find a way of replacing the outliers that gives a better CV score, so let's continue.

## gender

Here we have a categorical feature (male/female). Let's convert it to a binary feature, keeping in mind that it has missing values.

In [None]:
def gender(val):
    """Encode a gender label as a number.

    Returns 1 for "male", -1 for "female" and NaN for anything else
    (including missing values).
    """
    if val == "male":
        return 1
    if val == "female":
        return -1
    return float("NaN")
    
# Numeric gender column for both frames, encoded element-wise by gender().
train["gender_converted"] = train["gender"].map(gender)
test["gender_converted"] = test["gender"].map(gender)

In [None]:
# Add the numeric gender encoding to the feature set.
cv(
    model=xgb.XGBClassifier(),
    X=train[["registered_via", "bd", "gender_converted"] + city_features].values,
    y=train["is_churn"],
    predictor=lambda fitted, rows: fitted.predict_proba(rows)[:, 1],
    random_state=SEED,
)

## registration_init_time

Let's build set of features:
- parse date
- convert date to unix timestamp (to make numerical feature)
- add year/month/day features (e.g. to find season changes)

### unix time

In [None]:
def datetime_to_unix(dt):
    """Return the number of seconds elapsed between 1970-01-01 00:00:00 and ``dt``."""
    return (dt - pd.to_datetime('1970-01-01')).total_seconds()


# Parse the YYYYMMDD registration date into a datetime column.
train["registration_init_time_date"] = pd.to_datetime(train["registration_init_time"], format="%Y%m%d")
test["registration_init_time_date"] = pd.to_datetime(test["registration_init_time"], format="%Y%m%d")

# Seconds since 1970-01-01, computed on the whole column at once instead of
# calling a Python function per row. Missing dates come out as NaN.
train["registration_init_time_unix"] = (
    train["registration_init_time_date"] - pd.Timestamp("1970-01-01")
).dt.total_seconds()
test["registration_init_time_unix"] = (
    test["registration_init_time_date"] - pd.Timestamp("1970-01-01")
).dt.total_seconds()

### year/month/day

In [None]:
# Add the registration timestamp (seconds since 1970-01-01).
cv(
    model=xgb.XGBClassifier(),
    X=train[
        ["registered_via", "bd", "gender_converted", "registration_init_time_unix"]
        + city_features
    ].values,
    y=train["is_churn"],
    predictor=lambda fitted, rows: fitted.predict_proba(rows)[:, 1],
    random_state=SEED,
)

In [None]:
# Calendar components of the registration date, taken through the vectorised
# .dt accessor rather than a per-row lambda. Missing dates yield NaN.
train["registration_init_time_year"] = train["registration_init_time_date"].dt.year
test["registration_init_time_year"] = test["registration_init_time_date"].dt.year
train["registration_init_time_month"] = train["registration_init_time_date"].dt.month
test["registration_init_time_month"] = test["registration_init_time_date"].dt.month
train["registration_init_time_day"] = train["registration_init_time_date"].dt.day
test["registration_init_time_day"] = test["registration_init_time_date"].dt.day

Let's train the model on all the data and make predictions for the test records.

In [None]:
# Add the year / month / day components of the registration date.
cv(
    model=xgb.XGBClassifier(),
    X=train[
        [
            "registered_via",
            "bd",
            "gender_converted",
            "registration_init_time_unix",
            "registration_init_time_year",
            "registration_init_time_month",
            "registration_init_time_day",
        ]
        + city_features
    ].values,
    y=train["is_churn"],
    predictor=lambda fitted, rows: fitted.predict_proba(rows)[:, 1],
    random_state=SEED,
)

In [None]:
from collections import OrderedDict

# One column list shared by fit and predict, so the training and test matrices
# always have the same columns in the same order.
submission_features = [
    "registered_via",
    "bd",
    "gender_converted",
    "registration_init_time_unix",
    "registration_init_time_year",
    "registration_init_time_month",
    "registration_init_time_day",
] + city_features

# Fit on the full training frame this time, with no held-out folds.
clf = xgb.XGBClassifier()
clf.fit(train[submission_features].values, train["is_churn"].values)

# Column 1 of predict_proba is used as the is_churn score.
prediction = clf.predict_proba(test[submission_features].values)[:, 1]

prediction_df = pd.DataFrame(OrderedDict([
    ("msno", test["msno"]),
    ("is_churn", prediction),
]))
prediction_df.head()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
prediction_df.to_csv("prediction.csv", index=False)

## expiration_date

Let's try to build unix time feature from expiration_date

In [None]:
# Replace the YYYYMMDD expiration_date column with parsed datetimes in both
# frames. The column is overwritten in place.
for frame in (train, test):
    frame["expiration_date"] = pd.to_datetime(frame["expiration_date"], format="%Y%m%d")

In [None]:
# Seconds since 1970-01-01 for the (already parsed) expiration date, computed
# on the whole column at once instead of per row. Missing dates become NaN.
train["expiration_date_unix"] = (
    train["expiration_date"] - pd.Timestamp("1970-01-01")
).dt.total_seconds()
test["expiration_date_unix"] = (
    test["expiration_date"] - pd.Timestamp("1970-01-01")
).dt.total_seconds()

In [None]:
# Add the expiration timestamp on top of all previous features.
cv(
    model=xgb.XGBClassifier(),
    X=train[
        [
            "registered_via",
            "bd",
            "gender_converted",
            "registration_init_time_unix",
            "registration_init_time_year",
            "registration_init_time_month",
            "registration_init_time_day",
            "expiration_date_unix",
        ]
        + city_features
    ].values,
    y=train["is_churn"],
    predictor=lambda fitted, rows: fitted.predict_proba(rows)[:, 1],
    random_state=SEED,
)

Good score? Maybe not: on the public leaderboard it gives a score of about 0.8 :-)