In [None]:
#Import libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [None]:
# Parse both competition files the same way: the value -1 is read as NaN.
read_opts = dict(na_values=-1)
train = pd.read_csv("../input/train.csv", **read_opts)
test = pd.read_csv('../input/test.csv', **read_opts)

In [None]:
# Narrow the numeric dtypes of both frames (presumably to reduce memory use).
# Column lists are taken from `train` and applied to `test` as well.
float_cols = train.select_dtypes(include=['float64']).columns
# The first two int64 columns are skipped (presumably `id` and `target` — confirm).
int_cols = train.select_dtypes(include=['int64']).columns[2:]

for cols, narrow_dtype in ((float_cols, np.float16), (int_cols, np.int8)):
    for c in cols:
        train[c] = train[c].astype(narrow_dtype)
        test[c] = test[c].astype(narrow_dtype)

In [None]:
# Number of training rows whose target equals 1.
positive_size = len(train.loc[train['target'] == 1])

In [None]:
# Share of positive rows as a fraction in [0, 1] (despite the name, not yet multiplied by 100).
positive_percentage = positive_size / float(len(train))

In [None]:
# Report the positive share as a percentage with two decimals.
positive_pct_text = "%.2f" % (positive_percentage*100)
print(positive_pct_text, "% of the data is labelled as positive.")

## Missing values

In [None]:
# Fraction of missing entries per column (0.0 = none missing, 1.0 = all missing).
# `isnull().mean()` is vectorised; the previous version ran Python's builtin `sum`
# element by element over every column.
missing_values = train.isnull().mean()

In [None]:
# List the columns with the largest share of missing values.
# The heading is derived from `top_n` so it always matches the number of rows shown
# (it previously said "Top 5" while ten rows were printed). Values are fractions, not percentages.
top_n = 10
print("Top %d features with most missing values and their missing fraction:" % top_n)
print(missing_values.sort_values(ascending=False).head(top_n))

In [None]:
# Remove two `ps_car_*_cat` columns and the `id` column from the training frame.
# Note: this rebinds `train`, so re-running the cell raises KeyError (columns already gone).
columns_to_drop = ['ps_car_03_cat', 'ps_car_05_cat', 'id']
train = train.drop(columns_to_drop, axis=1)

## Visualisation

In [None]:
# Bar chart of the class balance: one bar per target value, annotated with its share of rows.

# Set the font scale BEFORE drawing; applied afterwards it only affected later figures
# (or this one on a second run), which made the output depend on execution history.
sns.set(font_scale=1.5)

n_rows = len(train)
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=train.target.values, palette="Set2", ax=ax)

# Leave headroom above the tallest bar so the annotations are not clipped.
# Scaled from the row count (the original hard-coded 700000 for 595212 rows).
ax.set_ylim(top=1.18 * n_rows)
for p in ax.patches:
    # Label each bar with its percentage of all rows, placed slightly above the bar.
    ax.annotate('{:.2f}%'.format(100 * p.get_height() / n_rows),
                (p.get_x() + 0.3, p.get_height() + 0.017 * n_rows))

# Title uses the actual row count instead of a hard-coded number.
ax.set_title('Distribution of {} Targets'.format(n_rows))
ax.set_xlabel('Initiation of Auto Insurance Claim Next Year')
# The axis shows raw counts; the percentages are only in the bar annotations.
ax.set_ylabel('Count')
plt.show()

In [None]:
# Heatmap of the pairwise correlations between all columns of `train`.
sns.set(style="white")

# Correlation matrix; kept in `corr` because it is inspected again afterwards.
corr = train.corr()

# Figure and axes that host the heatmap.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap so positive and negative correlations get distinct hues.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Full (unmasked) square heatmap, colour scale centred on 0 and capped at 0.3.
heatmap_options = dict(
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, **heatmap_options)

plt.show()

In [None]:
# Absolute correlation of every column with the target, weakest first.
corr['target'].abs().sort_values()

## Define Gini metric

In [None]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (un-normalized) Gini coefficient of `pred` with respect to `actual`.

    Rows are ordered by descending prediction, with ties broken by original
    position, and the cumulative share of `actual` is accumulated in that order.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (e.g. 0/1 labels).
    pred : sequence of numbers
        Predicted scores; must have the same length as `actual`.
    cmpcol, sortcol : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    float
        The Gini value. It is NaN when `actual` sums to zero (0/0 division).

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert( len(actual) == len(pred) )
    n = len(actual)
    # Columns: 0 = actual, 1 = prediction, 2 = original position (tie-breaker).
    # Builtin `float` replaces `np.float`, which was removed in NumPy 1.24.
    data = np.asarray(np.c_[ actual, pred, np.arange(n) ], dtype=float)
    # lexsort uses the LAST key as primary: descending prediction, then ascending position.
    data = data[ np.lexsort((data[:,2], -1*data[:,1])) ]
    total_losses = data[:,0].sum()
    gini_sum = data[:,0].cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n
 
def gini_normalized(a, p):
    """Return the Gini of predictions `p` divided by the Gini of a perfect ranking.

    `a` holds the actual values; the denominator is `gini(a, a)`, i.e. the score
    obtained when the actual values themselves are used as predictions.
    """
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini

def gini_xgb(preds, dtrain):
    """Evaluation callback returning the normalized Gini as a ('gini', value) pair.

    `preds` are the model predictions and `dtrain` is an object exposing
    `get_label()` for the matching true labels.
    """
    true_labels = dtrain.get_label()
    return 'gini', gini_normalized(true_labels, preds)

## Parameters tuning

In [None]:
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

In [None]:
# Separate the label column from the feature matrix.
y = train['target']
X = train.drop(['target'], axis=1)
# Feature column names, reused later to select the same columns from `test`.
features = X.columns

In [None]:
# Stratified, shuffled 5-fold split with a fixed seed for reproducibility.
kfold = 5
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=32)
# `skf.split` returns a one-shot generator; materialise it so `folds` can be
# reused by more than one cell (a generator would be empty after its first use).
# Existing `list(folds)` callers keep working unchanged.
folds = list(skf.split(X, y))

In [None]:
# Base estimator for the grid search; the grid overrides some of these settings
# (e.g. `max_depth`) per candidate.
base_estimator_kwargs = dict(
    n_estimators=1000,
    max_depth=5,
    objective='binary:logistic',
    nthread=5,
    scale_pos_weight=24,
    seed=27,
)
xgb_model = xgb.XGBClassifier(**base_estimator_kwargs)

In [None]:
# Exhaustive grid over the XGBoost hyper-parameters. Author's rules of thumb:
#   * max_depth is usually 6, 7 or 8
#   * learning rate is around 0.05, but small changes can make a big difference
#   * min_child_weight, subsample and colsample_bytree are the main levers
#     against overfitting
#   * n_estimators is the number of boosting rounds
#   * ensembling XGBoost models trained with several seeds may reduce variance
param_grid = dict(
    learning_rate=[0.05],        # the so-called `eta` value
    max_depth=[3, 5, 7, 9],
    min_child_weight=[5, 7, 9],
    gamma=[0],
    subsample=[0.8],
    colsample_bytree=[0.7],
)

In [None]:
# Grid search over `param_grid`, scored by ROC AUC on the precomputed folds.
# `refit=True` retrains the best candidate on the full data afterwards.
cv_splits = list(folds)
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv_splits,
    n_jobs=5,
    refit=True,
    verbose=2,
)

clf.fit(X, y)

In [None]:
# Trust your CV! Dump every candidate's score, then the winning configuration.
print(clf.grid_scores_)
print(clf.best_params_)
print(clf.best_score_)

# Each `grid_scores_` record unpacks into three items; the second is the score
# used to pick the best candidate.
top_record = max(clf.grid_scores_, key=lambda record: record[1])
best_parameters, score, _ = top_record
print('Raw AUC score:', score)
for param_name in sorted(best_parameters):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# Positive-class probabilities for the test set from the refitted best model.
positive_class_column = 1
test_probs = clf.predict_proba(test[features])[:, positive_class_column]

In [None]:
# NOTE(review): neither `modelfit` nor `default_params` is defined anywhere in the
# visible notebook, so on a fresh kernel this cell would raise NameError — confirm
# they are defined elsewhere, or remove this cell.
# NOTE(review): `folds` is also consumed by `list(folds)` in the grid-search cell;
# if it is a one-shot generator it will be empty by the time it reaches this call.
modelfit(default_params, X, y, folds=folds, num_round=1000)

## Model fitting

In [None]:
# Booster settings handed to `xgb.train` in the per-fold training loop.
# NOTE(review): `num_boost_round` is stored here as a dict entry, while the
# `xgb.train` call in this notebook passes 1600 rounds positionally — confirm
# which value is intended.
params = dict(
    eval_metric='auc',
    min_child_weight=10.0,
    objective='binary:logistic',
    max_depth=7,
    max_delta_step=1.8,
    colsample_bytree=0.4,
    subsample=0.8,
    eta=0.025,
    gamma=0.65,
    num_boost_round=700,
)

In [None]:
# Submission frame: the test ids plus a `target` column initialised to 0.
sub = test[['id']].assign(target=0)

In [None]:
import gc

In [None]:
# Sanity check: show ten validation-row indices (positions 20-29) from each fold.
fold_splits = skf.split(X, y)
for i, (train_indices, test_indices) in enumerate(fold_splits):
    index_preview = test_indices[20:30]
    print(index_preview)

In [None]:
# Train one booster per stratified fold and average their test-set predictions.

# The test matrix is the same for every fold, so build it once. It is built from the
# DataFrame (not `.values`) so it carries the same feature names as the training
# matrices, which are also built from DataFrames.
d_submit = xgb.DMatrix(test[features])

# Accumulate into a fresh array and assign at the end, so re-running this cell
# does not keep adding to an already-filled `sub['target']`.
test_pred = np.zeros(len(test))

for i, (train_indices, test_indices) in enumerate(skf.split(X, y)):
    # The split yields positional row indices, so rows must be selected with `.iloc`;
    # plain `X[indices]` on a DataFrame would select columns by label instead.
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    d_train = xgb.DMatrix(X_train, y_train)
    d_test = xgb.DMatrix(X_test, y_test)
    watchlist = [(d_train, 'train'), (d_test, 'valid')]
    # Up to 1600 rounds; stop when the validation gini has not improved for 70 rounds.
    xgb_model = xgb.train(params, d_train, 1600, watchlist, early_stopping_rounds=70,
                    feval=gini_xgb, maximize=True, verbose_eval=100)
    # Each fold contributes 1/kfold of the final prediction. The tree limit
    # (best + 50) is kept exactly as in the original.
    test_pred += xgb_model.predict(d_submit,
                        ntree_limit=xgb_model.best_ntree_limit+50) / kfold
    print('Fold %d out of %d training completed.' % (i + 1, kfold))
    gc.collect()

sub['target'] = test_pred

In [None]:
# Preview the first rows only; displaying the whole submission frame adds nothing.
sub.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission_path = 'submit_result.csv'
sub.to_csv(submission_path, index=False)