In [3]:
import os
os.chdir("/home/udit/ipython/notebook/all/input")

import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold
from scipy.stats import skew, boxcox
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import itertools

shift = 200
COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,' \
               'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \
               'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \
               'cat4,cat14,cat38,cat24,cat82,cat25'.split(',')

def encode(charcode):
    r = 0
    ln = len(str(charcode))
    for i in range(ln):
        r += (ord(str(charcode)[i]) - ord('A') + 1) * 26 ** (ln - i - 1)
    return r

fair_constant = 0.7
def fair_obj(preds, dtrain):
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

def xg_eval_mae(yhat, dtrain):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-shift,
                                      np.exp(yhat)-shift)
def mungeskewed(train, test, numeric_feats):
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index

    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    return train_test, ntrain



print('\nStarted')
directory = '../input/'
train = pd.read_csv(directory + 'train.csv')
test = pd.read_csv(directory + 'test.csv')

numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x]
categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x]
train_test, ntrain = mungeskewed(train, test, numeric_feats)

# taken from Vladimir's script (https://www.kaggle.com/iglovikov/allstate-claims-severity/xgb-1114)
for column in list(train.select_dtypes(include=['object']).columns):
    if train[column].nunique() != test[column].nunique():
        set_train = set(train[column].unique())
        set_test = set(test[column].unique())
        remove_train = set_train - set_test
        remove_test = set_test - set_train

        remove = remove_train.union(remove_test)


        def filter_cat(x):
            if x in remove:
                return np.nan
            return x


        train_test[column] = train_test[column].apply(lambda x: filter_cat(x), 1)

# taken from Ali's script (https://www.kaggle.com/aliajouz/allstate-claims-severity/singel-model-lb-1117)
train_test["cont1"] = np.sqrt(preprocessing.minmax_scale(train_test["cont1"]))
train_test["cont4"] = np.sqrt(preprocessing.minmax_scale(train_test["cont4"]))
train_test["cont5"] = np.sqrt(preprocessing.minmax_scale(train_test["cont5"]))
train_test["cont8"] = np.sqrt(preprocessing.minmax_scale(train_test["cont8"]))
train_test["cont10"] = np.sqrt(preprocessing.minmax_scale(train_test["cont10"]))
train_test["cont11"] = np.sqrt(preprocessing.minmax_scale(train_test["cont11"]))
train_test["cont12"] = np.sqrt(preprocessing.minmax_scale(train_test["cont12"]))

train_test["cont6"] = np.log(preprocessing.minmax_scale(train_test["cont6"]) + 0000.1)
train_test["cont7"] = np.log(preprocessing.minmax_scale(train_test["cont7"]) + 0000.1)
train_test["cont9"] = np.log(preprocessing.minmax_scale(train_test["cont9"]) + 0000.1)
train_test["cont13"] = np.log(preprocessing.minmax_scale(train_test["cont13"]) + 0000.1)
train_test["cont14"] = (np.maximum(train_test["cont14"] - 0.179722, 0) / 0.665122) ** 0.25

print('')
for comb in itertools.combinations(COMB_FEATURE, 2):
    feat = comb[0] + "_" + comb[1]
    train_test[feat] = train_test[comb[0]] + train_test[comb[1]]
    train_test[feat] = train_test[feat].apply(encode)
    print('Combining Columns:', feat)

print('')
for col in categorical_feats:
    print('Analyzing Column:', col)
    train_test[col] = train_test[col].apply(encode)

print(train_test[categorical_feats])

ss = StandardScaler()
train_test[numeric_feats] = \
    ss.fit_transform(train_test[numeric_feats].values)

train = train_test.iloc[:ntrain, :].copy()
test = train_test.iloc[ntrain:, :].copy()

print('\nMedian Loss:', train.loss.median())
print('Mean Loss:', train.loss.mean())

    


Started

('Combining Columns:', 'cat80_cat87')
('Combining Columns:', 'cat80_cat57')
('Combining Columns:', 'cat80_cat12')
('Combining Columns:', 'cat80_cat79')
('Combining Columns:', 'cat80_cat10')
('Combining Columns:', 'cat80_cat7')
('Combining Columns:', 'cat80_cat89')
('Combining Columns:', 'cat80_cat2')
('Combining Columns:', 'cat80_cat72')
('Combining Columns:', 'cat80_cat81')
('Combining Columns:', 'cat80_cat11')
('Combining Columns:', 'cat80_cat1')
('Combining Columns:', 'cat80_cat13')
('Combining Columns:', 'cat80_cat9')
('Combining Columns:', 'cat80_cat3')
('Combining Columns:', 'cat80_cat16')
('Combining Columns:', 'cat80_cat90')
('Combining Columns:', 'cat80_cat23')
('Combining Columns:', 'cat80_cat36')
('Combining Columns:', 'cat80_cat73')
('Combining Columns:', 'cat80_cat103')
('Combining Columns:', 'cat80_cat40')
('Combining Columns:', 'cat80_cat28')
('Combining Columns:', 'cat80_cat111')
('Combining Columns:', 'cat80_cat6')
('Combining Columns:', 'cat80_cat76')
('Comb

In [4]:
ids = pd.read_csv('../input/test.csv')['id']
train_ids=train.id
train_y = np.log(train['loss'] + shift)
train_x = train.drop(['loss','id'], axis=1)
test_x = test.drop(['loss','id'], axis=1)
t_ids=[]
t_preds=[]
n_folds = 10
cv_sum = 0
early_stopping = 100
fpred = []
xgb_rounds = []

d_train_full = xgb.DMatrix(train_x, label=train_y)
d_test = xgb.DMatrix(test_x)

kf = KFold(train.shape[0], n_folds=n_folds)
for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d' % (i+1))
    X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index]
    t_ids.append(train_ids[train_index])

    rand_state = 2016

    params = {
        'seed': 0,
        'colsample_bytree': 0.7,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.03,
        'objective': 'reg:linear',
        'max_depth': 12,
        'min_child_weight': 100,
        'booster': 'gbtree'}

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]

    clf = xgb.train(params,
                    d_train,
                    100000,
                    watchlist,
                    early_stopping_rounds=50,
                    obj=fair_obj,
                    feval=xg_eval_mae)

    xgb_rounds.append(clf.best_iteration)
    scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
    t_preds.append(scores_val)
    cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
    print('eval-MAE: %.6f' % cv_score)
    y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift

    if i > 0:
        fpred = pred + y_pred
    else:
        fpred = y_pred
    pred = fpred
    cv_sum = cv_sum + cv_score

mpred = pred / n_folds
score = cv_sum / n_folds
print('Average eval-MAE: %.6f' % score)
n_rounds = int(np.mean(xgb_rounds))

print("Writing results")
result = pd.DataFrame(mpred, columns=['loss'])
result["id"] = ids
result = result.set_index("id")
print("%d-fold average prediction:" % n_folds)
t_submission=pd.DataFrame({"id":t_ids,"prediction":t_preds})
t_submission.to_csv("trainsubmission.csv",index=False)
now = datetime.now()
score = str(round((cv_sum / n_folds), 6))
sub_file = 'xgb_6submission_5fold-average-xgb_fairobj_' + str(score) + '_' + str(
    now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
print("Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')



 Fold 1
[0]	train-mae:3216.67	eval-mae:3225
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 50 rounds.
[1]	train-mae:3174.86	eval-mae:3183.17
[2]	train-mae:3113.79	eval-mae:3122.06
[3]	train-mae:3040.82	eval-mae:3049.05
[4]	train-mae:2961.02	eval-mae:2969.14
[5]	train-mae:2878.43	eval-mae:2886.39
[6]	train-mae:2796.59	eval-mae:2804.27
[7]	train-mae:2716.33	eval-mae:2723.64
[8]	train-mae:2638.13	eval-mae:2645.28
[9]	train-mae:2563.01	eval-mae:2569.78
[10]	train-mae:2491.02	eval-mae:2497.6
[11]	train-mae:2422.71	eval-mae:2429.04
[12]	train-mae:2357.58	eval-mae:2363.82
[13]	train-mae:2295.58	eval-mae:2301.61
[14]	train-mae:2236.89	eval-mae:2242.95
[15]	train-mae:2181.58	eval-mae:2187.85
[16]	train-mae:2128.7	eval-mae:2134.95
[17]	train-mae:2078.25	eval-mae:2084.54
[18]	train-mae:2030.67	eval-mae:2037.09
[19]	train-mae:1985.91	eval-mae:1992.21
[20]	train-mae:1943.09	eval-mae:1949.5
[21]	train-mae:1902.13	eva

In [10]:
for index,p in zip(t_ids,t_preds):
    print(len(index),len(p))

(169486, 18832)
(169486, 18832)
(169486, 18832)
(169486, 18832)
(169486, 18832)
(169486, 18832)
(169486, 18832)
(169486, 18832)
(169487, 18831)
(169487, 18831)


In [15]:
size=0
for p in t_preds:
    size+=len(p)

In [17]:
print(train.shape,size)

((188318, 727), 188318)


In [19]:
id_size=0
for isd in t_ids:
    id_size+=len(isd)
print(id_size)

1694862


In [33]:
t=[]
for i, (train_index, test_index) in enumerate(kf):
    t.extend(train_ids.iloc[test_index])

[1, 2, 5, 10, 11, 13, 14, 20, 23, 24, 25, 33, 34, 41, 47, 48, 49, 51, 52, 55, 57, 60, 61, 66, 73, 76, 86, 89, 90, 93, 100, 105, 107, 111, 112, 116, 118, 125, 130, 139, 140, 144, 146, 149, 153, 156, 162, 166, 167, 171, 172, 173, 174, 175, 186, 191, 197, 200, 202, 205, 207, 208, 212, 215, 218, 222, 223, 225, 226, 228, 232, 233, 237, 244, 245, 248, 259, 260, 276, 277, 285, 289, 291, 292, 293, 298, 306, 308, 316, 318, 319, 320, 322, 324, 325, 327, 333, 335, 337, 343, 351, 354, 383, 386, 388, 393, 401, 402, 404, 406, 409, 412, 414, 418, 422, 424, 427, 430, 433, 435, 437, 439, 440, 442, 443, 449, 450, 451, 454, 458, 474, 476, 478, 481, 484, 488, 493, 499, 502, 511, 516, 517, 522, 526, 527, 534, 536, 538, 540, 542, 550, 552, 560, 561, 562, 563, 567, 568, 572, 576, 577, 578, 584, 586, 590, 592, 596, 598, 599, 603, 605, 606, 607, 608, 610, 611, 612, 614, 617, 621, 625, 626, 627, 630, 632, 640, 644, 650, 653, 657, 658, 663, 667, 670, 674, 677, 679, 681, 685, 687, 691, 693, 699, 700, 702, 711, 71

In [36]:
p=[]
for prd in t_preds:
    p.extend(prd)

In [43]:
prdest=pd.DataFrame({"id":t,"prediction":np.exp(p)-shift})

In [44]:
prdest.to_csv("trainsubmission.csv",index=False)