In [1]:
import os
os.chdir("/home/udit/ipython/notebook/all/input")

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
import itertools

shift = 200
COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,' \
               'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \
               'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \
               'cat4,cat14,cat38,cat24,cat82,cat25'.split(',')

def encode(charcode):
    r = 0
    ln = len(str(charcode))
    for i in range(ln):
        r += (ord(str(charcode)[i]) - ord('A') + 1) * 26 ** (ln - i - 1)
    return r

fair_constant = 2
def fair_obj(preds, dtrain):
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

def xg_eval_mae(yhat, dtrain):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-shift,
                                      np.exp(yhat)-shift)
def mungeskewed(train, test, numeric_feats):
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index

    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    return train_test, ntrain

In [3]:
print('\nStarted')
directory = '../input/'
train = pd.read_csv(directory + 'train.csv.zip')
test = pd.read_csv(directory + 'test.csv.zip')

numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x]
categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x]
  
train_test, ntrain = mungeskewed(train, test, numeric_feats)

print('')
for comb in itertools.combinations(COMB_FEATURE, 2):
    feat = comb[0] + "_" + comb[1]
    train_test[feat] = train_test[comb[0]] + train_test[comb[1]]
    train_test[feat] = train_test[feat].apply(encode)
    print('Analyzing Columns:', feat)

categorical_feats = [x for x in train_test.columns[1:] if 'cat' in x]

print('')
for col in categorical_feats:
    print('Analyzing Column:', col)
    train_test[col] = train_test[col].apply(encode)

print(train_test[categorical_feats])

ss = StandardScaler()
train_test[numeric_feats] = \
    ss.fit_transform(train_test[numeric_feats].values)

train = train_test.iloc[:ntrain, :].copy()
test = train_test.iloc[ntrain:, :].copy()

print('\nMedian Loss:', train.loss.median())
print('Mean Loss:', train.loss.mean())



Started

('Analyzing Columns:', 'cat80_cat87')
('Analyzing Columns:', 'cat80_cat57')
('Analyzing Columns:', 'cat80_cat12')
('Analyzing Columns:', 'cat80_cat79')
('Analyzing Columns:', 'cat80_cat10')
('Analyzing Columns:', 'cat80_cat7')
('Analyzing Columns:', 'cat80_cat89')
('Analyzing Columns:', 'cat80_cat2')
('Analyzing Columns:', 'cat80_cat72')
('Analyzing Columns:', 'cat80_cat81')
('Analyzing Columns:', 'cat80_cat11')
('Analyzing Columns:', 'cat80_cat1')
('Analyzing Columns:', 'cat80_cat13')
('Analyzing Columns:', 'cat80_cat9')
('Analyzing Columns:', 'cat80_cat3')
('Analyzing Columns:', 'cat80_cat16')
('Analyzing Columns:', 'cat80_cat90')
('Analyzing Columns:', 'cat80_cat23')
('Analyzing Columns:', 'cat80_cat36')
('Analyzing Columns:', 'cat80_cat73')
('Analyzing Columns:', 'cat80_cat103')
('Analyzing Columns:', 'cat80_cat40')
('Analyzing Columns:', 'cat80_cat28')
('Analyzing Columns:', 'cat80_cat111')
('Analyzing Columns:', 'cat80_cat6')
('Analyzing Columns:', 'cat80_cat76')
('Anal

In [4]:
train=train[train.loss<13981]
train=train[train.loss>250]

In [5]:
for cat_fe in COMB_FEATURE:
    lossmeanmappig=train.groupby(cat_fe).loss.mean()
    lossmeanmappig/=lossmeanmappig.max()
    lossmeanmappig=dict(lossmeanmappig)
    lossmedianmapping=train.groupby(cat_fe).loss.median()
    lossmedianmapping/=lossmedianmapping.max()
    lossmedianmapping=dict(lossmedianmapping)
    train.loc[:,'lossmean_'+cat_fe]=train[cat_fe].map(lossmeanmappig)
    test.loc[:,'lossmean_'+cat_fe]=test[cat_fe].map(lossmeanmappig)
    train.loc[:,'lossmedian_'+cat_fe]=train[cat_fe].map(lossmedianmapping)
    test.loc[:,'lossmedian_'+cat_fe]=test[cat_fe].map(lossmedianmapping)

In [6]:
for cat_fe in COMB_FEATURE:    
    val=train[cat_fe].append(test[cat_fe])
    probabiltyMapping=dict(val.value_counts()/train.shape[0])
    train.loc[:,'energy_'+cat_fe]=train[cat_fe].map(probabiltyMapping)
    test.loc[:,'energy_'+cat_fe]=test[cat_fe].map(probabiltyMapping)

In [None]:
train=train.astype('float64')
test=test.astype('float64')

In [None]:
ids = pd.read_csv('../input/test.csv.zip')['id']
train_y = np.log(train['loss'] + shift)
train_x = train.drop(['loss','id'], axis=1)
test_x = test.drop(['loss','id'], axis=1)

n_folds = 10
cv_sum = 0
early_stopping = 100
fpred = []
xgb_rounds = []

d_train_full = xgb.DMatrix(train_x, label=train_y)
d_test = xgb.DMatrix(test_x)

kf = KFold(train.shape[0], n_folds=n_folds)
for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d' % (i+1))
    X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index]

    rand_state = 2016

    params = {
        'seed': 0,
        'colsample_bytree': 0.7,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.03,
        'objective': 'reg:linear',
        'max_depth': 12,
        'min_child_weight': 100,
        'booster': 'gbtree'}

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]

    clf = xgb.train(params,
                    d_train,
                    100000,
                    watchlist,
                    early_stopping_rounds=50,
                    obj=fair_obj,
                    feval=xg_eval_mae)

    xgb_rounds.append(clf.best_iteration)
    scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
    cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
    print('eval-MAE: %.6f' % cv_score)
    y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift

    if i > 0:
        fpred = pred + y_pred
    else:
        fpred = y_pred
    pred = fpred
    cv_sum = cv_sum + cv_score

mpred = pred / n_folds
score = cv_sum / n_folds
print('Average eval-MAE: %.6f' % score)
n_rounds = int(np.mean(xgb_rounds))

print("Writing results")
result = pd.DataFrame(mpred, columns=['loss'])
result["id"] = ids
result = result.set_index("id")
print("%d-fold average prediction:" % n_folds)

now = datetime.now()
score = str(round((cv_sum / n_folds), 6))
sub_file = 'xgbfeatures_energy' + str(score) + '_' + str(
    now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
print("Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')


 Fold 1
[0]	train-mae:3083.64	eval-mae:3080.03
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 50 rounds.
[1]	train-mae:3078.3	eval-mae:3074.69
[2]	train-mae:3069.84	eval-mae:3066.22
[3]	train-mae:3057.78	eval-mae:3054.15
[4]	train-mae:3041.86	eval-mae:3038.23
[5]	train-mae:3022.05	eval-mae:3018.4
[6]	train-mae:2998.49	eval-mae:2994.79
[7]	train-mae:2971.39	eval-mae:2967.66
[8]	train-mae:2941.04	eval-mae:2937.24
[9]	train-mae:2907.72	eval-mae:2903.86
[10]	train-mae:2871.76	eval-mae:2867.84
[11]	train-mae:2833.69	eval-mae:2829.65
[12]	train-mae:2793.76	eval-mae:2789.6
[13]	train-mae:2752.24	eval-mae:2747.94
[14]	train-mae:2709.59	eval-mae:2705.16
[15]	train-mae:2666	eval-mae:2661.44
[16]	train-mae:2621.71	eval-mae:2617
[17]	train-mae:2577.02	eval-mae:2572.14
[18]	train-mae:2531.86	eval-mae:2526.82
[19]	train-mae:2486.82	eval-mae:2481.58
[20]	train-mae:2441.72	eval-mae:2436.33
[21]	train-mae:2396.85	eval-m