In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error

from sklearn.cross_validation  import KFold
from scipy.stats import skew, boxcox
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import itertools
import os
os.chdir("/home/udit/ipython/notebook/all/input")

shift = 200
COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,' \
               'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \
               'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \
               'cat4,cat14,cat38,cat24,cat82,cat25'.split(',')
            

def encode(charcode):
    r = 0
    ln = len(str(charcode))
    for i in range(ln):
        r += (ord(str(charcode)[i]) - ord('A') + 1) * 26 ** (ln - i - 1)
    return r

fair_constant = 0.7
def fair_obj(preds, dtrain):
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

def xg_eval_mae(yhat, dtrain):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-shift,
                                      np.exp(yhat)-shift)
def mungeskewed(train, test, numeric_feats):
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index

    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    return train_test, ntrain




In [17]:


print('\nStarted')
directory = '../input/'
train = pd.read_csv(directory + 'train.csv')
test = pd.read_csv(directory + 'test.csv')

numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x]
categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x]
train_test, ntrain = mungeskewed(train, test, numeric_feats)

# taken from Vladimir's script (https://www.kaggle.com/iglovikov/allstate-claims-severity/xgb-1114)
for column in list(train.select_dtypes(include=['object']).columns):
    if train[column].nunique() != test[column].nunique():
        set_train = set(train[column].unique())
        set_test = set(test[column].unique())
        remove_train = set_train - set_test
        remove_test = set_test - set_train

        remove = remove_train.union(remove_test)


        def filter_cat(x):
            if x in remove:
                return np.nan
            return x


        train_test[column] = train_test[column].apply(lambda x: filter_cat(x), 1)

# taken from Ali's script (https://www.kaggle.com/aliajouz/allstate-claims-severity/singel-model-lb-1117)
train_test["cont1"] = np.sqrt(preprocessing.minmax_scale(train_test["cont1"]))
train_test["cont4"] = np.sqrt(preprocessing.minmax_scale(train_test["cont4"]))
train_test["cont5"] = np.sqrt(preprocessing.minmax_scale(train_test["cont5"]))
train_test["cont8"] = np.sqrt(preprocessing.minmax_scale(train_test["cont8"]))
train_test["cont10"] = np.sqrt(preprocessing.minmax_scale(train_test["cont10"]))
train_test["cont11"] = np.sqrt(preprocessing.minmax_scale(train_test["cont11"]))
train_test["cont12"] = np.sqrt(preprocessing.minmax_scale(train_test["cont12"]))

train_test["cont6"] = np.log(preprocessing.minmax_scale(train_test["cont6"]) + 0000.1)
train_test["cont7"] = np.log(preprocessing.minmax_scale(train_test["cont7"]) + 0000.1)
train_test["cont9"] = np.log(preprocessing.minmax_scale(train_test["cont9"]) + 0000.1)
train_test["cont13"] = np.log(preprocessing.minmax_scale(train_test["cont13"]) + 0000.1)
train_test["cont14"] = (np.maximum(train_test["cont14"] - 0.179722, 0) / 0.665122) ** 0.25

print('')
for comb in itertools.combinations(COMB_FEATURE, 2):
    feat = comb[0] + "_" + comb[1]
    train_test[feat] = train_test[comb[0]] + train_test[comb[1]]
    train_test[feat] = train_test[feat].apply(encode)
    print('Combining Columns:', feat)

print('')
for col in categorical_feats:
    print('Analyzing Column:', col)
    train_test[col] = train_test[col].apply(encode)

print(train_test[categorical_feats])

ss = StandardScaler()
train_test[numeric_feats] = \
    ss.fit_transform(train_test[numeric_feats].values)

train = train_test.iloc[:ntrain, :].copy()
test = train_test.iloc[ntrain:, :].copy()


Started

('Combining Columns:', 'cat80_cat87')
('Combining Columns:', 'cat80_cat57')
('Combining Columns:', 'cat80_cat12')
('Combining Columns:', 'cat80_cat79')
('Combining Columns:', 'cat80_cat10')
('Combining Columns:', 'cat80_cat7')
('Combining Columns:', 'cat80_cat89')
('Combining Columns:', 'cat80_cat2')
('Combining Columns:', 'cat80_cat72')
('Combining Columns:', 'cat80_cat81')
('Combining Columns:', 'cat80_cat11')
('Combining Columns:', 'cat80_cat1')
('Combining Columns:', 'cat80_cat13')
('Combining Columns:', 'cat80_cat9')
('Combining Columns:', 'cat80_cat3')
('Combining Columns:', 'cat80_cat16')
('Combining Columns:', 'cat80_cat90')
('Combining Columns:', 'cat80_cat23')
('Combining Columns:', 'cat80_cat36')
('Combining Columns:', 'cat80_cat73')
('Combining Columns:', 'cat80_cat103')
('Combining Columns:', 'cat80_cat40')
('Combining Columns:', 'cat80_cat28')
('Combining Columns:', 'cat80_cat111')
('Combining Columns:', 'cat80_cat6')
('Combining Columns:', 'cat80_cat76')
('Comb

MemoryError: 

In [2]:
train=pd.read_csv('processed_trained.csv')
test=pd.read_csv('processed_test.csv')
#train.to_csv('processed_trained.csv')
#test.to_csv('processed_test.csv')

In [3]:
train=train.astype(float)
test=test.astype(float)

In [4]:
print('\nMedian Loss:', train.loss.median())
print('Mean Loss:', train.loss.mean())

##Add dummy features
var=['cat101','cat87','cat10']
dummay=pd.get_dummies(train[var].astype(str))
train=pd.concat([train,dummay],axis=1)
dummay=pd.get_dummies(test[var].astype(str))
test=pd.concat([test,dummay],axis=1)
train=train.drop(var,axis=1)
test=test.drop(var,axis=1)
#desvar=['cat79', 'cat101', 'cat87', 'cat57', 'cat12', 'cat10', 'cat7',
#      'cat89', 'cat2', 'cat72', 'cat11', 'cat13']
#for col in desvar:
#    mapmean=train.groupby(col).loss.mean()
#    train.loc[:,'mean_'+col]=train[col].map(dict(mapmean/max(mapmean)))
#    test.loc[:,'mean_'+col]=test[col].map(dict(mapmean/max(mapmean)))
#    stdmean=train.groupby(col).loss.std()
#    train.loc[:,'std_'+col]=train[col].map(dict(stdmean/max(mapmean)))
#    test.loc[:,'std_'+col]=test[col].map(dict(stdmean/max(mapmean)))

('\nMedian Loss:', 2115.5699999999997)
('Mean Loss:', 3037.3376856699833)


In [5]:
train=train.drop('cat101_32000.0',axis=1)

In [6]:
train=train.fillna(-1)
test=test.fillna(-1)
ids = pd.read_csv('../input/test.csv')['id']
train_y = np.log(train['loss'] + shift)
train_x = train.drop(['loss','id'], axis=1)
test_x = test.drop(['loss','id'], axis=1)

n_folds = 10
cv_sum = 0
early_stopping = 100
fpred = []
xgb_rounds = []

d_train_full = xgb.DMatrix(train_x, label=train_y)
d_test = xgb.DMatrix(test_x)

kf = KFold(train.shape[0], n_folds=n_folds)

In [12]:
params = {
    'seed': 0,
    'colsample_bytree': 0.5,
    #'silent': 1,
    'subsample': 0.4,
    'learning_rate': 0.002,
    'objective': 'reg:linear',
    'max_depth': 8,
    'min_child_weight': 100,
    'booster': 'gbtree',
      'nthread':4,
    'lambda':0.001,

    'scale_pos_weight':1.0
}

#d_train = xgb.DMatrix(X_train, label=y_train)
#d_valid = xgb.DMatrix(X_val, label=y_val)
watchlist = [(d_train_full, 'train')]

clf = xgb.train(params,
                d_train_full,
                12000,
                watchlist,
                #early_stopping_rounds=50,
                obj=fair_obj,
                feval=xg_eval_mae,
                verbose_eval=50)

#xgb_rounds.append(clf.best_iteration)
#scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
#cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
#print('eval-MAE: %.6f' % cv_score)
y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift

temp = pd.DataFrame({'id':ids,'loss':y_pred})
temp.to_csv("xgb_pred_all_10k.csv",index=False)

[0]	train-mae:3235.39
[50]	train-mae:3143.95
[100]	train-mae:2924.38
[150]	train-mae:2678.4
[200]	train-mae:2451.74
[250]	train-mae:2255.03
[300]	train-mae:2087.1
[350]	train-mae:1944.65
[400]	train-mae:1823.97
[450]	train-mae:1721.64
[500]	train-mae:1634.84
[550]	train-mae:1561.32
[600]	train-mae:1498.98
[650]	train-mae:1445.99
[700]	train-mae:1401.02
[750]	train-mae:1362.93
[800]	train-mae:1330.67
[850]	train-mae:1303.28
[900]	train-mae:1280.02
[950]	train-mae:1260.4
[1000]	train-mae:1243.68
[1050]	train-mae:1229.59
[1100]	train-mae:1217.57
[1150]	train-mae:1207.21
[1200]	train-mae:1198.47
[1250]	train-mae:1191.06
[1300]	train-mae:1184.61
[1350]	train-mae:1179.12
[1400]	train-mae:1174.33
[1450]	train-mae:1170.09
[1500]	train-mae:1166.44
[1550]	train-mae:1163.23
[1600]	train-mae:1160.36
[1650]	train-mae:1157.87
[1700]	train-mae:1155.61
[1750]	train-mae:1153.57
[1800]	train-mae:1151.75
[1850]	train-mae:1150.08
[1900]	train-mae:1148.6
[1950]	train-mae:1147.21
[2000]	train-mae:1145.89
[2

In [10]:
temp = pd.DataFrame({'id':ids,'loss':y_pred})
temp.to_csv("xgb_pred_all.csv",index=False)

In [None]:
for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d' % (i+1))
    X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index]

    rand_state = 2016

    params = {
        'seed': 0,
        'colsample_bytree': 0.5,
        #'silent': 1,
        'subsample': 0.5,
        'learning_rate': 0.005,
        'objective': 'reg:linear',
        'max_depth': 12,
        'min_child_weight': 100,
        'booster': 'gbtree',
          'nthread':8,
        'lambda':0.01,
        
        'scale_pos_weight':1.0
    }

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]

    clf = xgb.train(params,
                    d_train,
                    10000,
                    watchlist,
                    early_stopping_rounds=50,
                    obj=fair_obj,
                    feval=xg_eval_mae,
                    verbose_eval=50)

    xgb_rounds.append(clf.best_iteration)
    scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
    cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
    print('eval-MAE: %.6f' % cv_score)
    y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift
    
    temp = pd.DataFrame({'id':ids,'result':y_pred})
    temp.to_csv("xgb_pred_fold_"+str(i)+".csv",index=False)
    if i > 0:
        fpred = pred + y_pred
    else:
        fpred = y_pred
    pred = fpred
    cv_sum = cv_sum + cv_score

mpred = pred / n_folds
score = cv_sum / n_folds
print('Average eval-MAE: %.6f' % score)
n_rounds = int(np.mean(xgb_rounds))

print("Writing results")
result = pd.DataFrame(mpred, columns=['loss'])
result["id"] = ids
result = result.set_index("id")
print("%d-fold average prediction:" % n_folds)

now = datetime.now()
score = str(round((cv_sum / n_folds), 6))
sub_file = 'xgbmodel_d' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
print("Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')


 Fold 1
[0]	train-mae:3234.02	eval-mae:3242.34
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 50 rounds.
[50]	train-mae:2779.31	eval-mae:2786.4
