In [35]:
import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold
from scipy.stats import skew, boxcox
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import itertools
import os
# NOTE(review): hard-coded absolute, user-specific path. The relative
# '../input/' reads further down resolve against this working directory.
os.chdir("/home/udit/ipython/notebook/all/input")

# Offset added to `loss` before taking the log, and subtracted again after exp.
shift = 200

# Categorical columns that are combined pairwise into interaction features.
COMB_FEATURE = [
    'cat80', 'cat87', 'cat57', 'cat12', 'cat79', 'cat10', 'cat7', 'cat89', 'cat2', 'cat72',
    'cat81', 'cat11', 'cat1', 'cat13', 'cat9', 'cat3', 'cat16', 'cat90', 'cat23', 'cat36',
    'cat73', 'cat103', 'cat40', 'cat28', 'cat111', 'cat6', 'cat76', 'cat50', 'cat5',
    'cat4', 'cat14', 'cat38', 'cat24', 'cat82', 'cat25',
]

def encode(charcode):
    """Turn a category label into an integer by reading it as a base-26 numeral.

    The argument is converted with ``str`` first. Each character contributes
    ``ord(ch) - ord('A') + 1`` (so 'A' is 1, 'B' is 2, ...), with the leftmost
    character being the most significant digit. An empty string yields 0.
    """
    total = 0
    for ch in str(charcode):
        # Horner form: shift the running value one base-26 place, add the digit.
        total = total * 26 + (ord(ch) - ord('A') + 1)
    return total

# Constant c used in the gradient/Hessian formulas of fair_obj.
fair_constant = 0.7
def fair_obj(preds, dtrain):
    """Custom objective for xgb.train: per-row gradient and Hessian.

    With x = preds - labels and c = fair_constant:
        grad = c * x / (|x| + c)
        hess = c**2 / (|x| + c)**2

    :param preds: current predictions.
    :param dtrain: object exposing ``get_label()`` with the true targets.
    :return: tuple ``(grad, hess)``.
    """
    residual = preds - dtrain.get_label()
    denom = abs(residual) + fair_constant
    gradient = fair_constant * residual / denom
    hessian = fair_constant * fair_constant / (denom * denom)
    return gradient, hessian

def xg_eval_mae(yhat, dtrain):
    """Custom xgboost evaluation metric: MAE after undoing the log transform.

    Labels and predictions are both mapped back with ``exp(.) - shift`` before
    the absolute error is averaged.

    :param yhat: predictions on the log scale.
    :param dtrain: object exposing ``get_label()`` with log-scale targets.
    :return: the pair ``('mae', value)``.
    """
    actual = np.exp(dtrain.get_label()) - shift
    predicted = np.exp(yhat) - shift
    return 'mae', mean_absolute_error(actual, predicted)
def mungeskewed(train, test, numeric_feats):
    """Stack train and test and Box-Cox transform the skewed numeric columns.

    Skewness is measured on ``train`` only; every column in ``numeric_feats``
    whose skew exceeds 0.25 is transformed on the stacked frame.

    :param train: training frame (must contain the ``numeric_feats`` columns).
    :param test: test frame; it is NOT modified. A placeholder ``loss`` column
        of zeros is added to a copy so both frames share the same columns.
    :param numeric_feats: names of the numeric columns to inspect.
    :return: ``(train_test, ntrain)`` where ``train_test`` is the stacked frame
        with a fresh 0..n-1 index and ``ntrain`` is the number of train rows.
    """
    ntrain = train.shape[0]
    # assign() returns a new frame, so the caller's `test` keeps its columns.
    test_with_loss = test.assign(loss=0)
    train_test = pd.concat((train, test_with_loss)).reset_index(drop=True)

    skewness = train[numeric_feats].apply(lambda col: skew(col.dropna()))
    skewed_feats = skewness[skewness > 0.25].index

    for feat in skewed_feats:
        # boxcox requires positive input; the +1 offset presumably guards
        # against zeros in the raw column. The fitted lambda is not needed.
        train_test[feat], _ = boxcox(train_test[feat] + 1)
    return train_test, ntrain



print('\nStarted')
directory = '../input/'
train = pd.read_csv(directory + 'train.csv')
test = pd.read_csv(directory + 'test.csv')

# Feature columns are everything in train except its first and last column;
# they are split by whether the column name contains 'cont' or 'cat'.
feature_columns = train.columns[1:-1]
numeric_feats = [name for name in feature_columns if 'cont' in name]
categorical_feats = [name for name in feature_columns if 'cat' in name]

# Stack both frames and transform the skewed continuous columns.
train_test, ntrain = mungeskewed(train, test, numeric_feats)

# taken from Vladimir's script (https://www.kaggle.com/iglovikov/allstate-claims-severity/xgb-1114)
# Replace every categorical level that occurs in only one of train/test with
# NaN in the stacked frame.
for column in list(train.select_dtypes(include=['object']).columns):
    # Levels present in exactly one of the two frames. The level sets are
    # compared directly: comparing nunique() counts would skip a column whose
    # frames have the same number of levels but different ones.
    remove = set(train[column].unique()) ^ set(test[column].unique())
    if remove:
        unshared = train_test[column].isin(list(remove))
        train_test.loc[unshared, column] = np.nan

# taken from Ali's script (https://www.kaggle.com/aliajouz/allstate-claims-severity/singel-model-lb-1117)
# Square root of the min-max scaled column.
for name in ["cont1", "cont4", "cont5", "cont8", "cont10", "cont11", "cont12"]:
    train_test[name] = np.sqrt(preprocessing.minmax_scale(train_test[name]))

# Log of the min-max scaled column plus 0.1 (originally written as `0000.1`,
# which is the same float).
for name in ["cont6", "cont7", "cont9", "cont13"]:
    train_test[name] = np.log(preprocessing.minmax_scale(train_test[name]) + 0.1)

# cont14: subtract an offset, clip at zero, rescale, then take the fourth root.
cont14_clipped = np.maximum(train_test["cont14"] - 0.179722, 0)
train_test["cont14"] = (cont14_clipped / 0.665122) ** 0.25


# Pairwise interaction features: concatenate the two raw label columns and
# encode the combined string as an integer. This runs before the single
# columns are encoded below, so the inputs are still strings here.
for left, right in itertools.combinations(COMB_FEATURE, 2):
    feat = left + "_" + right
    train_test[feat] = (train_test[left] + train_test[right]).apply(encode)
    print('Combining Columns:', feat)

print('')
# Encode each original categorical column in place.
for cat_col in categorical_feats:
    print('Analyzing Column:', cat_col)
    train_test[cat_col] = train_test[cat_col].apply(encode)

print(train_test[categorical_feats])

# Standardise the continuous columns; the scaler is fitted on train and test
# rows together.
ss = StandardScaler()
scaled_numeric = ss.fit_transform(train_test[numeric_feats].values)
train_test[numeric_feats] = scaled_numeric

# Undo the stacking: the first ntrain rows are train, the remainder is test.
train = train_test.iloc[:ntrain, :].copy()
test = train_test.iloc[ntrain:, :].copy()

print('\nMedian Loss:', train.loss.median())
print('Mean Loss:', train.loss.mean())

# Test ids are re-read from disk, so they carry a fresh 0-based index.
ids = pd.read_csv('../input/test.csv')['id']
#train_y = np.log(train['loss'] + shift)
#train_x = train.drop(['loss','id'], axis=1)
#test_x = test.drop(['loss','id'], axis=1)


Started

('Analyzing Column:', 'cat1')
('Analyzing Column:', 'cat2')
('Analyzing Column:', 'cat3')
('Analyzing Column:', 'cat4')
('Analyzing Column:', 'cat5')
('Analyzing Column:', 'cat6')
('Analyzing Column:', 'cat7')
('Analyzing Column:', 'cat8')
('Analyzing Column:', 'cat9')
('Analyzing Column:', 'cat10')
('Analyzing Column:', 'cat11')
('Analyzing Column:', 'cat12')
('Analyzing Column:', 'cat13')
('Analyzing Column:', 'cat14')
('Analyzing Column:', 'cat15')
('Analyzing Column:', 'cat16')
('Analyzing Column:', 'cat17')
('Analyzing Column:', 'cat18')
('Analyzing Column:', 'cat19')
('Analyzing Column:', 'cat20')
('Analyzing Column:', 'cat21')
('Analyzing Column:', 'cat22')
('Analyzing Column:', 'cat23')
('Analyzing Column:', 'cat24')
('Analyzing Column:', 'cat25')
('Analyzing Column:', 'cat26')
('Analyzing Column:', 'cat27')
('Analyzing Column:', 'cat28')
('Analyzing Column:', 'cat29')
('Analyzing Column:', 'cat30')
('Analyzing Column:', 'cat31')
('Analyzing Column:', 'cat32')
('Analy

In [112]:
from sklearn.linear_model import Ridge

# Regression target on the log(loss + shift) scale.
train_y = np.log(train['loss'] + shift)

# Model inputs: every column except the target and the row id.
non_feature_cols = ['loss', 'id']
train_x = train.drop(non_feature_cols, axis=1)
test_x = test.drop(non_feature_cols, axis=1)

ridgereg = Ridge(alpha=0.1, normalize=True, solver='svd', random_state=500)

In [113]:
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = ridgereg.fit(train_x, train_y).predict(test_x)

In [114]:
# Display the Ridge predictions; they are on the log(loss + shift) scale of train_y.
y_pred

array([ 7.45429293,  7.67715292,  9.45304118, ...,  7.74219386,
        7.24243787,  8.14787073])

In [36]:
# Add the continuous columns to the feature list. Names that are already
# present are skipped, so re-running this cell does not append duplicates.
COMB_FEATURE.extend(f for f in numeric_feats if f not in COMB_FEATURE)

In [37]:
# Binary target: 1 when the training loss exceeds 60000, otherwise 0. The
# comparison is cast directly so True is always class 1; pd.factorize numbers
# classes by first appearance and would swap them if the first row were True.
train_y = (train['loss'] > 60000).astype(int).values
# Model inputs: every column except the target and the row id.
train_x = train.drop(['loss', 'id'], axis=1)
test_x = test.drop(['loss', 'id'], axis=1)

In [61]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
# Columns used to index train_x/test_x for the classifiers; this is the same
# list object as COMB_FEATURE, not a copy.
features=COMB_FEATURE
# Accumulates arrays of flagged test ids (a later cell appends to it).
foundvalues=[]

In [78]:
outliervalue = 60000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# The comparison is cast directly so True is always class 1; pd.factorize
# numbers classes by first appearance and would swap them if the first
# training row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly above the threshold.
print(train.loc[train['loss'] > outliervalue, 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1; also recorded in foundvalues.
outliers = dd.id[dd.pred == 1].values
foundvalues.append(outliers)
print('ids to consider', outliers)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.9s finished


88104.5366667
('ids to consider', array([134574]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [79]:
outliervalue = 50000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 60000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 60000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})

# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    4.6s finished


54750.454
('ids to consider', array([134574, 340105]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [80]:
outliervalue = 45000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 50000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 50000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    6.7s finished


47512.7025
('ids to consider', array([134574, 340105]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [81]:
outliervalue = 40000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 45000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 45000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    8.6s finished


42562.72875
('ids to consider', array([134574, 340105]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [82]:
outliervalue = 35000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 40000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 40000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.1s finished


37229.7827273
('ids to consider', array([  5062, 134574, 167575, 244173, 272417, 340105]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [83]:
outliervalue = 30000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 35000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 35000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   14.0s finished


31992.4851724
('ids to consider', array([  5062, 134574, 167575, 244173, 272417, 340105, 346331]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [75]:
outliervalue = 25000
# Binary target: 1 when the training loss is strictly above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] > outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 30000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 30000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   15.5s finished


26871.6395833
('ids to consider', array([  5062, 134574, 272417, 316492, 340105, 346331, 390735]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [84]:
outliervalue = 20000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 25000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 25000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   15.4s finished


21957.6616129
('ids to consider', array([ 13320,  67099, 100794, 120786, 134574, 148845, 173206, 221755,
       222683, 247475, 272417, 340105, 346237, 346331, 369669, 536291]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [85]:
outliervalue = 18000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 20000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 20000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   15.4s finished


18920.7969136
('ids to consider', array([ 13320,  54348,  63760,  67099,  76074,  91026, 100794, 120786,
       134574, 148845, 173206, 198598, 201541, 221755, 222683, 247475,
       248401, 255243, 270014, 272417, 316492, 317189, 320790, 340105,
       346331, 369669, 390735, 399937, 516196, 536291, 537199, 537393,
       583966]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [86]:
outliervalue = 16000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 18000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 18000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   18.3s finished


16866.2985185
('ids to consider', array([  5062,  13320,  43761,  50594,  52607,  54348,  63760,  67099,
        76074,  91026, 100794, 108707, 117377, 120499, 120786, 134574,
       135844, 147997, 148845, 177797, 179718, 198509, 201541, 221755,
       222683, 236220, 242548, 243938, 244173, 246138, 247475, 247717,
       248401, 255243, 270014, 272417, 310350, 313585, 316492, 317189,
       320790, 330451, 339415, 340105, 341360, 341757, 346331, 368866,
       369669, 378956, 390735, 399269, 399937, 407129, 412986, 415786,
       428558, 431351, 437160, 460474, 461160, 501940, 516196, 527816,
       536291, 536587, 537393, 545235, 572250, 577023, 578946, 586360]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [87]:
outliervalue = 15000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 16000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 16000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   15.4s finished


15477.2936789
('ids to consider', array([  5062,  13320,  22361,  27363,  43761,  50594,  50726,  52607,
        54348,  55510,  63760,  67099,  71409,  76074,  91026, 100794,
       104009, 108707, 113000, 120499, 120786, 127331, 134574, 135844,
       147997, 148845, 149299, 156677, 159346, 160838, 165557, 177797,
       193082, 198509, 198598, 201541, 208075, 215364, 221755, 222683,
       238518, 242548, 243938, 246138, 247717, 247734, 248401, 254629,
       255243, 261665, 267500, 270014, 275432, 281241, 281983, 295527,
       309706, 316492, 317189, 320790, 330451, 332506, 340105, 341360,
       341757, 346237, 346331, 346956, 367972, 368866, 369669, 378956,
       383082, 384581, 386468, 390735, 399269, 399937, 400255, 407129,
       412986, 414520, 415786, 426802, 428558, 431351, 431668, 437160,
       460474, 461160, 462125, 466920, 471024, 480967, 488678, 501940,
       502919, 516196, 520167, 527816, 536291, 536587, 537199, 537393,
       549891, 569316, 572250, 575000, 5770

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [88]:
outliervalue = 13000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 15000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 15000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   16.9s finished


13916.4069201
('ids to consider', array([  2458,   4705,   5062,   6385,   7210,   8410,  11748,  12376,
        13320,  14835,  17790,  21439,  22361,  24452,  27363,  27585,
        33069,  36221,  36612,  42126,  43761,  46085,  49760,  50594,
        50678,  50726,  50985,  52607,  54348,  55510,  60107,  60908,
        63760,  64463,  64770,  67099,  69260,  71409,  71989,  76074,
        81573,  84703,  87768,  91026,  93666, 100794, 101076, 102707,
       104009, 108642, 108707, 113000, 114583, 117322, 117377, 120499,
       120786, 126892, 127331, 130018, 130708, 134574, 135844, 137607,
       138307, 143787, 145094, 147997, 148845, 149299, 150384, 152520,
       155923, 156677, 158396, 159346, 165557, 168334, 170184, 176171,
       177797, 178636, 179718, 189333, 190045, 198509, 198598, 201541,
       204090, 208075, 208309, 210178, 211283, 215364, 215380, 215543,
       221755, 222683, 225066, 228101, 233718, 236220, 238518, 240039,
       242548, 242834, 242914, 243134, 2435

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [90]:
outliervalue = 12000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 13000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 13000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   15.1s finished


12478.8896287
('ids to consider', array([  2280,   2458,   4705,   5062,   7210,   8410,   9405,  10627,
        11729,  11748,  12201,  12376,  13320,  14835,  16238,  17790,
        19248,  21439,  22361,  23935,  24318,  24452,  27363,  27585,
        33069,  36221,  36612,  42126,  43761,  45448,  46085,  47114,
        47587,  48705,  50594,  50678,  50726,  50985,  51432,  52607,
        54348,  55510,  55591,  60107,  60908,  63760,  64463,  64654,
        64770,  65599,  67099,  68901,  69260,  70843,  71409,  71989,
        72222,  73026,  76074,  77594,  79506,  80856,  81573,  81937,
        82236,  82719,  84703,  87768,  91026,  92670,  93666,  94328,
        96280, 100794, 101076, 102707, 103259, 104009, 108066, 108642,
       108707, 113000, 114583, 116620, 117322, 117377, 120499, 120747,
       120786, 122917, 124891, 126892, 127331, 130018, 130708, 134574,
       135844, 137607, 137749, 138307, 138849, 143787, 144469, 145094,
       146530, 147997, 148845, 149173, 1492

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [91]:
outliervalue = 10000
# Binary target: 1 when the training loss is at or above the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were above the threshold.
train_y = (train['loss'] >= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between the threshold and 12000.
print(train.loc[(train['loss'] > outliervalue) & (train['loss'] < 12000), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   14.6s finished


10903.4878405
('ids to consider', array([   394,   1080,   2280, ..., 586216, 586360, 586395]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [97]:
outliervalue = 15
# Binary target: 1 when the training loss is at or below the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were below the threshold.
train_y = (train['loss'] <= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly below the threshold.
print(train.loc[(train['loss'] < outliervalue), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.8s finished


6.88666666667
('ids to consider', array([334750]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [99]:
outliervalue = 60
# Binary target: 1 when the training loss is at or below the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were below the threshold.
train_y = (train['loss'] <= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between 15 and the threshold.
print(train.loc[(train['loss'] < outliervalue) & (train['loss'] > 15), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   10.4s finished


36.7861538462
('ids to consider', array([ 58499, 395535]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


In [108]:
outliervalue = 700
# Binary target: 1 when the training loss is at or below the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were below the threshold.
train_y = (train['loss'] <= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between 60 and the threshold.
print(train.loc[(train['loss'] < outliervalue) & (train['loss'] > 60), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   15.1s finished


541.801244207
('ids to consider', array([ 19379,  75007, 186199, 220699, 324309, 367135, 378411, 382233,
       396037, 408842, 424707, 509320, 549979]))


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.2s finished


In [None]:
outliervalue = 100
# Binary target: 1 when the training loss is at or below the threshold, else 0.
# Cast directly so True is always class 1; pd.factorize numbers classes by
# first appearance and would swap them if the first row were below the threshold.
train_y = (train['loss'] <= outliervalue).astype(int).values
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50,
                               max_features=0.8, max_depth=8, oob_score=True, verbose=1)
model.fit(train_x[features], train_y)
preds = model.predict(test_x[features])
# Mean training loss strictly between 60 and the threshold.
print(train.loc[(train['loss'] < outliervalue) & (train['loss'] > 60), 'loss'].mean())
dd = pd.DataFrame({"id": test.id, 'pred': preds})
# Test ids the classifier assigns to class 1.
print('ids to consider', dd.id[dd.pred == 1].values)

In [2]:
n_folds = 10
cv_sum = 0
# Patience for early stopping. 50 is the value that was actually being passed
# to xgb.train; it is now defined once here and used below.
early_stopping = 50
fpred = []
xgb_rounds = []

# Rebuild the regression inputs in this cell. Earlier cells reuse the name
# train_y for binary classifier labels (a numpy array), which would break the
# .iloc slicing below and train on the wrong target.
train_y = np.log(train['loss'] + shift)
train_x = train.drop(['loss', 'id'], axis=1)
test_x = test.drop(['loss', 'id'], axis=1)
# Test matrix scored by every fold's model.
d_test = xgb.DMatrix(test_x)

# Booster settings are identical for every fold, so they are built once.
params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.03,
    'objective': 'reg:linear',
    'max_depth': 12,
    'min_child_weight': 100,
    'booster': 'gbtree'}

kf = KFold(train.shape[0], n_folds=n_folds)
for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d' % (i+1))
    X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index]

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]

    # The custom objective and metric override the built-in ones; training
    # stops once the eval metric has not improved for `early_stopping` rounds.
    clf = xgb.train(params,
                    d_train,
                    100000,
                    watchlist,
                    early_stopping_rounds=early_stopping,
                    obj=fair_obj,
                    feval=xg_eval_mae)

    xgb_rounds.append(clf.best_iteration)
    scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
    # `shift` cancels in the difference, so this equals the MAE on the
    # original loss scale.
    cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
    print('eval-MAE: %.6f' % cv_score)
    y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift

    # Running sum of the per-fold test predictions.
    if i > 0:
        fpred = pred + y_pred
    else:
        fpred = y_pred
    pred = fpred
    cv_sum = cv_sum + cv_score

# Average the test predictions and the validation score over the folds.
mpred = pred / n_folds
score = cv_sum / n_folds
print('Average eval-MAE: %.6f' % score)
n_rounds = int(np.mean(xgb_rounds))

print("Writing results")
result = pd.DataFrame(mpred, columns=['loss'])
# `ids` carries a 0-based index, so it lines up row-for-row with `result`.
result["id"] = ids
result = result.set_index("id")
print("%d-fold average prediction:" % n_folds)

now = datetime.now()
score = str(round((cv_sum / n_folds), 6))
# NOTE(review): the file name says '5fold' although n_folds is 10; left
# unchanged in case something matches on the existing name.
sub_file = 'xgb_6_submission_5fold-average-xgb_fairobj_' + str(score) + '_' + str(
    now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
print("Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')

udit
