In [46]:
__author__ = 'Vladimir Iglovikov'
import os
os.chdir("/home/udit/ipython/notebook/all/input")
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error
from scipy import stats
train = pd.read_csv('../input/train.csv.zip')
test = pd.read_csv('../input/test.csv.zip')
test['loss'] = np.nan
joined = pd.concat([train, test])

# function - absolute of mean shifted data (which will be later used in function transformer)
def abs_mean_shift(data):
    return np.abs(data - np.mean(data))

def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    con = 2
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess 

def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))

for column in list(train.select_dtypes(include=['object']).columns):
    if train[column].nunique() != test[column].nunique():
        set_train = set(train[column].unique())
        set_test = set(test[column].unique())
        remove_train = set_train - set_test
        remove_test = set_test - set_train

        remove = remove_train.union(remove_test)
        def filter_cat(x):
            if x in remove:
                return np.nan
            return x

        joined[column] = joined[column].apply(lambda x: filter_cat(x), 1)
    joined[column] = pd.factorize(joined[column].values, sort=True)[0]
    


In [53]:
from sklearn import preprocessing

def ceate_feature_map(features,featuremap_File):
    outfile = open(featuremap_File, 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1

    outfile.close()
def plot_feature(bst,featuremap_File):
    importance = bst.get_fscore(fmap=featuremap_File)
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    plt.figure()
    df.plot()
    df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance') 
    return df.sort_values(by='fscore',ascending=False)

def train_Val(params,train,target_labels,boosting_rounds=40,print_every_n=10):
    X_train, X_test, y_train, y_test = train_test_split(train, target_labels, test_size=0.30, random_state=42)
    dtrain = xgb.DMatrix(X_train, label =y_train)
    dtest = xgb.DMatrix(X_test, label =y_test)
    evallist  = [(dtest,'eval'), (dtrain,'train')]
    bst =xgb.train(params,dtrain,num_boost_round=boosting_rounds,evals=evallist,early_stopping_rounds=50,verbose_eval=print_every_n, obj=logregobj, feval=evalerror)
    return bst,X_test,y_test

In [48]:
poly=preprocessing.PolynomialFeatures(2)
cat_feature = [n for n in joined.columns if n.startswith('cat')]    
cont_feature = [n for n in joined.columns if n.startswith('cont')] 

poly_features=poly.fit_transform(joined[cont_feature])
poly_features=pd.DataFrame(poly_features)
poly_features.columns=['polly_'+str(col) for col in poly_features.columns]
joined=joined.drop(cont_feature,axis=1)
joined=pd.concat([joined,poly_features])

In [54]:
from sklearn.cross_validation import train_test_split

train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]

shift = 200
y = np.log(train['loss'] + shift)
ids = test['id']
X = train.drop(['loss', 'id'], 1)
X_test = test.drop(['loss', 'id'], 1)

RANDOM_STATE = 2016
params = {
    'min_child_weight': 1,
    'eta': 0.1,
    'colsample_bytree': 0.5,
    'max_depth': 12,
    'subsample': 0.8,
    'alpha': 1,
    'gamma': 1,
    'verbose_eval': True,
    'seed': RANDOM_STATE
}
bst,X_test,y_test=train_Val(params,train.values,y,boosting_rounds=100,print_every_n=10)
Dtest=xgb.DMatrix(X_test)
pred=bst.predict(Dtest)
mean_absolute_error(np.expm1(y_test-shift),np.expm1(pred-shift))

[0]	eval-mae:3182.05	train-mae:3174.35
Multiple eval metrics have been passed: 'train-mae' will be used for early stopping.

Will train until train-mae hasn't improved in 50 rounds.
[10]	eval-mae:1369.16	train-mae:1365.43
[20]	eval-mae:497.527	train-mae:495.853
[30]	eval-mae:190.85	train-mae:189.905
[40]	eval-mae:93.6419	train-mae:92.8421
[50]	eval-mae:70.021	train-mae:69.3996
[60]	eval-mae:62.5243	train-mae:62.0543
[70]	eval-mae:61.0504	train-mae:60.6206
[80]	eval-mae:57.9304	train-mae:57.5783
[90]	eval-mae:57.0985	train-mae:56.7847


57.080712325258993

In [55]:
mean_absolute_error(np.expm1(y_test-shift),np.expm1(pred-shift))

0.0

In [None]:
train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]

shift = 200
y = np.log(train['loss'] + shift)
ids = test['id']
X = train.drop(['loss', 'id'], 1)
X_test = test.drop(['loss', 'id'], 1)

RANDOM_STATE = 2016
params = {
    'min_child_weight': 1,
    'eta': 0.01,
    'colsample_bytree': 0.5,
    'max_depth': 12,
    'subsample': 0.8,
    'alpha': 1,
    'gamma': 1,
    'silent': 1,
    'verbose_eval': True,
    'seed': RANDOM_STATE
}

xgtrain = xgb.DMatrix(X, label=y)
xgtest = xgb.DMatrix(X_test)

model = xgb.train(params, xgtrain, int(2012 / 0.9), feval=evalerror)

prediction = np.exp(model.predict(xgtest)) - shift

submission = pd.DataFrame()
submission['loss'] = prediction
submission['id'] = ids
submission.to_csv('sub_v_6.0.csv', index=False)

In [None]:
plt.close()
from matplotlib import rcParams
rcParams['figure.figsize'] = 16, 6
featuremap_File="fet.map"
ceate_feature_map(feature,featuremap_File)
features_imp=plot_feature(bst,featuremap_File)