In [4]:
import time
t0 = time.clock()

import pandas as pd 
import numpy as np 
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from scipy.optimize import fmin_powell
from ml_metrics import quadratic_weighted_kappa
from sklearn import grid_search

def eval_wrapper(yhat, y):  
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return quadratic_weighted_kappa(yhat, y)
    
def get_params():
    
    params = {}
    params["objective"] = "reg:linear"     
    params["eta"] = 0.038968208713327845
    params["gamma"] = 0.83745926973096763
    params["max_delta_step"] = 6.6502581223657851
    params["min_child_weight"] = 50.31287747915195
    params["subsample"] = 0.89026862324253775
    params["colsample_bytree"] = 0.86459387506389063
    params["silent"] = 1
    params["max_depth"] = 10
    plst = list(params.items())
    return plst
    
def apply_offset(data, bin_offset, sv, scorer=eval_wrapper):
    # data has the format of pred=0, offset_pred=1, labels=2 in the first dim
    data[1, data[0].astype(int)==sv] = data[0, data[0].astype(int)==sv] + bin_offset
    score = scorer(data[1], data[2])
    return score


# global variables
columns_to_drop = ['Id', 'Response']
xgb_num_rounds = 500 #5000 gives me good score (~15 min), 10000 (~32 min) (no improvement)
num_classes = 8

print("Load the data using pandas")

DATA_TRAIN_PATH = '/Users/patrickkennedy/Desktop/Project DATA/train.csv'
DATA_TEST_PATH = '/Users/patrickkennedy/Desktop/Project DATA/test.csv'


train = pd.read_csv(DATA_TRAIN_PATH)
test = pd.read_csv(DATA_TEST_PATH)

# combine train and test
all_data = train.append(test)
print('Eliminate missing values')    
# Use -1 for any others
all_data.fillna(-1, inplace=True)

all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]

# fix the dtype on the label column
all_data['Response'] = all_data['Response'].astype(int)

# Provide split column
#all_data_new['Split'] = np.random.randint(5, size=all_data_new.shape[0])

# split train and test
train = all_data[all_data['Response']>0].copy()
test = all_data[all_data['Response']<1].copy()

#y = train['Response']
#y_test = test['Response']
#train = train.drop(columns_to_drop, axis=1)
#test = test.drop(columns_to_drop, axis=1)
    
# convert data to xgb data structure
xgtrain = xgb.DMatrix(train.drop(columns_to_drop, axis=1), train['Response'].values)
xgtest = xgb.DMatrix(test.drop(columns_to_drop, axis=1), label=test['Response'].values)    

# get the parameters for xgboost
plst = get_params()
print plst      


Load the data using pandas
Eliminate missing values
[('colsample_bytree', 0.8645938750638906), ('silent', 1), ('max_delta_step', 6.650258122365785), ('min_child_weight', 50.31287747915195), ('subsample', 0.8902686232425377), ('eta', 0.038968208713327845), ('objective', 'reg:linear'), ('max_depth', 10), ('gamma', 0.8374592697309676)]


In [5]:
model = xgb.train(plst, xgtrain, xgb_num_rounds) 

In [6]:
# get preds
train_preds = model.predict(xgtrain, ntree_limit=model.best_iteration)
print('Train score is:', eval_wrapper(train_preds, train['Response'].values)) 
test_preds = model.predict(xgtest, ntree_limit=model.best_iteration)


#where to digitize? here instead of the clipping?... by digitizing we get a better first guess? i guess...

bins = np.array([0.0, 1.0, 2.5, 3.2, 3.6, 4.0, 10.0])  #change these bins and test the result, juke up the stats
                                                       #is there a way to use an optimization function for these?

train_preds = np.clip(train_preds, -0.99, 8.99)
test_preds = np.clip(test_preds, -0.99, 8.99)

# train offsets 
#or here in the offset function?
offsets = np.ones(num_classes) * -0.5
offset_train_preds = np.vstack((train_preds, train_preds, train['Response'].values))

for j in range(num_classes):
    train_offset = lambda x: -apply_offset(offset_train_preds, x, j)
    offsets[j] = fmin_powell(train_offset, offsets[j])  


# apply offsets to test
data = np.vstack((test_preds, test_preds, test['Response'].values))
for j in range(num_classes):
    data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j]    
final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)



preds_out = pd.DataFrame({"Id": test['Id'].values, "Response": final_test_preds})
preds_out = preds_out.set_index('Id')
preds_out.to_csv('xgb_offset_submission.csv')

print time.clock()-t0

('Train score is:', 0.748203457801927)
Optimization terminated successfully.
         Current function value: -0.748203
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.752895
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.768901
         Iterations: 2
         Function evaluations: 40
Optimization terminated successfully.
         Current function value: -0.781453
         Iterations: 2
         Function evaluations: 72
Optimization terminated successfully.
         Current function value: -0.784528
         Iterations: 2
         Function evaluations: 93
Optimization terminated successfully.
         Current function value: -0.784659
         Iterations: 2
         Function evaluations: 51
Optimization terminated successfully.
         Current function value: -0.786547
         Iterations: 2
         Function evaluations

In [None]:
# train offsets 
#set initial guess for offsets at 0
offsets = np.array(range(num_classes))*0

#stack the data in three cols, apply offsets to data[1] given member of particular class (daat[0].astype(int)==j)
data = np.vstack((train_preds, train_preds, train['Response'].values))
for j in range(num_classes):
    data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j] 

#apply fmin_powell to each class in order 1,2,3,4,5,6,7,8    
for j in range(num_classes):
    train_offset = lambda x: -apply_offset(data, x, j)
    offsets[j] = fmin_powell(train_offset, offsets[j])  


# apply offsets to test
data = np.vstack((test_preds, test_preds, test['Response'].values))
for j in range(num_classes):
    data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j] 

    
#force structure of final predictions to be in range 1-8
final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)




In [12]:
np.array(range(8))*0

array([0, 0, 0, 0, 0, 0, 0, 0])

In [18]:
import itertools
dum = [x for x in itertools.permutations([0, 1, 2, 3, 4, 5, 6, 7])]

In [19]:
len(dum)

40320

In [20]:
56*30*24

40320