## First, run the code to load data and call the scorer function

In [5]:
#seems like the message is to use a parameter optimization method and then use stacking/blending


%timeit
import pandas as pd
import numpy as np
from sklearn.cross_validation import cross_val_score
from xgboost import XGBClassifier, XGBRegressor
from bayes_opt import BayesianOptimization
from sklearn import grid_search
from scipy.optimize import fmin_powell
from ml_metrics import quadratic_weighted_kappa
from sklearn.metrics import cohen_kappa_score, make_scorer

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import MiniBatchKMeans

from sknn.mlp import Classifier, Layer, Regressor



DATA_TRAIN_PATH = '/Users/patrickkennedy/Desktop/Project DATA/train.csv'
DATA_TEST_PATH = '/Users/patrickkennedy/Desktop/Project DATA/test.csv'

def eval_wrapper(yhat, y):  
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return quadratic_weighted_kappa(yhat, y)


def load_data(path_train = DATA_TRAIN_PATH, path_test = DATA_TEST_PATH):
    columns_to_drop = ['Id', 'Response']
    num_classes = 8

    train = pd.read_csv(path_train)
    test = pd.read_csv(path_test)

    # combine train and test
    all_data = train.append(test)
    all_data.fillna(-1, inplace=True)
    all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]
    all_data['Response'] = all_data['Response'].astype(int)

    train = all_data[all_data['Response']>0].copy()
    test = all_data[all_data['Response']<1].copy()
    train_labels = train["Response"]
    
    #train_labels = train_labels.reshape(-1,1) #need to reshape b/c one dim array
    
    train_ids = train["Id"].values
    test_ids = test["Id"].values

    return  train.drop(columns_to_drop, axis=1), \
            train_labels, \
            test.drop(columns_to_drop, axis=1),\
            train_ids, \
            test_ids




## Next, define the functions that test different algos

In [7]:
def xgblinearcv(max_depth, learning_rate, n_estimators, gamma, min_child_weight, max_delta_step, subsample,
              colsample_bytree, silent =True, nthread = -1, seed = 1234):
    
    #XGBClassifier only gets up to .47
    return cross_val_score(XGBRegressor(max_depth = int(max_depth), learning_rate = learning_rate,
                                         n_estimators = int(n_estimators), silent = silent,
                                         nthread = nthread, gamma = gamma, min_child_weight = min_child_weight,
                                         max_delta_step = max_delta_step, subsample = subsample,
                                         colsample_bytree = colsample_bytree, seed = seed,
                                         objective = 'reg:linear'),
                           train,
                           labels,
                           'mean_squared_error',
                           n_jobs=-1,
                           cv=3).mean()



def RFcv(n_estimators, max_depth, min_samples_split, min_samples_leaf, 
         min_weight_fraction_leaf, max_leaf_nodes=None, bootstrap=True, 
         oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None):
    
    return cross_val_score(RandomForestClassifier(n_estimators = int(n_estimators), criterion='gini',
                                                 max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                                 min_samples_leaf=int(min_samples_leaf),
                                                 min_weight_fraction_leaf=min_weight_fraction_leaf, max_features='auto',
                                                 max_leaf_nodes=max_leaf_nodes, bootstrap=bootstrap,
                                                 oob_score=oob_score, n_jobs=n_jobs, random_state=None, verbose=0,
                                                 warm_start=False, class_weight=None),
                           train,
                           labels,
                           'mean_squared_error',
                           n_jobs=-1,
                           cv=3).mean()




def kNNcv(n_neighbors):
    return cross_val_score(KNeighborsClassifier(n_neighbors=int(n_neighbors)),
                           train,
                           labels,
                           'mean_squared_error',
                           n_jobs=-1,
                           cv=3).mean()



def kMeanscv(n_clusters):
    return cross_val_score(MiniBatchKMeans(n_clusters = int(n_clusters)),
                          train,
                          labels,
                          'mean_squared_error',
                          n_jobs=-1,
                          cv=3).mean()



def xgbpoissoncv(max_depth, learning_rate, n_estimators, gamma, min_child_weight, max_delta_step, subsample,
              colsample_bytree, silent =True, nthread = -1, seed = 1234):
    
    #XGBClassifier only gets up to .47
    return cross_val_score(XGBRegressor(max_depth = int(max_depth), learning_rate = learning_rate,
                                         n_estimators = int(n_estimators), silent = silent,
                                         nthread = nthread, gamma = gamma, min_child_weight = min_child_weight,
                                         max_delta_step = max_delta_step, subsample = subsample,
                                         colsample_bytree = colsample_bytree, seed = seed,
                                         objective = 'count:poisson'),
                           train,
                           labels,
                           'mean_squared_error',
                           n_jobs=-1,
                           cv=3).mean()

def xgbmultisoftmaxcv(max_depth, learning_rate, n_estimators, gamma, min_child_weight, max_delta_step, subsample,
              colsample_bytree, silent =True, nthread = -1, seed = 1234):
    
    #XGBClassifier only gets up to .47
    return cross_val_score(XGBClassifier(max_depth = int(max_depth), learning_rate = learning_rate,
                                         n_estimators = int(n_estimators), silent = silent,
                                         nthread = nthread, gamma = gamma, min_child_weight = min_child_weight,
                                         max_delta_step = max_delta_step, subsample = subsample,
                                         colsample_bytree = colsample_bytree, seed = seed,
                                         objective = 'reg:linear'),  #will be reset to softmax
                           train,
                           labels,
                           'mean_squared_error',
                           n_jobs=-1,
                           cv=3).mean()



def NNcv(layers, learning_rate, n_iter):
    return cross_val_score(Classifier(layers=[Layer("Linear", units=100),
                                              Layer("Softmax")],
                                     learning_rate=learning_rate,
                                     n_iter=n_iter),
                           train,
                           labels,
                           'mean_squared_error',
                           n_jobs=-1,
                           cv=3).mean()




## Set the variables for the BO

In [13]:
xgblinearBO = BayesianOptimization(xgblinearcv,
                                 {'max_depth': (5, 12),
                                  'learning_rate': (0.001, 0.5),
                                  'n_estimators': (50, 1000),
                                  'gamma': (1., 0.01),
                                  'min_child_weight': (1, 100),
                                  'max_delta_step': (0, 10),
                                  'subsample': (0.01, 0.9),
                                  'colsample_bytree' :(0.5, 0.99)
                                 })

RFBO = BayesianOptimization(RFcv,
                           {'n_estimators' : (50,1000), 
                            'max_depth' : (5, 12), 
                            'min_samples_split' : (2, 10),             
                            'min_samples_leaf' : (1, 10),      
                            'min_weight_fraction_leaf' : (0.0, 0.5),        
                           })



kNNBO = BayesianOptimization(kNNcv,
                            {'n_neighbors' : (3, 1001)
                            })


kMeansBO = BayesianOptimization(kMeanscv,
                               {'n_clusters' : (3,1001)
                               })



xgbpoissonBO = BayesianOptimization(xgbpoissoncv,
                                 {'max_depth': (5, 12),
                                  'learning_rate': (0.001, 0.5),
                                  'n_estimators': (50, 1000),
                                  'gamma': (1., 0.01),
                                  'min_child_weight': (1, 100),
                                  'max_delta_step': (0, 10),
                                  'subsample': (0.01, 0.9),
                                  'colsample_bytree' :(0.5, 0.99)
                                 })

xgbmultisoftmaxBO = BayesianOptimization(xgbmultisoftmaxcv,
                                 {'max_depth': (5, 12),
                                  'learning_rate': (0.001, 0.5),
                                  'n_estimators': (50, 1000),
                                  'gamma': (1., 0.01),
                                  'min_child_weight': (1, 100),
                                  'max_delta_step': (0, 10),
                                  'subsample': (0.01, 0.9),
                                  'colsample_bytree' :(0.5, 0.99)
                                 })


NNBO = BayesianOptimization(NNcv,
                           {'learning_rate': (0.0001, 0.5),
                            'hidden0__units': (2, 200),
                            'hidden0__type': ["Linear", "Rectifier", "Sigmoid", "Tanh"],
                            'n_iter' : (5, 1000)})


## Last, run the optimization procedure

In [14]:
%%time
if __name__ == "__main__":
    # Load data set and target values
    train, labels, test, _, _ = load_data()
    scorer = make_scorer(cohen_kappa_score)
    
    
    
    NNBO.maximize()
    print('-'*53)
    print('Final Results')
    print('Neural Net (4 layer: lin, rec, lin, softmax): %f' % NNBO.res['max']['max_val'])
    
    
    
    
    #xgbpoissonBO.maximize()
    #print('-'*53)
    #print('Final Results')
    #print('XGB-Poisson: %f' % xgbpoissonBO.res['max']['max_val'])
    
    #xgbmultisoftmaxBO.maximize()
    #print('-'*53)
    #print('Final Results')
    #print('XGB-multisoftmax: %f' % xgbmultisoftmaxBO.res['max']['max_val'])
    
    
    
    
    
    
    #kMeansBO.maximize()
    #print('-'*53)
    #print('Final Results')
    #print('kMeans: %f' % kMeansBO.res['max']['max_val'])
    #print('-'*53)
    #print('-'*53) 
    
    #kNNBO.maximize()
    #print('-'*53)
    #print('Final Results')
    #print('kNN: %f' % kNNBO.res['max']['max_val'])
    #print('-'*53)
    #print('-'*53) 
    
    #RFBO.maximize()
    #print('-'*53)
    #print('Final Results')
    #print('Random Forest: %f' % RFBO.res['max']['max_val'])
    #print('-'*53)
    #print('-'*53)    
    
    #xgblinearBO.maximize()
    #print('-'*53)
    #print('Final Results')
    #print('XGBlinear: %f' % xgblinearBO.res['max']['max_val'])
    



#try again with data excepting response and id variables...
#seems like XGBRegressor may be a better fit, try that after XGBClassifier is done




ValueError: could not convert string to float: Linear

In [None]:
#RESULTS:
#1) Random Forest

#Optimization finished with maximum: -5.606285, at position: 
#{'n_estimators': 1000.0, 'min_samples_split': 2.0, 'min_weight_fraction_leaf': 0.0, 
#'max_depth': 12.0, 'min_samples_leaf': 1.0}.
#Time taken: 7 minutes and 59.378842 seconds.
#-----------------------------------------------------



#2) kNN

#Optimization finished with maximum: -9.324511, at position: {'n_neighbors': 10.490161708465756}.
#Time taken: 18 minutes and 37.048673 seconds.
#-----------------------------------------------------



#3) minibatch k means
#Optimization finished with maximum: -23.923574, at position: {'n_clusters': 13.378058163512708}.
#Time taken: 1 minutes and 4.024164 seconds.
#-----------------------------------------------------



#4) XGBClassifier ('multi:softmax')
#Optimization finished with maximum: -4.879299, at position: 
#{'colsample_bytree': 0.91539988110733117, 'learning_rate': 0.061404496828089954, 
#'max_delta_step': 7.4175422582480843, 'min_child_weight': 19.76322700303788, 
#'n_estimators': 358.89453967398725, 'subsample': 0.89312153200130462, 
#'max_depth': 5.1217667820189829, 'gamma': 0.041040228052564909}.
#Time taken: 451 minutes and 16.516589 seconds.
#-----------------------------------------------------


#5) XGBRegressor 'count:poisson'
#Optimization finished with maximum: -3.366464, at position: 
#{'colsample_bytree': 0.51169359165916362, 'learning_rate': 0.36748550147548548, 
#'max_delta_step': 3.7759341131094057, 'min_child_weight': 6.61613391710471, 
#'n_estimators': 623.62927525119301, 'subsample': 0.85582404890610575, 
#'max_depth': 11.777985571852648, 'gamma': 0.13481639738974682}.
#Time taken: 72 minutes and 2.069944 seconds.
#-----------------------------------------------------



#6) XGBRegressor 'reg:linear'
#Optimization finished with maximum: -3.390418, at position: 
#{'colsample_bytree': 0.53194584264551403, 'learning_rate': 0.027414206118598777, 
#'max_delta_step': 2.0801102305073238, 'min_child_weight': 7.4497497328569295, 
#'n_estimators': 829.49940166736644, 'subsample': 0.16906773086480992, 
#'max_depth': 6.1294423576531134, 'gamma': 0.88553320765501153}.
#Time taken: 67 minutes and 16.470244 seconds.
#-----------------------------------------------------


#7)  log / squares of each

#can i do feature importance? and then square those? or something...

#weight each prediction before blending... or double up some of the columns for xgboost?
#then given the blended predictions... find the best params for the new data set against training labels
#then put everything into the big model with the specified params and see the result






In [None]:
#For tomorrow: run through all the BO scripts to get best models -- still need some time
#Go through Udacity R course (1 section) -- check
#Install Linux on USB drive -- check! :-)
#Bonus: get auto-sklearn up and running
#Bonus bonus: get all see dave work pages up, loaded and submitted for another book at createspace



#build a script that runs through my algos with BO and gets the best params, then runs through those models
#with my existing classifier amalgam with those best params and see what pops out the other end
#note if it is any better than before?
#note next steps (i.e. correlated models, whatever else i need...)
#then use autosklearn (after installing linux) and see how well that places me in the competition
#next next steps... genetic algos? neural networks? what else? building my own auto learner? how to optimize?




#can i use BO with a neural network? -- Yes but what about manipulating the units component? or layers?
#also get Linux customized with installations (Anacondas, scikit, etc... also autolearn!)

#using Theano or Lasagna (because it has layers)...


In [18]:
#running the NN script in hyperopt to handle 


from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing

import numpy as np
import pandas as pd

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from ml_metrics import quadratic_weighted_kappa
from sknn.mlp import Classifier, Layer, Regressor




DATA_TRAIN_PATH = '/Users/patrickkennedy/Desktop/Project DATA/train.csv'
DATA_TEST_PATH = '/Users/patrickkennedy/Desktop/Project DATA/test.csv'


def eval_wrapper(yhat, y):  
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return quadratic_weighted_kappa(yhat, y)

def load_data(path_train = DATA_TRAIN_PATH, path_test = DATA_TEST_PATH):
    columns_to_drop = ['Id', 'Response']
    num_classes = 8

    train = pd.read_csv(path_train)
    test = pd.read_csv(path_test)

    # combine train and test
    all_data = train.append(test)
    all_data.fillna(-1, inplace=True)
    all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]
    all_data['Response'] = all_data['Response'].astype(int)

    train = all_data[all_data['Response']>0].copy()
    test = all_data[all_data['Response']<1].copy()
    train_labels = train["Response"]
    
    #train_labels = train_labels.reshape(-1,1) #need to reshape b/c one dim array
    
    train_ids = train["Id"].values
    test_ids = test["Id"].values

    return  train.drop(columns_to_drop, axis=1), \
            train_labels, \
            test.drop(columns_to_drop, axis=1),\
            train_ids, \
            test_ids
            

def write_submission(preds, output):
    sample = pd.read_csv('../data/sampleSubmission.csv')
    preds = pd.DataFrame(
        preds, index=sample.id.values, columns=sample.columns[1:])
    preds.to_csv(output, index_label='id')


def score(params):
    #change this around and try it with cross_val_score? so that i can have some cross validation?
    print "Training with params : "
    print params
    #num_round = int(params['n_estimators'])
    #del params['n_estimators']
    #dtrain = xgb.DMatrix(X_train, label=y_train)
    #dvalid = xgb.DMatrix(X_test, label=y_test)
    # watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    #model = xgb.train(params, dtrain, num_round)
    #predictions = model.predict(dvalid)
    
    model = Classifier(params)
    predictions = model.predict(X_test)
    
    
    score = -eval_wrapper(predictions, y_test)
    print "\tScore {0}\n\n".format(score)
    return {'loss': score, 'status': STATUS_OK}


def optimize(trials):
    space = {
            'learning_rate': (0.0001, 0.5),
            'hidden0__units': (2, 200),
            'hidden0__type': ["Linear", "Rectifier", "Sigmoid", "Tanh"],
            'n_iter' : (5, 1000)
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print best


X, y, _, _, _ = load_data()
print "Splitting data into train and valid ...\n\n"
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)

#Trials object where the history of search will be stored
trials = Trials()

optimize(trials)

Splitting data into train and valid ...


Training with params : 
{'hidden0__units': (2, 200), 'learning_rate': (0.0001, 0.5), 'hidden0__type': ('Linear', 'Rectifier', 'Sigmoid', 'Tanh'), 'n_iter': (5, 1000)}


AssertionError: Specify each layer as an instance of a `sknn.mlp.Layer` object.

In [32]:
#trying with randomsearchcv

#using randomizedsearchCV takes much less time
#try using MOE? how?

#begin developing structure of the presentation and beef up on what the hell XGBoost, QWK, fmin_powell, and the rest are doing


from sklearn.grid_search import RandomizedSearchCV
import numpy as np
import pandas as pd
from scipy.stats import randint
from sknn.mlp import Classifier, Layer, Regressor
from time import time
from operator import itemgetter
from sklearn.preprocessing import MinMaxScaler



def load_data(path_train = DATA_TRAIN_PATH, path_test = DATA_TEST_PATH):
    columns_to_drop = ['Id', 'Response']
    num_classes = 8

    train = pd.read_csv(path_train)
    test = pd.read_csv(path_test)

    # combine train and test
    all_data = train.append(test)
    all_data.fillna(-1, inplace=True)
    all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]
    all_data['Response'] = all_data['Response'].astype(int)

    train = all_data[all_data['Response']>0].copy()
    test = all_data[all_data['Response']<1].copy()
    train_labels = train["Response"]
    
    #train_labels = train_labels.reshape(-1,1) #need to reshape b/c one dim array
    
    train_ids = train["Id"].values
    test_ids = test["Id"].values

    return  train.drop(columns_to_drop, axis=1), \
            train_labels, \
            test.drop(columns_to_drop, axis=1),\
            train_ids, \
            test_ids
            
            

#using something other than randint... we don't want to round numbers here (for some cases)

param_dist = {'learning_rate': [0.3, 0.1, 0.05, ], #0.01, 0.005, 0.001, 0.0005, 0.0001],
              'hidden0__units': [4, 8, 12], #, 25, 50, 100, 200],
              'hidden0__type': ["Rectifier", "Sigmoid", "Tanh"],
              'hidden1__units' : [4, 8, 12],
              'hidden1__type': ['Rectifier', 'Sigmoid', 'Tanh']
             }

    
    
clf = Classifier(layers=[Layer("Rectifier", units=100),
                         Layer("Rectifier", units=100),
                         Layer("Softmax")],
                 learning_rate=0.01,
                 n_iter=10)





# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")
        

        
        
        
        
        
X, y, _, _, _ = load_data()
print "Splitting data into train and valid ...\n\n"
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)


X_train_minmax = MinMaxScaler().fit_transform(X_train)

        
# run randomized search
n_iter_search = 3
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=3)

start = time()


random_search.fit(X_train_minmax, y_train)

#clf.fit(X_train_minmax, y_train)


print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)





Splitting data into train and valid ...


RandomizedSearchCV took 256.84 seconds for 3 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.475 (std: 0.017)
Parameters: {'hidden1__type': 'Sigmoid', 'learning_rate': 0.05, 'hidden0__type': 'Rectifier', 'hidden1__units': 12, 'hidden0__units': 8}

Model with rank: 2
Mean validation score: 0.418 (std: 0.022)
Parameters: {'hidden1__type': 'Tanh', 'learning_rate': 0.1, 'hidden0__type': 'Tanh', 'hidden1__units': 4, 'hidden0__units': 4}

Model with rank: 3
Mean validation score: 0.328 (std: 0.000)
Parameters: {'hidden1__type': 'Sigmoid', 'learning_rate': 0.3, 'hidden0__type': 'Rectifier', 'hidden1__units': 4, 'hidden0__units': 4}

