In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import xgboost as xgb


%matplotlib inline
plt.style.use('ggplot')

In [2]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv").drop("Unnamed: 0", axis=1)
train_target = pd.read_csv("train_target.csv")

print("Shapes of data:", train_data.shape, test_data.shape, train_target.shape)
print("Proportion train/test: ", int(train_data.shape[0]/test_data.shape[0]))

Shapes of data: (27594, 20) (13593, 20) (27594, 1)
Proportion train/test:  2


### Preprocessing

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from pandas import get_dummies

In [4]:
le = LabelEncoder()
train_data = train_data.apply(le.fit_transform)


In [5]:
train_data = pd.get_dummies(train_data)

In [6]:
train_data, train_target = np.array(train_data), np.array(train_target)
c, r = train_target.shape
train_target = train_target.reshape(c,)

### XGBoost

In [7]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

In [235]:
def xgb_grid_search(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=442)

    gbm_params = {"n_estimators": [100, 150, 200, 300, 350], 
                  "max_depth" : [3, 5, 7, 10], 
                  "reg_lambda" : [1, 2, 3, 5]}

    print("GridSeachCV proceeding...")
    gbm = GridSearchCV(xgb.XGBClassifier(), gbm_params, n_jobs=3, cv=3, verbose=2)
    gbm.fit(X_train, y_train)
    print("Done.")
    
    print(101*"="+ "\nBEST PARAMETERS: ", gbm.best_params_, "\n"+101*"="+"\n")
    predictions = gbm.best_estimator_.predict(X_test)
    
    
    print("==================\nACCURACY : %.4g\n==================" % accuracy_score(y_test, predictions))
    return pd.Seriespredictions
    

In [236]:
xgb_grid_search(train_data, train_target)

GridSeachCV proceeding...
Fitting 3 folds for each of 80 candidates, totalling 240 fits
[CV] reg_lambda=1, n_estimators=100, max_depth=3 .....................
[CV] reg_lambda=1, n_estimators=100, max_depth=3 .....................
[CV] reg_lambda=1, n_estimators=100, max_depth=3 .....................
[CV] ............ reg_lambda=1, n_estimators=100, max_depth=3 -   5.4s
[CV] reg_lambda=2, n_estimators=100, max_depth=3 .....................
[CV] ............ reg_lambda=1, n_estimators=100, max_depth=3 -   5.6s
[CV] reg_lambda=2, n_estimators=100, max_depth=3 .....................
[CV] ............ reg_lambda=1, n_estimators=100, max_depth=3 -   6.2s
[CV] reg_lambda=2, n_estimators=100, max_depth=3 .....................
[CV] ............ reg_lambda=2, n_estimators=100, max_depth=3 -   5.4s
[CV] reg_lambda=3, n_estimators=100, max_depth=3 .....................
[CV] ............ reg_lambda=2, n_estimators=100, max_depth=3 -   4.7s
[CV] reg_lambda=3, n_estimators=100, max_depth=3 ...........

[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  2.1min


[CV] ............ reg_lambda=5, n_estimators=200, max_depth=3 -  18.2s
[CV] reg_lambda=1, n_estimators=300, max_depth=3 .....................
[CV] ............ reg_lambda=1, n_estimators=300, max_depth=3 -  21.3s
[CV] reg_lambda=2, n_estimators=300, max_depth=3 .....................
[CV] ............ reg_lambda=1, n_estimators=300, max_depth=3 -  21.1s
[CV] reg_lambda=2, n_estimators=300, max_depth=3 .....................
[CV] ............ reg_lambda=1, n_estimators=300, max_depth=3 -  21.9s
[CV] reg_lambda=2, n_estimators=300, max_depth=3 .....................
[CV] ............ reg_lambda=2, n_estimators=300, max_depth=3 -  23.1s
[CV] reg_lambda=3, n_estimators=300, max_depth=3 .....................
[CV] ............ reg_lambda=2, n_estimators=300, max_depth=3 -  23.5s
[CV] reg_lambda=3, n_estimators=300, max_depth=3 .....................
[CV] ............ reg_lambda=2, n_estimators=300, max_depth=3 -  23.9s
[CV] reg_lambda=3, n_estimators=300, max_depth=3 .....................
[CV] .

[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed: 19.3min


[CV] ............ reg_lambda=1, n_estimators=300, max_depth=7 -  52.5s
[CV] reg_lambda=2, n_estimators=300, max_depth=7 .....................
[CV] ............ reg_lambda=1, n_estimators=300, max_depth=7 -  53.6s
[CV] reg_lambda=2, n_estimators=300, max_depth=7 .....................
[CV] ............ reg_lambda=1, n_estimators=300, max_depth=7 -  54.0s
[CV] reg_lambda=2, n_estimators=300, max_depth=7 .....................
[CV] ............ reg_lambda=2, n_estimators=300, max_depth=7 -  53.2s
[CV] reg_lambda=3, n_estimators=300, max_depth=7 .....................
[CV] ............ reg_lambda=2, n_estimators=300, max_depth=7 -  55.4s
[CV] reg_lambda=3, n_estimators=300, max_depth=7 .....................
[CV] ............ reg_lambda=2, n_estimators=300, max_depth=7 -  56.8s
[CV] reg_lambda=3, n_estimators=300, max_depth=7 .....................
[CV] ............ reg_lambda=3, n_estimators=300, max_depth=7 -  52.4s
[CV] reg_lambda=5, n_estimators=300, max_depth=7 .....................
[CV] .

[Parallel(n_jobs=3)]: Done 240 out of 240 | elapsed: 45.2min finished


Done.
BEST PARAMETERS:  {'reg_lambda': 1, 'n_estimators': 100, 'max_depth': 3} 

ACCURACY : 0.8893


AttributeError: module 'pandas' has no attribute 'Seriespredictions'