In [0]:
# Authenticate the current Colab user. This triggers an interactive sign-in.
# NOTE(review): presumably this is what makes the application-default
# credentials available to the Drive setup cell below -- confirm.
from google.colab import auth
auth.authenticate_user()

In [0]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
import xgboost as xgb
import numpy
import os
import pandas
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from scipy import stats

In [0]:
# Data directory. The empty last component makes os.path.join append a trailing
# separator, so file names can be added by plain string concatenation.
base = os.path.join('my_drive', 'ML_Project', '')

# Single seed shared by NumPy's global RNG and the model grids defined later.
seed = 7
numpy.random.seed(seed)

In [0]:
# Load the whitespace-separated, header-less training features, training
# labels and test features.
# `sep=r"\s+"` is the documented equivalent of `delim_whitespace=True`, which is
# deprecated in current pandas releases; the regex form works on old and new
# versions alike.
X = pandas.read_csv(base+"X_train.txt", sep=r"\s+", header=None)
Y = pandas.read_csv(base+"Y_train.txt", sep=r"\s+", header=None)
X_test = pandas.read_csv(base+"X_test.txt", sep=r"\s+", header=None)

In [0]:
import copy  # still imported here: a later cell calls copy.deepcopy

# Keep untouched copies of the raw training frames before the cells below
# start modifying X and Y.
X_backup = X.copy(deep=True)
Y_backup = Y.copy(deep=True)

In [0]:
# Rename the label frame's first column to the integer 14 so it does not
# collide with feature column 0 once X and Y are concatenated, and so the later
# `merged[14]` lookup finds it.
# The previous in-place write into `Y.columns.values` mutated an Index (which
# pandas treats as immutable) and stored the *string* '14', relying on NumPy to
# coerce it to an int; on newer pandas that write is silently lost or rejected.
Y = Y.rename(columns={Y.columns[0]: 14})

In [0]:
# Put features and label side by side in one frame.
merged = pandas.concat(objs=[X, Y], axis=1)
# The feature matrix is the first 14 columns of the merged frame.
X = merged.loc[:, list(merged.columns[:14])]

In [0]:
# Binary label for the classifier: 1 where the raw label (column 14) is
# strictly positive, 0 otherwise (NaN compares False and therefore maps to 0).
merged['target'] = numpy.where(merged[14] > 0, 1, 0)

In [0]:
# Classifier that is fitted on the binary 'target' column.
xgb_model = xgb.XGBClassifier(verbose=1)

# Every entry holds a single value, so the "grid" has exactly one candidate and
# GridSearchCV effectively performs a 5-fold cross-validation of that setting.
parameters = dict(
    nthread=[4],                    # when use hyperthread, xgboost may become slower
    objective=['binary:logistic'],
    learning_rate=[0.05],           # so called `eta` value
    max_depth=[6],
    min_child_weight=[11],
    n_estimators=[30],              # number of trees, change it to 1000 for better results
    seed=[seed],
)
# Entries the author left disabled:
#   silent=[1], subsample=[0.8], colsample_bytree=[0.7], missing=[-999]

clf = GridSearchCV(
    xgb_model,
    parameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=5,
    refit=True,
    verbose=2,
)

In [0]:
# Convert the feature frame and the binary target to plain NumPy arrays.
# numpy.asarray accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely; the previous `X = X.values` raised AttributeError on a second
# execution because X had already been rebound to an ndarray.
X = numpy.asarray(X)
Y_train = numpy.asarray(merged['target'])

In [30]:
# Sanity check: display 10 randomly chosen rows of the feature matrix.
pandas.DataFrame(data=X).sample(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
94353,238.0,227.0,240.39,231.77,994.0,273.0,0.0,2.1059,6.8253,1.7253,3.7619,1.959,2.2388,0.0
24453,253.0,238.0,249.25,249.25,296.0,0.0,0.0,2.4404,3.3366,0.0,5.264,3.168,20.0,0.0
73613,234.0,228.0,241.46,232.61,209.0,59.0,0.0,4.4445,7.0692,2.1063,7.9526,2.2772,7.5729,8.4
72650,253.0,232.0,246.1,233.69,271.0,16.0,0.0,6.1636,5.2686,1.1022,7.131,2.7122,20.0,0.0
29065,243.0,238.0,247.44,247.44,1721.0,0.0,0.0,6.728,3.9723,0.0,3.7081,2.1488,20.0,19.1
91319,245.0,236.0,245.29,245.29,285.0,0.0,0.0,1.6046,4.823,0.0,6.8349,3.2673,20.0,0.0
76194,231.0,220.0,236.41,220.0,2442.0,1282.0,9.0,3.9999,8.2376,3.4726,5.3657,2.6669,2.1862,0.0
4937,241.0,232.0,244.08,233.45,766.0,43.0,0.0,2.01,5.1402,0.86813,3.6086,1.9472,4.2721,0.0
85607,253.0,242.0,248.13,248.13,62.0,0.0,0.0,5.771,3.6327,0.0,8.3566,6.1269,20.0,0.0
22907,232.0,218.0,236.91,219.38,2257.0,1097.0,52.0,1.172,10.654,4.2668,2.7943,2.7014,1.2573,0.5


In [31]:
# Sanity check: display 10 randomly chosen raw labels.
Y.sample(n=10)

Unnamed: 0,14
2982,0.5
48102,0.2
89032,0.0
73654,0.1
89258,0.1
89474,0.0
91839,37.4
17438,0.0
33854,0.1
63651,0.0


In [33]:
# Sanity check: display 10 randomly chosen entries of the binary target.
pandas.DataFrame(data=Y_train).sample(n=10)

Unnamed: 0,0
74188,0
53471,0
69200,0
29693,0
65792,0
99681,0
47727,1
32782,0
88221,0
86117,1


In [38]:
# Run the cross-validated search; `verbose=2` is forwarded to the underlying
# estimator's fit() as a fit parameter.
clf.fit(X, y=Y_train, verbose=2)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
[CV]  learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7, total=  34.3s
[CV]  learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7, total=  35.0s


[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   37.9s remaining:   56.8s


[CV]  learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7, total=  35.4s
[CV]  learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7, total=  36.0s
[CV]  learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7, total=  34.8s


[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   38.7s remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   38.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, verbose=1),
       fit_params=None, iid=True, n_jobs=5,
       param_grid={'nthread': [4], 'objective': ['binary:logistic'], 'learning_rate': [0.05], 'max_depth': [6], 'min_child_weight': [11], 'n_estimators': [30], 'seed': [7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [0]:
# Keep an untouched copy of the test features.
X_test_backup = X_test.copy(deep=True)

In [50]:
# Class predictions from the fitted search object on the same matrix X it was
# fitted on (i.e. in-sample predictions).
pred_cl = clf.predict(X=X)

  if diff:


In [0]:
# Store the classifier's predicted class next to the data; rows flagged here
# are the ones selected for the second model further down.
merged['regress'] = list(pred_cl)

In [0]:
# Plain-array view of the merged frame. Column order follows the order in which
# the columns were added: 0-13 features, 14 raw label, then 'target', then
# 'regress'.
bb = numpy.asarray(merged)

In [0]:
# Boolean mask of rows whose last column (index 16, the 'regress' prediction)
# equals 1. The bare `bb[:,16]` expression that used to precede this line was
# not the cell's final statement, so it displayed nothing and had no effect.
ix = bb[:, 16] == 1

In [0]:
# Keep only the rows the classifier predicted as positive (column 16 > 0).
predicted_positive = bb[:, 16] > 0
merged_regress = bb[predicted_positive]

In [0]:
X_regress = merged_regress[0:,:15]
Y_regress = merged_regress[0:,15:16]

In [0]:
xgb_regress_model = xgb.XGBRegressor(verbose=1)

parameters = {
              'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
#              'silent': [1],
#              'subsample': [0.8], 
#              'colsample_bytree': [0.7],
              'n_estimators': [30], #number of trees, change it to 1000 for better results
#              'missing':[-999],
              'seed': [seed]}

reg = GridSearchCV(xgb_regress_model, parameters, n_jobs=5, 
                   cv=5,
                   scoring='roc_auc',
                   verbose=2, refit=True)

In [0]:
# Run the cross-validated search for the second-stage model.
reg.fit(X_regress, y=Y_regress)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
[CV] learning_rate=0.05, max_depth=6, min_child_weight=11, n_estimators=30, nthread=4, objective=binary:logistic, seed=7 
