In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

In [2]:
from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV



In [3]:
train_data=pd.read_csv('data/criminal_train.csv')
test_data=pd.read_csv('data/criminal_test.csv')
columns=list(train_data)

In [5]:
sm = SMOTE(random_state=12, ratio = 1.0)
x_res, y_res = sm.fit_sample(train_data[columns[1:-1]], train_data[columns[-1]])
train_x, test_x, train_y, test_y =train_test_split(x_res, y_res, test_size=0.2, random_state=12)



In [6]:
xgb_model = xgb.XGBClassifier()

In [7]:
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}


clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=StratifiedKFold(train_y, n_folds=5, shuffle=True), 
                   scoring='roc_auc',
                   verbose=2, refit=True)

clf.fit(train_x, train_y)
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 
[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 




[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 




[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 
[CV] colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 




[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 -   1.2s
[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 -   1.3s
[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 -   1.4s


[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    1.6s remaining:    2.3s


[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 -   1.2s
[CV]  colsample_bytree=0.7, silent=1, missing=-999, learning_rate=0.05, nthread=4, min_child_weight=11, n_estimators=5, subsample=0.8, seed=1337, objective=binary:logistic, max_depth=6 -   1.4s


[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    1.8s finished


('Raw AUC score:', 0.9919616004150215)
colsample_bytree: 0.7
learning_rate: 0.05
max_depth: 6
min_child_weight: 11
missing: -999
n_estimators: 5
nthread: 4
objective: 'binary:logistic'
seed: 1337
silent: 1
subsample: 0.8


In [10]:
test_df=test_data[columns[1:-1]]
test_df.columns=['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69']
res = clf.predict(test_df)
submission = pd.DataFrame({ 'PERID': test_data['PERID'],'Criminal': res })
submission[['PERID','Criminal']].to_csv('sub.csv', index=False)