In [21]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

In [2]:
def random_forest(train_x,train_y,test_x,test_y):
    clf = RandomForestClassifier()
    clf.fit(train_x, train_y)
    prediction=clf.predict(test_x)
    print accuracy_score(test_y,prediction)
    print confusion_matrix(test_y,prediction)
    print matthews_corrcoef(test_y,prediction)
    print recall_score(test_y,prediction)
    print roc_auc_score(test_y,prediction)
    return clf

In [18]:
def xgboost(train_x, train_y, test_x, test_y):
    #clf=xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=0.01) 82%
    clf=xgb.XGBClassifier(learning_rate =0.015,max_depth=5,n_estimators=500,min_child_weight=0.7,gamma=0.2,subsample=0.8,colsample_bytree=0.8,objective= 'binary:logistic',n_jobs=4,scale_pos_weight=1,random_state=27)
    clf.fit(train_x, train_y)
    prediction = clf.predict(test_x)
    print accuracy_score(test_y,prediction)
    print confusion_matrix(test_y,prediction)
    print matthews_corrcoef(test_y,prediction)
    print recall_score(test_y,prediction)
    print roc_auc_score(test_y,prediction)
    return clf

In [25]:
def neural(train_x, train_y, test_x, test_y):
    clf=MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    clf.fit(train_x, train_y)
    prediction = clf.predict(test_x)
    print accuracy_score(test_y,prediction)
    print confusion_matrix(test_y,prediction)
    print matthews_corrcoef(test_y,prediction)
    print recall_score(test_y,prediction)
    print roc_auc_score(test_y,prediction)
    return clf

In [4]:
train_data=pd.read_csv('data/criminal_train.csv')
test_data=pd.read_csv('data/criminal_test.csv')
columns=list(train_data)

In [5]:
sm = SMOTE(random_state=12, ratio = 1.0)
x_res, y_res = sm.fit_sample(train_data[columns[1:-1]], train_data[columns[-1]])



In [6]:
#pca = PCA(n_components=60)
#comp_x = pca.fit(x_res).transform(x_res)

In [6]:
train_x, test_x, train_y, test_y =train_test_split(x_res, y_res, test_size=0.2, random_state=12)

In [8]:
#rf_clf=random_forest(train_x,train_y,test_x,test_y)

0.967387472088
[[8166  382]
 [ 173 8297]]
0.935062970396
0.979574970484
0.967443077193


In [19]:
#xg_clf=xgboost(train_x, train_y, test_x, test_y)

0.972969796686
[[8265  283]
 [ 177 8293]]
0.946014946371
0.979102715466
0.972997777948


In [23]:
ne_clf=neural(train_x, train_y, test_x, test_y)

0.764014572805
[[8134  414]
 [3602 4868]]
0.56878706656
0.574734356553
0.763150987354


In [20]:
#comp_test=pca.transform(test_data[columns[1:-1]])
test_df=test_data[columns[1:-1]]
test_df.columns=['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69']
res = xg_clf.predict(test_df)
submission = pd.DataFrame({ 'PERID': test_data['PERID'],'Criminal': res })
submission[['PERID','Criminal']].to_csv('sub.csv', index=False)