Here is a simple SVM using Stochastic Gradient Descent with `loss = 'hinge'`. Probabilities were not used in the prediction and transactions were predicted with high accuracy at the expense of false positives.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
print(os.listdir("../input"))

['train.csv', 'sample_submission.csv', 'test.csv']


In [2]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [3]:
X = pd.read_csv("../input/train.csv")
print('\n shape of raw training:', X.shape)

known_ids = X['ID_code']
y = X['target']
X = X.drop(['target', 'ID_code'], axis=1).values
print('\n shape of ids:', known_ids.shape)
print('\n shape of labels:', y.shape)
print('\n shape of training data:', X.shape)
print("\n train data loaded!")


 shape of raw training: (200000, 202)

 shape of ids: (200000,)

 shape of labels: (200000,)

 shape of training data: (200000, 200)

 train data loaded!


In [4]:
def my_cv(X, y, model, folds=5, rand_st=1):
    scores = []
    for i in range(1,folds+1):
        print('\n fold: ', i)
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=1/folds, 
                                                            random_state=i+rand_st)
        # scaler = MinMaxScaler(feature_range=(-1,1))
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        model.fit(X_train, y_train)
        model_pred = model.predict(X_test)
        # model_pred_proba =model.predict_proba(X_test)
        scores.append(roc_auc_score(model_pred,y_test))
        if i == 1:
            conf_mat = confusion_matrix(y_test, model_pred)
        else:
            conf_mat += confusion_matrix(y_test, model_pred)
    print('\n',scores,'\n',conf_mat)
    return scores, conf_mat  

In [5]:
model = SGDClassifier(loss='hinge',
                      class_weight='balanced', 
                      penalty = 'l1',
                      l1_ratio = 0.7,
                      alpha = 5e-4, 
                      max_iter = 1000,
                      early_stopping=True,
                      tol = 1e-3,
                      n_iter_no_change = 10,
                      verbose=1)
scores, conf_mat = my_cv(X, y, model, folds=5)
print('class 0 accuracy: ', conf_mat[0,0]/sum(conf_mat[0,]))
print('class 1 accuracy: ', conf_mat[1,1]/sum(conf_mat[1,]))


 fold:  1
-- Epoch 1
Norm: 335.87, NNZs: 105, Bias: -1.805509, T: 144000, Avg. loss: 19.225213
Total training time: 0.29 seconds.
-- Epoch 2
Norm: 337.92, NNZs: 108, Bias: -1.503145, T: 288000, Avg. loss: 0.872773
Total training time: 0.71 seconds.
-- Epoch 3
Norm: 338.60, NNZs: 137, Bias: -1.186787, T: 432000, Avg. loss: 0.711855
Total training time: 1.14 seconds.
-- Epoch 4
Norm: 338.91, NNZs: 130, Bias: -1.067306, T: 576000, Avg. loss: 0.647966
Total training time: 1.57 seconds.
-- Epoch 5
Norm: 339.09, NNZs: 132, Bias: -1.089144, T: 720000, Avg. loss: 0.607797
Total training time: 2.01 seconds.
-- Epoch 6
Norm: 339.21, NNZs: 127, Bias: -0.918743, T: 864000, Avg. loss: 0.584571
Total training time: 2.45 seconds.
-- Epoch 7
Norm: 339.29, NNZs: 142, Bias: -1.015596, T: 1008000, Avg. loss: 0.562916
Total training time: 2.88 seconds.
-- Epoch 8
Norm: 339.35, NNZs: 148, Bias: -0.972852, T: 1152000, Avg. loss: 0.553626
Total training time: 3.31 seconds.
-- Epoch 9
Norm: 339.40, NNZs: 145

In [6]:
print(scores)
print('\n', sum(scores)/len(scores))
print('\n', conf_mat)

[0.6125586358103912, 0.6092244591528602, 0.6138450976905905, 0.6137513602181197, 0.6111098781785751]

 0.6120978862101073

 [[135022  44839]
 [  4618  15521]]


In [7]:
test = pd.read_csv('../input/test.csv')
submit = test[['ID_code']]
X_test = test.drop(columns=['ID_code'])

scaler = MinMaxScaler(feature_range=(-1,1))
scaler.fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

model.fit(X,y)
preds = model.predict(X_test)

submit['target'] = preds
print(submit.head)

-- Epoch 1
Norm: 178.30, NNZs: 150, Bias: -2.462789, T: 180000, Avg. loss: 1.690845
Total training time: 0.41 seconds.
-- Epoch 2
Norm: 178.67, NNZs: 158, Bias: -2.334855, T: 360000, Avg. loss: 0.500543
Total training time: 0.93 seconds.
-- Epoch 3
Norm: 178.83, NNZs: 165, Bias: -2.314971, T: 540000, Avg. loss: 0.482378
Total training time: 1.45 seconds.
-- Epoch 4
Norm: 178.93, NNZs: 169, Bias: -2.311544, T: 720000, Avg. loss: 0.475773
Total training time: 1.96 seconds.
-- Epoch 5
Norm: 179.00, NNZs: 175, Bias: -2.182256, T: 900000, Avg. loss: 0.469999
Total training time: 2.48 seconds.
-- Epoch 6
Norm: 179.05, NNZs: 170, Bias: -2.275657, T: 1080000, Avg. loss: 0.468195
Total training time: 2.96 seconds.
-- Epoch 7
Norm: 179.10, NNZs: 171, Bias: -2.197871, T: 1260000, Avg. loss: 0.466247
Total training time: 3.46 seconds.
-- Epoch 8
Norm: 179.14, NNZs: 173, Bias: -2.237754, T: 1440000, Avg. loss: 0.468074
Total training time: 3.96 seconds.
-- Epoch 9
Norm: 179.17, NNZs: 175, Bias: -2.

In [None]:
submit.to_csv('sgdclassifier.csv', index=False)