In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from datetime import datetime
from sklearn.svm import LinearSVC
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [25]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since one was started.

    Parameters
    ----------
    start_time : datetime.datetime or None
        When falsy (the default), a new stopwatch is started.
        Otherwise it is the value previously returned by ``timer()``.

    Returns
    -------
    datetime.datetime or None
        The current time when starting a stopwatch; ``None`` when reporting,
        in which case the elapsed time is printed to stdout instead.
    """
    if not start_time:
        return datetime.now()

    elapsed_sec = (datetime.now() - start_time).total_seconds()
    thour, temp_sec = divmod(elapsed_sec, 3600)
    tmin, tsec = divmod(temp_sec, 60)
    # %i truncates the float quotients from divmod to whole hours/minutes.
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [26]:
# Default CSV locations used by load_data(): (training file, test file).
DATA_TRAIN_PATH, DATA_TEST_PATH = './train.csv', './test.csv'

In [27]:
def load_data(path_train = DATA_TRAIN_PATH, path_test = DATA_TEST_PATH):
    """Read the train and test CSV files and split them into arrays.

    Both files must contain an ``ID_code`` column; the train file must also
    contain a ``target`` column. The shape of each raw frame is printed.

    Parameters
    ----------
    path_train : path of the training CSV.
    path_test : path of the test CSV.

    Returns
    -------
    tuple
        ``(train, train_labels, test, train_ids, test_ids)`` where ``train``
        and ``test`` are the remaining columns as NumPy arrays, and the labels
        and ids are the ``target`` / ``ID_code`` columns as NumPy arrays.
    """
    train_frame = pd.read_csv(path_train)
    print('\n Shape of train data:', train_frame.shape)
    labels = train_frame['target'].values
    ids_train = train_frame['ID_code'].values
    # Everything that is neither the label nor the identifier is a feature.
    features_train = train_frame.drop(columns=['target', 'ID_code']).values

    test_frame = pd.read_csv(path_test)
    print('\n Shape of test data:', test_frame.shape)
    ids_test = test_frame['ID_code'].values
    features_test = test_frame.drop(columns=['ID_code']).values

    return features_train, labels, features_test, ids_train, ids_test

In [28]:
# Start the stopwatch for the whole run.
starttime = timer()
# Feature matrices, labels and ID columns for both splits.
X_train, y_train, X_test, train_ids, test_ids = load_data()
# One consecutive integer per training row (0 .. n_train - 1).
seq_ids = np.arange(len(train_ids))


 Shape of train data: (200000, 202)

 Shape of test data: (200000, 201)


In [29]:
# --- Cross-validation bookkeeping ---
folds = 5            # number of outer CV splits
cv_sum = 0           # running sum of per-fold AUCs
pred, fpred = [], [] # accumulators for test-set predictions

# Out-of-fold containers: true labels, per-row predictions, row identifiers.
avreal = y_train
avpred = np.zeros(len(X_train))
idpred = train_ids

In [31]:
# Stratified K-fold training of a sigmoid-calibrated LinearSVC.
# Produces: out-of-fold predictions for the training rows (avpred), the
# fold-averaged test predictions (mpred), and an out-of-fold CSV on disk.
train_time = timer(None)
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=0)

# Per-run accumulators are (re)created in this cell so that re-running it does
# not add this run's fold AUCs / test predictions on top of a previous run's.
fold_aucs = []
pred = np.zeros(X_test.shape[0])

for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    start_time = timer(None)
    print('\n Fold %02d' % (i+1))
    Xtrain, Xval = X_train[train_index], X_train[val_index]
    ytrain, yval = y_train[train_index], y_train[val_index]

    model = LinearSVC(C=0.01, tol=0.0001, verbose=1, random_state=1001, max_iter=2000, dual=False)
    # The SVM is wrapped in a calibrator (inner 5-fold CV, sigmoid method)
    # so that predict_proba is available for AUC scoring and submission.
    sigmoid = CalibratedClassifierCV(model, cv=5, method='sigmoid')
    sigmoid.fit(Xtrain, ytrain)

    # Probability of the positive class on the held-out fold.
    scores_val = sigmoid.predict_proba(Xval)[:, 1]
    ROC_AUC = roc_auc_score(yval, scores_val)
    print(' Fold %02d AUC: %.6f' % ((i+1), ROC_AUC))

    y_pred = sigmoid.predict_proba(X_test)[:, 1]
    timer(start_time)

    # Each training row is in exactly one validation fold, so after the loop
    # every entry of avpred holds an out-of-fold prediction.
    avpred[val_index] = scores_val

    pred = pred + y_pred
    fold_aucs.append(ROC_AUC)

timer(train_time)

cv_sum = sum(fold_aucs)
cv_score = (cv_sum / folds)
oof_ROC_AUC = roc_auc_score(avreal, avpred)
print('\n Average AUC: %.6f' % cv_score)
print(' Out-of-fold AUC: %.6f' % oof_ROC_AUC)
# Fixed six decimals so the file-name tag always has the same width
# (str(round(...)) would drop trailing zeros, e.g. 0.859 -> '0859').
score = ('%.6f' % oof_ROC_AUC).replace('.', '')
# Test prediction = mean of the per-fold models' probabilities.
mpred = pred / folds

now = datetime.now()
oof_result = (
    pd.DataFrame({
        'target': avreal,
        'prediction': avpred,
        'ID_code': idpred,
        'seq_id': seq_ids,
    })
    .sort_values('seq_id', ascending=True)
    .loc[:, ['ID_code', 'target', 'prediction']]
)
sub_file = 'train_5x-LinearSVC-01-v1-oof_' + score + '_' + now.strftime('%Y-%m-%d-%H-%M') + '.csv'
print('\n Writing out-of-fold train file::  %s' % sub_file)
oof_result.to_csv(sub_file, index=False, float_format='%.6f')


 Fold 01
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear] Fold 01 AUC: 0.858162

 Time taken: 0 hours 9 minutes adn 18.38 seconds.

 Fold 02
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear] Fold 02 AUC: 0.858949

 Time taken: 0 hours 8 minutes adn 2.7 seconds.

 Fold 03
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear] Fold 03 AUC: 0.858047

 Time taken: 0 hours 7 minutes adn 6.2 seconds.

 Fold 04
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear] Fold 04 AUC: 0.862615

 Time taken: 0 hours 7 minutes adn 33.42 seconds.

 Fold 05
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear] Fold 05 AUC: 0.857252

 Time taken: 0 hours 6 minutes adn 51.67 seconds.

 Time taken: 0 hours 38 minutes adn 52.47 seconds.

 Average AUC: 0.859005
 Out-of-fold AUC: 0.859008

 Writing out-of-fold train file::  train_5x-LinearSVC-01-v1-oof_0859008_2019-03-22-01-47.csv


NameError: name 'te_ids' is not defined

In [32]:
# Assemble the submission frame: one row per test ID with its averaged prediction.
result = pd.DataFrame({'ID_code': test_ids, 'target': mpred})
print('\n First 10 lines of your prediction:')
print(result.head(10))

# File name carries the OOF score tag and the timestamp taken in the training cell.
timestamp = now.strftime('%Y-%m-%d-%H-%M')
sub_file = 'submission_5x-LinearSVC-01-v1_%s_%s.csv' % (score, timestamp)
print('\n Writing submission:  %s' % sub_file)
result.to_csv(sub_file, index=False, float_format='%.6f')
# Report total elapsed time since the stopwatch stored in starttime.
timer(starttime)


 First 10 lines of your prediction:
  ID_code    target
0  test_0  0.188497
1  test_1  0.234322
2  test_2  0.040344
3  test_3  0.184748
4  test_4  0.079567
5  test_5  0.002633
6  test_6  0.005950
7  test_7  0.184337
8  test_8  0.002407
9  test_9  0.008053

 Writing submission:  submission_5x-LinearSVC-01-v1_0859008_2019-03-22-01-47.csv

 Time taken: 0 hours 56 minutes adn 0.39 seconds.
