In [1]:
import numpy as np
import pandas as pd
import gc, os

import lightgbm as lgb
from sklearn.metrics import roc_auc_score, classification_report, roc_curve
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from multiprocessing import cpu_count
from tqdm import tqdm # make loops show a smart progress meter.

from __future__ import print_function

from sklearn import datasets
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import lightgbm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import zipfile
!unzip '/content/drive/MyDrive/santander-customer-transaction-prediction.zip'

Archive:  /content/drive/MyDrive/santander-customer-transaction-prediction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Feature Engineering

#### Count Encoding of Features

In [6]:
for feature in range(200):
    feat = 'var_' + str(feature)
    count_values = train.groupby(feat)[feat].count()
    train['new_' + feat] = count_values.loc[train[feat]].values
    #test['new_' + feat] = count_values.loc[test[feat]].values

# SVM

In [7]:
# sample down and balance data for EDA see if it's any good
train2 = train.groupby('target', group_keys=False).apply(lambda x: x.sample(min(len(x), 500), random_state=1994))
train2["target"].value_counts()

1    500
0    500
Name: target, dtype: int64

In [8]:
X = train2.iloc[:, 2:]
y = train2.target

x, x_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['roc_auc']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring=score)
    clf.fit(x, y)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print(clf.score)
    y_true, y_pred = y_test, clf.predict(x_test)
    fpr , tpr , thresholds = roc_curve(y_true,y_pred)

    auc_scoreSVM=roc_auc_score(y_true,y_pred)
auc_scoreSVM

# Tuning hyper-parameters for roc_auc

Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}
<bound method BaseSearchCV.score of GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)>


0.655

# Train Test Split for NB and LightGBM

In [9]:
X = train.iloc[:, 2:]
y = train.target

x, x_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Guassian NB

In [10]:
gnb = GaussianNB()
y_true = y_test
y_pred = gnb.fit(x, y).predict(x_test)
print("Number of mislabeled points out of a total %d points : %d"
      % (x_test.shape[0], (y_test != y_pred).sum()))

print(classification_report(y_true, y_pred))

fpr , tpr , thresholds = roc_curve(y_true,y_pred)

auc_scoreGNB=roc_auc_score(y_true,y_pred)
auc_scoreGNB

Number of mislabeled points out of a total 40000 points : 3999
              precision    recall  f1-score   support

           0       0.93      0.96      0.95     36096
           1       0.48      0.37      0.42      3904

    accuracy                           0.90     40000
   macro avg       0.71      0.66      0.68     40000
weighted avg       0.89      0.90      0.89     40000



0.6615643802319499

# Light GBM

In [11]:
train_data = lightgbm.Dataset(x, label=y)
test_data = lightgbm.Dataset(x_test, label=y_test)

parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 5,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 5,
    'learning_rate': 0.05,
    'verbose': 0
}

model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=test_data,
                       num_boost_round=5000,
                       early_stopping_rounds=100)
#
# Create a submission
#
#submission = pd.read_csv("test.csv")
submission = test.copy()
ids = submission['ID_code'].values
submission.drop('ID_code', inplace=True, axis=1)
y = model.predict(submission)

output = pd.DataFrame({'ID_code': ids, 'target': y})
output.to_csv("submission.csv", index=False)

[1]	valid_0's auc: 0.614126
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.654866
[3]	valid_0's auc: 0.670686
[4]	valid_0's auc: 0.682607
[5]	valid_0's auc: 0.697463
[6]	valid_0's auc: 0.701609
[7]	valid_0's auc: 0.705562
[8]	valid_0's auc: 0.709792
[9]	valid_0's auc: 0.71247
[10]	valid_0's auc: 0.720995
[11]	valid_0's auc: 0.724601
[12]	valid_0's auc: 0.726196
[13]	valid_0's auc: 0.728525
[14]	valid_0's auc: 0.731117
[15]	valid_0's auc: 0.730544
[16]	valid_0's auc: 0.732521
[17]	valid_0's auc: 0.73418
[18]	valid_0's auc: 0.734888
[19]	valid_0's auc: 0.736628
[20]	valid_0's auc: 0.737821
[21]	valid_0's auc: 0.740959
[22]	valid_0's auc: 0.741649
[23]	valid_0's auc: 0.745686
[24]	valid_0's auc: 0.747881
[25]	valid_0's auc: 0.749465
[26]	valid_0's auc: 0.751893
[27]	valid_0's auc: 0.752298
[28]	valid_0's auc: 0.75571
[29]	valid_0's auc: 0.757026
[30]	valid_0's auc: 0.758513
[31]	valid_0's auc: 0.760476
[32]	valid_0's auc: 0.760909
[33]	valid_0's auc: 0