In [1]:
import os
import datetime as dt
import time
import json

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import PCA, NMF, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn import metrics

import lightgbm as lgb
from lightgbm import LGBMClassifier

In [2]:
today = dt.date.today()

In [3]:
PK = 'sk_id_curr'
TARGET = 'target'
N_CV = 3
N_COMPONENTS = 150
SEED = 1111
DATA_DIR = 'clean_data/'

TRAIN_FILE = os.path.join(DATA_DIR, 'mrgd_train.csv')
TEST_FILE = os.path.join(DATA_DIR, 'mrgd_test.csv')
SUBMISSION_OUTPUT_FILE = os.path.join(DATA_DIR, 'submission_out.csv')

DTYPES = {'sk_id_curr':str, 'sk_id_bureau':str, 'sk_id_prev':str,'num_instalment_version':str}



## Agenda:
    - Load Data
    - Create 150 features from PCA, SVD
    - Fit Kmeans Cluster Labels
    - Fit Naive Bayes Model to data and use predictions as ftrs
    - Save all datasets
        - Going to model each individually
        - Then together
        - Then run through feature seleciton...
        - Then, if time, polynomial features the resulting selected features and then trim those
 
 Will see how compute intensive this ends up being and scale back accordingly

### Load Data

In [4]:
train = pd.read_csv(TRAIN_FILE, dtype=DTYPES)
test = pd.read_csv(TEST_FILE, dtype=DTYPES)
train.shape, test.shape

((307511, 527), (48744, 526))

In [5]:
train.head()

Unnamed: 0,sk_id_curr,flag_own_car,flag_own_realty,name_contract_type,flag_cont_mobile,flag_document_10,flag_document_11,flag_document_12,flag_document_13,flag_document_14,...,sk_dpd_pos,name_contract_status_Active,name_contract_status_Amortized debt,name_contract_status_Approved_pos,name_contract_status_Canceled_pos,name_contract_status_Completed_pos,name_contract_status_Demand_pos,name_contract_status_Returned to the store,name_contract_status_Signed_pos,target
0,100002,0,1,0,1,0,0,0,0,0,...,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,100003,0,0,0,1,0,0,0,0,0,...,0.0,26.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0
2,100004,1,1,1,1,0,0,0,0,0,...,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,100006,0,1,0,1,0,0,0,0,0,...,0.0,18.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0
4,100007,0,1,0,1,0,0,0,0,0,...,0.0,62.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0


In [6]:
# Target var
y_train = train[TARGET].values
# sk ids
train_id = train[PK]
test_id = test[PK]

train.drop([PK, TARGET], axis=1, inplace=True)
test.drop(PK, axis=1, inplace=True)
print("{},{}".format(train.shape, test.shape))

x_train = train.values
x_test = test.values

(307511, 525),(48744, 525)


In [7]:
test.columns[test.isnull().any()]

Index([], dtype='object')

In [8]:
rscaler = RobustScaler()
rscaler.fit(x_train)

x_train_scaled = rscaler.transform(x_train)
x_test_scaled = rscaler.transform(x_test)

### Matrix Factorization

In [10]:
# PCA
pca = PCA(N_COMPONENTS, random_state=SEED)
pca.fit(x_train_scaled)

pca_train = pca.transform(x_train_scaled)
pca_test = pca.transform(x_test_scaled)
pca_train.shape, pca_test.shape

((307511, 150), (48744, 150))

In [11]:
# SVD 
svd = TruncatedSVD(N_COMPONENTS, random_state=SEED)
svd.fit(x_train_scaled)

svd_train = svd.transform(x_train_scaled)
svd_test = svd.transform(x_test_scaled)
svd_train.shape, svd_test.shape

((307511, 150), (48744, 150))

### Clustering

In [12]:
from sklearn.linear_model import LogisticRegression

#### Kmeans

In [13]:
# Iterated through various n_clusters and batch_size
# best auc is n_clusters = 60 batch_size=2500 with auc of 0.0.558
nclust = 60
bsize = 2500

mbkm = MiniBatchKMeans(nclust, batch_size=bsize, verbose=1, n_init=10, random_state=SEED)
km_train = mbkm.fit_transform(x_train_scaled)
km_test = mbkm.transform(x_test_scaled)

Init 1/10 with method: k-means++
Inertia for init 1/10: 4826323782041.433594
Init 2/10 with method: k-means++
Inertia for init 2/10: 2771735050209.527832
Init 3/10 with method: k-means++
Inertia for init 3/10: 15680082162777.380859
Init 4/10 with method: k-means++
Inertia for init 4/10: 8744329059667.680664
Init 5/10 with method: k-means++
Inertia for init 5/10: 8752714647388.677734
Init 6/10 with method: k-means++
Inertia for init 6/10: 25887559984659.769531
Init 7/10 with method: k-means++
Inertia for init 7/10: 8573167688360.576172
Init 8/10 with method: k-means++
Inertia for init 8/10: 8813832329565.552734
Init 9/10 with method: k-means++
Inertia for init 9/10: 9672471325251.113281
Init 10/10 with method: k-means++
Inertia for init 10/10: 21611240491127.824219
Minibatch iteration 1/12400: mean batch inertia: 9840724.404085, ewa inertia: 9840724.404085 
Minibatch iteration 2/12400: mean batch inertia: 75320656.934944, ewa inertia: 10905397.205973 
Minibatch iteration 3/12400: mean b

### Naive Bayes

In [92]:
# bern_cols = train.columns[train.apply(lambda col: col.nunique() == 2)].tolist()
# mult_cols = train.columns[(train.dtypes==int) & ~train.columns.isin(bern_cols)].tolist()
# gaus_cols = train.columns[~train.columns.isin(bern_cols+mult_cols)]
# len(bern_cols) + len(mult_cols) + len(gaus_cols), train.shape

# bern_locs = [train.columns.get_loc(b) for b in bern_cols]
# mult_locs = [train.columns.get_loc(m) for m in mult_cols]
# gaus_locs = [train.columns.get_loc(g) for g in gaus_cols]

(525, (307511, 525))

In [22]:
class NaiveBayesClf():
    """
    Uses Bernoulli NB for Binary Features,
    Multinomial NB for integer ftrs,
    Gaussian NB for all other ftrs.
    
    Final Probs is average of 3 predicted probabilities of above models.
    """
    
    def __init__(self, bparams={}, mparams={}, gparams={}):
        self.bnb = BernoulliNB(**bparams)
        self.mnb = MultinomialNB(**mparams)
        self.gnb = GaussianNB(**gparams)
        
        
    def get_cols(self, data):
        bern_cols = data.columns[data.apply(lambda col: col.nunique() == 2)].tolist()
        mult_cols = data.columns[(data.dtypes==int) & ~data.columns.isin(bern_cols)].tolist()
        gaus_cols = data.columns[~data.columns.isin(bern_cols+mult_cols)]
        assert len(bern_cols) + len(mult_cols) + len(gaus_cols) == data.shape[1]
        
        self.bern_locs = [data.columns.get_loc(b) for b in bern_cols]
        self.mult_locs = [data.columns.get_loc(m) for m in mult_cols]
        self.gaus_locs = [data.columns.get_loc(g) for g in gaus_cols]
        
    
    def fit(self, data, target):
        self.get_cols(data)
        self.bnb.fit(data.values[:,self.bern_locs], target)
        self.mnb.fit(data.values[:,self.mult_locs], target)
        self.gnb.fit(data.values[:,self.gaus_locs], target)

    
    def predict(self, new_data):
        bprobs = self.bnb.predict_proba(new_data[:,self.bern_locs])[:,1]
        mprobs = self.mnb.predict_proba(new_data[:,self.mult_locs])[:,1]
        gprobs = self.gnb.predict_proba(new_data[:,self.gaus_locs])[:,1]
        all_probs = np.vstack((bprobs, mprobs, gprobs))
        final_probs = all_probs.mean(0)

        return final_probs


In [120]:
# you should cross validate
for p in np.arange(0.1, 1, 0.1):
    print(p)
    priors = (p, 1-p)
    nbc = NaiveBayesClf({'class_prior':priors}, {'class_prior':priors}, {'priors':priors})
    nbc.fit(train, y_train)
    probs = nbc.predict(x_train)
    print(metrics.roc_auc_score(y_train, probs))
    print()

0.1
(307511,) (307511,) (307511,)
0.591584954654
0.2
(307511,) (307511,) (307511,)
0.598932452455
0.3
(307511,) (307511,) (307511,)
0.603396315046
0.4
(307511,) (307511,) (307511,)
0.606131171884
0.5
(307511,) (307511,) (307511,)
0.607403046501
0.6
(307511,) (307511,) (307511,)
0.60724274623
0.7
(307511,) (307511,) (307511,)
0.605229087143
0.8
(307511,) (307511,) (307511,)
0.600183778401
0.9
(307511,) (307511,) (307511,)
0.590787096525


In [23]:
nb_prior = (0.5, 0.5)
nbc = NaiveBayesClf({'class_prior':nb_prior}, {'class_prior':nb_prior}, {'priors':nb_prior})
nbc.fit(train, y_train)

nb_train_probs = nbc.predict(x_train)
nb_test_probs = nbc.predict(x_test)


### Combine All New Features

In [25]:
print(
    pca_train.shape,
    pca_test.shape,
    svd_train.shape,
    svd_test.shape,
    km_train.shape,
    km_test.shape,
    nb_train_probs.shape,
    nb_test_probs.shape
)

(307511, 150) (48744, 150) (307511, 150) (48744, 150) (307511, 60) (48744, 60) (307511,) (48744,)


In [29]:
pca_train_df = pd.DataFrame(pca_train, columns = ['pca_ftr_'+str(n) for n in range(pca_train.shape[1])])
pca_test_df = pd.DataFrame(pca_test, columns = ['pca_ftr_'+str(n) for n in range(pca_test.shape[1])])

svd_train_df = pd.DataFrame(svd_train, columns = ['svd_ftr_'+str(n) for n in range(svd_train.shape[1])])
svd_test_df = pd.DataFrame(svd_test, columns = ['svd_ftr_'+str(n) for n in range(svd_test.shape[1])])

km_train_df = pd.DataFrame(km_train, columns = ['km_ftr_'+str(n) for n in range(km_train.shape[1])])
km_test_df = pd.DataFrame(km_test, columns = ['km_ftr_'+str(n) for n in range(km_test.shape[1])])

nb_train_df = pd.Series(nb_train_probs, name='nb_probs')
nb_test_df = pd.Series(nb_test_probs, name='nb_probs')

eng_ftrs_train = pd.concat([pca_train_df, svd_train_df, km_train_df, nb_train_df], axis=1)
eng_ftrs_test = pd.concat([pca_test_df, svd_test_df, km_test_df, nb_test_df], axis=1)

eng_ftrs_train.shape, eng_ftrs_test.shape

((307511, 361), (48744, 361))

In [33]:
eng_ftrs_train.to_csv('clean_data/eng_ftrs_train.csv',index=False)
eng_ftrs_test.to_csv('clean_data/eng_ftrs_test.csv',index=False)