In [2]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [3]:
# Load the complete, unsplit dataset from the working directory.
# NOTE(review): `data` is not referenced by any later cell visible here -- confirm it is still needed.
data = pd.read_csv("whole_data.csv")

In [4]:
# Feature matrices are read with their header row as column names.
x_train, x_test = (
    pd.read_csv(csv_path) for csv_path in ("x_train.csv", "x_test.csv")
)
# Label files are read with header=None, so their single column is named 0.
y_train, y_test = (
    pd.read_csv(csv_path, header=None) for csv_path in ("y_train.csv", "y_test.csv")
)

In [5]:
# Report the dimensions of each loaded frame, one line per frame.
print(
    *(
        "{} {}".format(label, frame.shape)
        for label, frame in (
            ("x_train shape: ", x_train),
            ("y_train shape: ", y_train),
            ("x_test shape: ", x_test),
            ("y_test shape: ", y_test),
        )
    ),
    sep="\n",
)

x_train shape:  (100000, 200)
y_train shape:  (100000, 1)
x_test shape:  (100000, 200)
y_test shape:  (100000, 1)


# Feature Engineering - Frequency Encoding: 

In [6]:
# Frequency encoding:
def encode_FE(df, col, test):
    """Add a frequency-encoded copy of ``col`` to ``df`` and ``test`` in place.

    The occurrence count of every value of ``df[col]`` is computed once and
    written to a new column named ``col + '_FE'`` in both frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``col`` values define the counts; modified in place.
    col : str
        Name of the column to encode; must exist in both frames.
    test : pandas.DataFrame
        Second frame that receives the same mapping; modified in place.
        Values never seen in ``df[col]`` are encoded as 0.

    Returns
    -------
    None
    """
    cv = df[col].value_counts()
    nm = col + '_FE'

    # Pick the narrowest unsigned dtype that can hold the largest count, so
    # that counts above 65535 are not silently wrapped by a uint16 cast.
    max_count = int(cv.max()) if len(cv) else 0
    if max_count <= 255:
        dtype = 'uint8'
    elif max_count <= 65535:
        dtype = 'uint16'
    else:
        dtype = 'uint32'

    # Fill before assigning: a chained ``frame[nm].fillna(..., inplace=True)``
    # may act on a temporary copy and leave NaN behind, which then breaks the
    # integer cast. Missing values in ``df[col]`` itself are not counted by
    # value_counts, so they are encoded as 0 as well.
    df[nm] = df[col].map(cv).fillna(0).astype(dtype)
    test[nm] = test[col].map(cv).fillna(0).astype(dtype)
    return

In [7]:
# Count value frequencies over train and test together, so that both splits
# share one encoding.
comb = pd.concat([x_train, x_test], axis=0, sort=True)

# Encode every raw `var_*` column. Skipping columns that already end in `_FE`
# keeps this cell safe to re-run, and deriving the list from the frame avoids
# hard-coding the number of features.
raw_cols = [c for c in x_train.columns if c.startswith('var_') and not c.endswith('_FE')]
for col in raw_cols:
    encode_FE(comb, col, x_test)

# The training rows are the first len(x_train) rows of the concatenation.
# Take an explicit copy so x_train does not alias the frame deleted below.
x_train = comb.iloc[:len(x_train)].copy()
del comb
print('Added {} new magic features!'.format(len(raw_cols)))

Added 200 new magic features!


In [8]:
# Model inputs: every column of x_train except the identifier and the label.
features = list(
    filter(lambda name: name not in ['ID_code', 'target'], x_train.columns)
)

In [13]:
# Report the frame dimensions after frequency encoding.
print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape)
print("x_test shape: ", x_test.shape)
# This line reports y_test; it was previously mislabelled as "x_test shape".
print("y_test shape: ", y_test.shape)
print("features", len(features))

x_train shape:  (100000, 400)
y_train shape:  (100000, 1)
x_test shape:  (100000, 400)
x_test shape:  (100000, 1)
features 400


In [16]:
# Persist the frequency-encoded splits as CSV files without the row index.
# NOTE(review): the label frames were loaded with header=None, so their files
# get a header row "0"; reading them back with header=None would treat that
# row as data -- confirm how these files are consumed.
for frame, path in (
    (x_train, 'x_train_FE.csv'),
    (y_train, 'y_train_FE.csv'),
    (x_test, 'x_test_FE.csv'),
    (y_test, 'y_test_FE.csv'),
):
    frame.to_csv(path, index=False)

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils import shuffle
    
def train_test_split(data, random_state=0):
    """Shuffle ``data`` and split it 50/50 into stratified train and test sets.

    NOTE: this definition shadows ``sklearn.model_selection.train_test_split``
    imported in the same cell; after this cell runs, the name refers to this
    function.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``target`` column used for stratification. An
        ``ID_code`` column, if present, is excluded from the features.
    random_state : int, optional
        Seed used for both the shuffle and the stratified splitter so that
        the split is reproducible. Defaults to 0, the seed the splitter
        already used.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``. The ``x_*`` frames hold every
        column except ``ID_code`` and ``target``; the ``y_*`` series hold the
        ``target`` column.
    """
    # Seed the shuffle too: seeding only the splitter still gives a different
    # split on every run, because the row order it sees keeps changing.
    whole_data = shuffle(data, random_state=random_state)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=random_state)

    # n_splits=1, so the splitter yields exactly one (train, test) index pair.
    train_index, test_index = next(sss.split(whole_data, whole_data['target'].values))
    print("TRAIN:", train_index, "TEST:", test_index)
    train, test = whole_data.iloc[train_index], whole_data.iloc[test_index]

    # Select features by name rather than by position, so the result does not
    # depend on ID_code and target being the first two columns.
    features = [column for column in train.columns if column not in ['ID_code', 'target']]
    x_train = train[features]
    y_train = train['target']
    x_test = test[features]
    y_test = test['target']

    print("x_train shape: ", x_train.shape)
    print("y_train shape: ", y_train.shape)
    print("x_test shape: ", x_test.shape)
    print("y_test shape: ", y_test.shape)
    print("features", len(features))

    # Class balance of each split (negatives, then positives).
    print(y_train[y_train==0].shape)
    print(y_train[y_train==1].shape)
    print("-----------------------------")
    print(y_test[y_test==0].shape)
    print(y_test[y_test==1].shape)

    return x_train, y_train, x_test, y_test

In [78]:
import time
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb


def eda_modle_eva(params, x_train, y_train, x_test):
    """Train LightGBM with 5-fold stratified CV and predict on ``x_test``.

    For every fold a booster is trained with early stopping on the held-out
    part, saved to ``'<fold>.txt'`` (zero-padded to three digits) in the
    working directory, and used to fill the out-of-fold predictions and to
    contribute 1/5 of the averaged test predictions.

    Parameters
    ----------
    params : dict
        Parameter dictionary forwarded to ``lgb.train``.
    x_train : pandas.DataFrame
        Training features; ``ID_code`` and ``target`` columns are ignored.
    y_train : pandas.DataFrame or pandas.Series
        Training labels, positionally aligned with ``x_train``.
    x_test : pandas.DataFrame
        Frame to predict on; must contain every feature column of ``x_train``.

    Returns
    -------
    tuple
        ``(predictions, oof, feature_importance_df, clf)``: the fold-averaged
        test predictions, the out-of-fold training predictions, a frame with
        columns ``Feature``/``importance``/``fold`` covering all folds, and
        the booster trained on the last fold.
    """
    num_folds = 5
    num_round = 1000000

    # random_state only has an effect when shuffle=True, and current
    # scikit-learn raises ValueError when it is set with shuffle=False, so it
    # is omitted. The unshuffled folds are the same either way.
    folds = StratifiedKFold(n_splits=num_folds, shuffle=False)
    oof = np.zeros(len(x_train))
    predictions = np.zeros(len(x_test))
    fold_importances = []

    features = [column for column in x_train.columns if column not in ['ID_code', 'target']]

    clf = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(x_train.iloc[trn_idx][features], label=y_train.iloc[trn_idx])
        val_data = lgb.Dataset(x_train.iloc[val_idx][features], label=y_train.iloc[val_idx])

        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of the LightGBM release this notebook was run with; newer
        # releases expect callbacks instead -- confirm the installed version.
        clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3500)
        oof[val_idx] = clf.predict(x_train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        clf.save_model('{:03d}.txt'.format(fold_))

        fold_importance_df = pd.DataFrame()
        fold_importance_df["Feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        # Collect per-fold frames and concatenate once after the loop instead
        # of growing a frame with pd.concat on every iteration.
        fold_importances.append(fold_importance_df)

        predictions += clf.predict(x_test[features], num_iteration=clf.best_iteration) / folds.n_splits

    # Return the importances of all folds; previously the accumulated frame
    # was built but only the last fold's frame was returned.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    return predictions, oof, feature_importance_df, clf

In [79]:
# ALl default values
params = {
    # Core Parameters
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'tree_learner': 'serial',
    'num_threads': 0,
    'seed': 42,
    
    # Learning Control Parameters
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'min_sum_hessian_in_leaf': 1e-3,  
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'feature_fraction': 1.0,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    'bagging_seed': 42,
    
    # Others
    'verbosity ': 1,
    'boost_from_average': True,
    'metric': 'auc',
}


# Disabled alternative: re-split a frame named `data_without_outliers` before training.
# NOTE(review): `data_without_outliers` is not defined in the visible cells -- confirm before re-enabling.
#print("data shape: ", data_without_outliers.shape)
#x_train, y_train, x_test, y_test = train_test_split(data_without_outliers)

# Run the cross-validated LightGBM evaluation. The call returns, in order:
# test-set predictions, out-of-fold training predictions, a feature-importance
# frame, and the booster from the last fold.
predictions, oof, fold_importance_df, clf = eda_modle_eva(params, x_train, y_train, x_test)


Fold 0
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.961929	valid_1's auc: 0.871397
[2000]	training's auc: 0.987633	valid_1's auc: 0.885187
[3000]	training's auc: 0.996429	valid_1's auc: 0.888325
[4000]	training's auc: 0.999029	valid_1's auc: 0.889291
[5000]	training's auc: 0.999713	valid_1's auc: 0.88958
[6000]	training's auc: 0.999903	valid_1's auc: 0.889641
[7000]	training's auc: 0.999964	valid_1's auc: 0.889864
[8000]	training's auc: 0.999994	valid_1's auc: 0.889898
[9000]	training's auc: 1	valid_1's auc: 0.889916
[10000]	training's auc: 1	valid_1's auc: 0.889983
[11000]	training's auc: 1	valid_1's auc: 0.890027
[12000]	training's auc: 1	valid_1's auc: 0.88981
Early stopping, best iteration is:
[9204]	training's auc: 1	valid_1's auc: 0.889901
Fold 1
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.961292	valid_1's auc: 0.880254
[2000]	training's auc: 0.987573	valid_1's auc: 0.894023
[3000]	training's

In [80]:
from sklearn.metrics import roc_auc_score
# AUC of the out-of-fold predictions on the training set: the cross-validation estimate.
print("CV score: {:<8.5f}".format(roc_auc_score(y_train, oof)))
# AUC of the fold-averaged predictions on the held-out test set. This is not a
# cross-validation score, so it gets its own label.
print("Test score: {:<8.5f}".format(roc_auc_score(y_test, predictions)))

CV score: 0.88860 
CV score: 0.90010 
