Big thanks to:
* Jiwei Liu for [Augment insight](https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment)
* Ole Morten Grodås for [Lightgbm with data augmentation](https://www.kaggle.com/omgrodas/lightgbm-with-data-augmentation)


In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Read the competition files and drop the ID_code column from both.
train = pd.read_csv("../input/train.csv").drop(columns="ID_code")
X_test = pd.read_csv("../input/test.csv").drop(columns="ID_code")

# Split the training frame into the label column and the feature columns.
X_train = train.drop(columns=["target"])
y_train = train["target"]

# Sampling

In [None]:
def sample_by_class(frame, n_negative, n_positive, seed):
    """Draw a fixed number of rows from each target class of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``target`` column holding 0 / 1 labels.
    n_negative, n_positive : int
        Number of rows to draw (without replacement) from the ``target == 0``
        and ``target == 1`` rows respectively.
    seed : int
        Passed as ``random_state`` to both draws.

    Returns
    -------
    pandas.DataFrame
        The negative rows followed by the positive rows, with a fresh
        0..n-1 index.
    """
    negatives = frame[frame.target == 0].sample(n=n_negative, random_state=seed)
    positives = frame[frame.target == 1].sample(n=n_positive, random_state=seed)
    # A fresh index matters: the PCA factors are later joined back onto
    # ``train`` by index, and they are numbered 0..n-1.
    return pd.concat([negatives, positives], ignore_index=True)


# Set to 1 to work on a small class-stratified sample instead of all data.
sample = 0
if sample == 1:
    train = sample_by_class(train, n_negative=4000, n_positive=400, seed=1573456)
    # Rebuild the label / feature views from the sampled frame.
    y_train = train['target']
    X_train = train.drop(['target'], axis=1)

# PCA features

In [None]:
# Scaling: learn the per-feature min/max on the training features only,
# then apply that same fitted scaler to both train and test.
mmscale = MinMaxScaler().fit(X_train)
X_train_scaled = mmscale.transform(X_train)
X_test_scaled = mmscale.transform(X_test)

In [None]:
# PCA with no limit on the number of components, fitted on the scaled
# training features; the test features are projected with the same fit.
pca = PCA(n_components=None)
factors_train = pca.fit_transform(X_train_scaled)
factors_test = pca.transform(X_test_scaled)

In [None]:
# Explained-variance ratio of each component of the fitted PCA.
explained_variance = pca.explained_variance_ratio_

In [None]:
# Box plot of how the explained-variance ratios are distributed.
variance_frame = pd.DataFrame({'explained_variance': explained_variance})
variance_frame.plot(kind='box')

In [None]:
# Bar chart of the explained-variance ratio of every principal component.
with plt.style.context('dark_background'):
    plt.figure(figsize=(15, 12))

    # Take the x positions from the data itself so the chart stays valid
    # whatever number of components the PCA produced.
    plt.bar(range(len(explained_variance)), explained_variance, alpha=0.5,
            align='center', label='individual explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()

In [None]:
# Total share of variance captured by the first 200 principal components.
sum(explained_variance[:200])

In [None]:
# Wrap both PCA outputs in DataFrames so they can be joined onto the
# original feature frames.
factors_train, factors_test = map(pd.DataFrame, (factors_train, factors_test))

In [None]:
# Attach the PCA factors as extra columns, pairing rows by index label.
by_index = dict(left_index=True, right_index=True)
train_combined = pd.merge(train, factors_train, **by_index)
test_combined = pd.merge(X_test, factors_test, **by_index)

# Augmentation

In [None]:
# Inspiration from [2]
def augment(train, num_n=1, num_p=2, random_state=None):
    """Return ``train`` followed by synthetic rows made by per-class shuffling.

    For each class, every column of that class's rows is permuted
    independently, so a synthetic row mixes values taken from different
    real rows of the same class. The ``target`` column is shuffled too, but
    it is constant within a class and therefore unchanged.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a ``target`` column holding 0 / 1 labels.
    num_n : int, default 1
        Number of shuffled copies of the ``target == 0`` rows to add.
    num_p : int, default 2
        Number of shuffled copies of the ``target == 1`` rows to add.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. ``None`` draws from
        the global ``numpy.random`` state, as the function always did.

    Returns
    -------
    pandas.DataFrame
        ``train`` itself followed by the synthetic rows. Each synthetic copy
        carries its own 0..k-1 index, so index labels repeat in the result.
    """
    # The numpy module and a RandomState both expose ``permutation``.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def shuffled_copies(rows, copies):
        """Build ``copies`` frames with every column of ``rows`` permuted."""
        size = len(rows)
        if size == 0:
            # Nothing to shuffle for a class that is absent from ``train``.
            return []
        return [
            pd.DataFrame({
                # One fresh permutation per column breaks the row pairing
                # between columns.
                label: column.values.take(rng.permutation(size))
                for label, column in rows.items()
            })
            for _ in range(copies)
        ]

    negatives = train[train.target == 0]
    positives = train[train.target == 1]
    # Negatives are generated before positives, matching the draw order of
    # the random stream used previously.
    synthetic = shuffled_copies(negatives, num_n) + shuffled_copies(positives, num_p)
    return pd.concat([train] + synthetic)

In [None]:
# LightGBM training parameters (binary objective, AUC metric).
# Bagging is left switched off; the values that were tried are kept here:
#   bagging_freq=5, bagging_fraction=0.335
param = dict(
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.041,
    learning_rate=0.009,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=70,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=15,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=-1,
)

In [None]:
# Build the training set from the augmented frame. To train without
# augmentation, use instead:
#train_augmented = train_combined
train_augmented = augment(train_combined)
features = train_augmented.drop(columns='target')
labels = train_augmented['target']
train_data = lgb.Dataset(data=features, label=labels)

# Author's notes from earlier runs:
# with augmentation 12895 -> 0.898
# without augmentation 12895 -> 0.897
model = lgb.train(params=param, train_set=train_data, num_boost_round=19000)

In [None]:
# Score the test rows (original features plus PCA factors) with the trained model.
prediction = model.predict(test_combined)

In [None]:
# Put the predictions into the sample submission's target column and save
# the result under a file name stamped with the current date and time
# (to the minute).
submission = pd.read_csv('../input/sample_submission.csv').assign(target=prediction)
created_at = datetime.now()
filename = "sub_{:%Y-%m-%d_%H_%M}.csv".format(created_at)
submission.to_csv(filename, index=False)

# Inspired by:

[1] [Ole Morten Grodås, "Lightgbm with data augmentation"](https://www.kaggle.com/omgrodas/lightgbm-with-data-augmentation)

[2] [Jiwei Liu, "LGB 2 leaves + augment"](https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment)

[3] [Shruti_Iyyer, "Santander customer transaction, PCA and NB"](https://www.kaggle.com/shrutimechlearn/santander-customer-transaction-pca-and-nb)