In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold, cross_val_score
import lightgbm as lgb

# Loading data

In [None]:
# Read the train and test tables from the Kaggle input directory.
train, test = map(
    pd.read_csv,
    (
        '../input/santander-customer-transaction-prediction/train.csv',
        '../input/santander-customer-transaction-prediction/test.csv',
    ),
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Summary statistics for the training table.
train.describe()

In [None]:
# Summary statistics for the test table, for comparison with the training table.
test.describe()

# Missing values

In [None]:
# Count nulls per column, then tally how many columns share each null count.
train.isnull().sum().value_counts()

In [None]:
# Same null tally for the test table: nulls per column, then columns per null count.
test.isnull().sum().value_counts()

# Reduce memory

In [None]:
def reduce_mem_df(df):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Whole-valued columns are converted to the narrowest integer dtype whose
    range contains the column's [min, max] (unsigned when the minimum is
    non-negative, signed otherwise). Float columns that are not whole-valued,
    or that contain NaN/inf, are converted to ``float32``. Columns that are
    not plain bool/integer/float (strings, datetimes, categoricals, ...) and
    empty columns are left untouched.

    Memory usage in MiB is printed before and after the conversion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.

    Returns
    -------
    None
    """
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    before = df.memory_usage().sum() / 1024**2
    print(f'Before memory reducing: {before}')
    for col in df.columns:
        values = df[col].to_numpy()
        # Only bool / signed int / unsigned int / float arrays can be downcast.
        if values.dtype.kind not in 'biuf' or values.size == 0:
            continue

        is_float = values.dtype.kind == 'f'
        if is_float:
            # Check every element individually: summing signed fractional
            # parts can cancel out (e.g. 0.5 and -0.5) and wrongly flag a
            # float column as integer. NaN/inf cannot be stored as integers.
            is_whole = bool(np.isfinite(values).all()
                            and (values == np.trunc(values)).all())
        else:
            is_whole = True

        target = None
        if is_whole:
            # .item() yields Python scalars, so the range comparisons below
            # are exact and cannot overflow.
            mn = values.min().item()
            mx = values.max().item()
            candidates = unsigned_types if mn >= 0 else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the dtype's own min/max are representable.
                if limits.min <= mn and mx <= limits.max:
                    target = candidate
                    break
        if target is None and is_float:
            # Non-integer floats, or whole floats too large for any integer.
            target = np.float32
        if target is not None:
            df[col] = df[col].astype(target)
    after = df.memory_usage().sum() / 1024**2
    print(f'After memory reducing: {after}')

In [None]:
# Downcast the training table's columns in place; prints memory usage before and after.
reduce_mem_df(train)

In [None]:
# Downcast the test table's columns in place; prints memory usage before and after.
reduce_mem_df(test)

# EDA

In [None]:
# Share of rows in each target class (proportions rather than raw counts).
train['target'].value_counts(normalize=True)

In [None]:
# Bar chart of the number of training rows in each target class.
data = train['target'].value_counts()
fig, ax = plt.subplots(figsize=(10, 5), dpi=200)
sns.barplot(x=data.index, y=data.values, palette='winter', ax=ax)
plt.show()

In [None]:
def _offdiag_correlations(features):
    """Return the pairwise column correlations of ``features`` as a flat array.

    The diagonal (each column against itself) is dropped by position rather
    than by comparing values with 1: floating-point rounding can leave
    diagonal entries slightly off 1, and a value test would also discard any
    pair of distinct columns that really is perfectly correlated.
    """
    corr = features.corr().to_numpy()
    off_diagonal = ~np.eye(corr.shape[0], dtype=bool)
    # Boolean-mask indexing flattens in row-major order, so every ordered
    # pair (i, j) with i != j appears once.
    return corr[off_diagonal]


train_correlations = _offdiag_correlations(train.drop(['ID_code', 'target'], axis=1))
test_correlations = _offdiag_correlations(test.drop(['ID_code'], axis=1))

In [None]:
# Overlay the train and test feature-correlation distributions on one axis.
fig, ax = plt.subplots(figsize=(10, 5), dpi=200)
for correlations, colour, split_name in (
    (train_correlations, 'Blue', 'train'),
    (test_correlations, 'Green', 'test'),
):
    sns.histplot(correlations, color=colour, kde=True, label=split_name, ax=ax)
ax.legend()
ax.set_title('Features correlation distribution', family='serif', weight='bold', size=15)
plt.show()

In [None]:
# Separate the label from the features; the ID column is not a feature.
y = train['target']
x = train.drop(columns=['ID_code', 'target'])
x_test = test.drop(columns='ID_code')

In [None]:
# Shuffled 10-fold stratified splitter with a fixed seed; reused by the cross-validation cells.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Model

### LightGBM

In [None]:
# Gradient-boosted tree classifier evaluated on AUC.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=1500,
    learning_rate=0.01,
    num_leaves=128,
    max_depth=16,
    metrics='auc',
)

In [None]:
# Score with ROC AUC, the metric the LightGBM models in this notebook are
# configured with. Accuracy is misleading when the target classes are
# unbalanced (presumably the case here; check the target distribution in the
# EDA section), because always predicting the majority class already scores
# highly.
scores = cross_val_score(model, x, y, scoring='roc_auc', n_jobs=-1, cv=folds)
np.mean(scores)

In [None]:
# Hold out 15% of the rows for validation. Stratify on the target so both
# parts keep the same class proportions, as the StratifiedKFold splitter does.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, random_state=42, test_size=0.15, stratify=y)

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` and `verbose` arguments of fit() were removed in
# LightGBM 4.0.
model.fit(
    x_train, y_train,
    eval_set=[(x_train, y_train), (x_valid, y_valid)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ],
)

In [None]:
# Parameters for the native LightGBM training API. The objective must be set
# explicitly: unlike LGBMClassifier, lgb.train defaults to regression.
params = {'objective': 'binary',
          'num_leaves': 128,
          'max_depth': 16,
          'learning_rate': 0.01,
          'boosting': 'gbdt',
          'random_state': 42,
          'metric': 'auc'}
# Take the fold count from the splitter so the averages below stay correct
# if n_splits is changed.
n_folds = folds.get_n_splits()

# Test predictions and feature importances are accumulated over the folds
# and averaged after the loop.
predictions = np.zeros(len(x_test))
features_importance = pd.DataFrame({'feature': x.columns, 'importance': 0.0})

for train_index, valid_index in folds.split(x, y):
    # split() yields positional indices, so select rows with .iloc; .loc and
    # y[...] only work by coincidence when the index is a default RangeIndex.
    x_train, y_train = x.iloc[train_index], y.iloc[train_index]
    x_valid, y_valid = x.iloc[valid_index], y.iloc[valid_index]
    train_data = lgb.Dataset(x_train, label=y_train)
    valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

    # Early stopping and logging are callbacks: the `early_stopping_rounds`
    # and `verbose_eval` arguments of lgb.train were removed in LightGBM 4.0.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1500,
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=150),
            lgb.log_evaluation(period=500),
        ],
    )
    y_pred = model.predict(x_test, num_iteration=model.best_iteration)
    predictions += y_pred
    features_importance['importance'] += model.feature_importance()

predictions /= n_folds
features_importance['importance'] /= n_folds

In [None]:
# Ten features with the highest fold-averaged importance.
features_importance.sort_values(by='importance', ascending=False).head(10)

In [None]:
# Pair each test ID with its averaged prediction and write the submission file.
test_id = test['ID_code']
submit = test_id.to_frame().assign(target=predictions)
submit.to_csv('./submit.csv', index=False)