In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import gc
import random
import os
# Recursively walk the Kaggle input mount and print the full path of every file found.
input_root = '/kaggle/input'
for folder, _subfolders, file_names in os.walk(input_root):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from IPython import display as ipd

from pandas_profiling import ProfileReport as profile

import pkg_resources as pkg
# Record which pandas_profiling release is installed (looked up through pkg_resources)
# so the run log shows the version the report cell below would use.
print( f"pandas_profiling version: {pkg.get_distribution('pandas_profiling').version}")

from tqdm import tqdm
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve, auc, cohen_kappa_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Seed handed to seeding() below.
RANDOM_SEED = 42
# When True, only the first 50000 training rows are kept after loading (fast iteration).
DEBUG = False
# When True, the EDA cell builds and displays a pandas_profiling report of the training data.
PROFILE = False

def seeding(SEED, use_tf=False):
    """Seed the random number generators this notebook relies on.

    Parameters
    ----------
    SEED : int
        Seed applied to NumPy's global RNG and Python's ``random`` module, and
        exported as the ``PYTHONHASHSEED`` environment variable.
    use_tf : bool, default False
        When True, TensorFlow is imported on demand and its global seed is set too.

    Notes
    -----
    Setting ``PYTHONHASHSEED`` here only affects Python processes started
    afterwards; the hash seed of the already-running kernel does not change.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # TF_CUDNN_DETERMINISTIC is an on/off switch, not a seed: it must be '1'
    # (or 'true'). The previous str(SEED) value (e.g. '42') is not a valid boolean.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    if use_tf:
        # Imported lazily: the notebook never imports TensorFlow at the top, so
        # referring to a global `tf` here used to raise NameError.
        import tensorflow as tf
        tf.random.set_seed(SEED)
    print('seeding done!!!')

# Fix the RNG state before anything stochastic happens.
seeding(RANDOM_SEED)

# Competition files: labelled training rows, unlabelled test rows, submission template.
train = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/test.csv')
submission = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv')

# Debug runs keep only the leading 50000 training rows.
if DEBUG:
    train = train.iloc[:50000]

# Separate the label, then strip the non-feature columns from both frames.
target = train['target']
train = train.drop(columns=['id', 'target'])
test = test.drop(columns=['id'])

In [None]:
# Show (rows, columns) of both feature frames.
for label, frame in (('train:', train), ('test:', test)):
    print(label, frame.shape)

## EDA

In [None]:
%%time

# Optional EDA: only when PROFILE is enabled, build a pandas_profiling report
# (minimal=True) of the training features and render it in the cell output.
if PROFILE:
    train_profile = profile(train, title="Train Data", minimal=True)
    # NOTE(review): `display` is not imported by name in this notebook (IPython.display
    # is imported as `ipd`); this presumably relies on the `display` name that IPython
    # provides inside notebook sessions — confirm.
    display(train_profile)

## Dealing with skewed data

In [None]:
from sklearn.preprocessing import MinMaxScaler

def minmax_scale(df, cols, scaler=None):
    """Min-max scale the given columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``cols`` are overwritten with their scaled values.
    cols : list of str
        Names of the columns to rescale. Each column is scaled independently.
    scaler : MinMaxScaler, optional
        An already-fitted scaler to reuse. When omitted, a new scaler is fitted
        on ``df[cols]`` itself, which matches the original behaviour.

    Returns
    -------
    MinMaxScaler or None
        The scaler that was applied, so the identical mapping can be reused on
        another frame. ``None`` only when ``cols`` is empty and no scaler was given.
    """
    cols = list(cols)
    if not cols:
        # Nothing to scale; keep the old "no columns -> no-op" behaviour.
        return scaler
    if scaler is None:
        scaler = MinMaxScaler().fit(df[cols].values)
    df[cols] = scaler.transform(df[cols].values)
    return scaler

skewed_cols = ['f46', 'f59', 'f89']
# Learn the min/max on the training data only and apply that same mapping to the
# test set. Fitting a second scaler on the test frame (as before) maps identical raw
# values to different scaled values in train and test, so the model would be trained
# and evaluated on inconsistent features.
# NOTE(review): min-max scaling is a linear map, so it changes the range of these
# columns but not the shape (skewness) of their distributions.
skew_scaler = minmax_scale(train, skewed_cols)
minmax_scale(test, skewed_cols, scaler=skew_scaler)

## PCA

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature with min/max statistics learned from the training set only,
# then project both sets onto the first two principal components for a visual check.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

pca = PCA(n_components=2)
X_pca_train = pca.fit_transform(train_scaled)
X_pca_test = pca.transform(test_scaled)

f, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize=(15, 6))
ax1.scatter(X_pca_train[:,0],X_pca_train[:,1],c=target,cmap='rainbow')
ax1.set(title='train (coloured by target)', xlabel='PC 1', ylabel='PC 2')
# The test set has no labels to colour by. Passing `cmap` without `c` is ignored by
# matplotlib and only triggers a warning, so the argument is dropped here.
ax2.scatter(X_pca_test[:,0],X_pca_test[:,1])
ax2.set(title='test', xlabel='PC 1', ylabel='PC 2')
plt.show()

In [None]:
# Cumulative share of variance captured by the fitted PCA components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..n so the x axis really is the number of components; without explicit
# x values matplotlib uses the 0-based index, which is off by one for this label.
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
# Class balance of the target: number of rows per class, ordered by class label.
# Uses Series.value_counts (as elsewhere in this notebook) instead of the deprecated
# top-level pd.value_counts helper.
count_classes = target.value_counts(sort=True).sort_index()
count_classes.plot(kind = 'bar')
plt.title("histogram")
plt.xlabel("target")  # label typo ("traget") fixed
plt.ylabel("Frequency");

In [None]:
%%time

def _lgb_callbacks(verbose_eval, early_stopping_rounds, evals_result):
    """Translate the legacy ``lgb.train`` keyword options into callback objects."""
    callbacks = [lgb.record_evaluation(evals_result)]

    # Same interpretation the old `verbose_eval` keyword had:
    # True -> log every round, positive int n -> log every n rounds, otherwise silent.
    if verbose_eval is True:
        period = 1
    elif isinstance(verbose_eval, int) and verbose_eval > 0:
        period = verbose_eval
    else:
        period = 0
    if period:
        # `log_evaluation` is the current name; older LightGBM releases only ship
        # `print_evaluation`, so fall back to it when the new name is missing.
        log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
        callbacks.append(log_evaluation(period))

    if early_stopping_rounds is not None and early_stopping_rounds > 0:
        callbacks.append(lgb.early_stopping(early_stopping_rounds, verbose=bool(verbose_eval)))
    return callbacks


def run_train(X, y, run_params, splits, num_boost_round, verbose_eval, early_stopping_rounds ):
    """Train one LightGBM booster per K-fold split and score it on the held-out fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target aligned row-for-row with ``X``.
    run_params : dict
        Parameters forwarded unchanged to ``lgb.train``.
    splits : int
        Number of folds for ``KFold`` (default settings, i.e. no shuffling).
    num_boost_round : int
        Upper bound on boosting rounds per fold.
    verbose_eval : bool or int
        ``True`` logs metrics every round, a positive integer ``n`` every ``n``
        rounds, anything else disables logging.
    early_stopping_rounds : int or None
        Early-stopping patience in rounds; ``None`` or a non-positive value disables it.

    Returns
    -------
    scores : list of float
        ROC AUC of each fold's model on its validation fold.
    models : list
        The trained booster of each fold, in fold order.
    evals_results : dict
        Metric history recorded during training, under the dataset name ``"valid"``.
        The same dict is handed to every fold and is reset by each new recording
        callback, so after the loop it holds the history of the last fold only.
    """
    scores = []
    models = []
    evals_results = {}  # metric history for plotting (last fold only, see docstring)
    folds = KFold(n_splits=splits)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print(f'Fold {fold_n+1} started')
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        model = lgb.train(
            run_params,
            train_set=lgb.Dataset(X_train, y_train),
            num_boost_round=num_boost_round,
            valid_sets=[lgb.Dataset(X_valid, y_valid)],
            # Exactly one validation set is passed, so exactly one name is given.
            # The former ["train", "valid"] list labelled the validation metrics "train".
            valid_names=["valid"],
            # Logging, recording and early stopping go through callbacks; the
            # verbose_eval / evals_result / early_stopping_rounds keywords are no
            # longer accepted by current LightGBM releases.
            callbacks=_lgb_callbacks(verbose_eval, early_stopping_rounds, evals_results),
        )

        y_predicted = model.predict(X_valid)
        score = roc_auc_score(y_valid, y_predicted)
        print(f'roc_auc_score: {score}')

        models.append(model)
        scores.append(score)
    return scores, models, evals_results


# --- Booster hyper-parameters ---
LEARNING_RATE = 0.00497
NUM_LEAVES = 250
MAX_DEPTH = -1

# --- Cross-validation / training-loop settings ---
TOTAL_SPLITS = 6
NUM_BOOST_ROUND = 4000
EARLY_STOPPING_ROUNDS = 100
VERBOSE_EVAL = 250

# Weight the positive class by the negative/positive row ratio.
class_counts = target.value_counts()
negative, positive = class_counts[0], class_counts[1]
scale_pos_weight = negative / positive

# NOTE(review): 'bagging_freq' is not set; LightGBM presumably ignores
# 'bagging_fraction' unless a non-zero bagging frequency is given — confirm.
run_params = {
    'verbose': -1,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['auc', 'binary_logloss'],
    'learning_rate': LEARNING_RATE,
    'num_leaves': NUM_LEAVES,
    'scale_pos_weight': scale_pos_weight,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'max_depth': MAX_DEPTH,
}

# Names of all columns whose name begins with 'f'.
FEATURES = train.columns[train.columns.str.startswith('f')].tolist()

# Cross-validated training over the full training frame.
scores, models, evals_results = run_train(
    train, target, run_params,
    TOTAL_SPLITS, NUM_BOOST_ROUND, VERBOSE_EVAL, EARLY_STOPPING_ROUNDS,
)
cv_mean, cv_std = np.mean(scores), np.std(scores)
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(cv_mean, cv_std))

## Plotting metrics recorded during training

In [None]:
# One learning-curve figure per metric recorded during training.
for metric_name in ('auc', 'binary_logloss'):
    ax = lgb.plot_metric(evals_results, metric=metric_name)
    plt.show()

In [None]:
# Score the test set with every fold's model.
predicted = []
for model in models:
    fold_scores = model.predict(test)
    predicted.append(fold_scores)

# Sum the per-fold predictions element-wise, then divide by the number of models.
n_rows = len(predicted[0])
avg_preds = np.zeros(n_rows)
for pred in predicted:
    avg_preds = avg_preds + pred
avg_pred = avg_preds / len(models)

In [None]:
# Put the fold-averaged predictions into the template's 'target' column, write the
# file with six decimals, and preview the first 20 rows.
submission = submission.assign(target=avg_pred)
submission.to_csv('submission.csv', index=False, float_format='%.6f')
submission.head(20)