In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## Memuat Dataset (train) 

In [None]:
%%time
# Read the competition training CSV from the Kaggle input directory into
# `train_df`; the %%time cell magic reports how long the read takes.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-nov-2021/train.csv")

In [None]:
# Set the maximum number of columns pandas will display, so wide frames
# are shown without column truncation.
pd.set_option("display.max_columns", 103)

## Exploratory Data Analysis  - EDA

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

In [None]:
# Print pandas' concise summary of the training frame.
train_df.info()

Di dalam dataset **train** kita memiliki 600000 baris dengan jumlah kolom total 102: 100 kolom fitur, 1 kolom `id`, dan 1 kolom `target`.  

Dilanjutkan dengan melihat *summary statistics* dari dataset.

In [None]:
%%time
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the training frame.
train_df.describe()

Kesimpulan awal dari *summary statistics*:  
1. Kemungkinan terdapat *outlier*, akan diamati lebih dengan visualisasi data.
2. Apakah akan diperlukan *normalization* atau *standardization*? Kita melihat terdapat kolom yang memiliki rentang nilai berbeda, misalkan beberapa kolom memiliki nilai maksimum / minimum ratusan (*hundreds*) bahkan ribuan (*thousands*), sedangkan sebagian besar kolom berada pada satuan (*ones*).

Melihat distribusi dari kolom `target`, apakah label yang kita miliki *balance* atau *imbalance*?

In [None]:
%%time
# Count the rows per `target` label to check how balanced the classes are.
train_df['target'].value_counts()

In [None]:
# Pie chart of the label counts; `autopct` annotates each wedge with its
# share of the total as a percentage with one decimal place.
train_df['target'].value_counts().plot(kind='pie', autopct='%.1f')

Proporsi jumlah label target **1** dan **0** terlihat seimbang: label **1** sekitar 50.6% dan label **0** sekitar 49.4% dari seluruh baris.

---

#### Mencoba menggunakan Vaex daripada Pandas  
tentang [Vaex](https://vaex.io/docs/index.html)

In [None]:
# import vaex

In [None]:
# %%time
# train_vdf = vaex.open("/kaggle/input/tabular-playground-series-nov-2021/train.csv")

In [None]:
# train_vdf.info(description=False)

In [None]:
# %%time
# train_vdf.describe()

In [None]:
# %%time
# train_vdf.target.value_counts()

In [None]:
# %%time
# train_vdf.f1.countmissing()

Tetap gunakan Pandas; untuk *single machine* seperti di sini performanya masih baik.

---

Mari kita lihat distribusi untuk setiap kolom fitur.

In [None]:
# Figure-level KDE plot of feature `f0`, with one filled density curve per
# `target` value.
sns.displot(data=train_df, x="f0", hue="target", kind="kde", fill=True)

Dalam membuat *multiple subplots* untuk plot dari *library* `seaborn` perhatikan level dari *function* yang digunakan. Contoh `seaborn.displot()` merupakan *figure-level*, sedangkan untuk *axes-level* dapat berupa `seaborn.histplot()` atau `seaborn.kdeplot()`.  

[seaborn is not plotting within defined subplots](https://stackoverflow.com/questions/63895392/seaborn-is-not-plotting-within-defined-subplots)

In [None]:
# Feature columns are everything except the `id` and `target` columns.
# Selecting by exclusion avoids the substring test `'f' in c`, which would also
# pick up any unrelated column whose name happens to contain an "f".
cols = [c for c in train_df.columns if c not in ('id', 'target')]

In [None]:
def plot_feature_kdes(features, nrows, ncols, figsize):
    """Draw one KDE panel per feature on a single ``nrows`` x ``ncols`` figure.

    Every panel shows the density of one ``train_df`` column, with one filled
    curve per ``target`` value. The axes-level ``sns.kdeplot`` is used with an
    explicit ``ax`` so each plot lands on its own subplot.

    Parameters
    ----------
    features : sequence of str
        Column names of ``train_df`` to plot, one per panel.
    nrows, ncols : int
        Shape of the subplot grid.
    figsize : tuple of float
        Figure size in inches, passed to ``plt.subplots``.

    Raises
    ------
    ValueError
        If there are more features than panels in the grid.
    """
    if len(features) > nrows * ncols:
        raise ValueError(
            f"{len(features)} features do not fit in a {nrows}x{ncols} grid"
        )

    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    panels = axes.ravel()
    for ax, feature in zip(panels, features):
        sns.kdeplot(data=train_df, x=feature, hue="target", fill=True, ax=ax)

    # Hide panels left over when there are fewer features than grid cells.
    for ax in panels[len(features):]:
        ax.set_visible(False)
    plt.show()


# Six 4x4 grids cover the first 96 features; a smaller 2x2 grid covers the
# remaining four.
for start in range(0, 96, 16):
    plot_feature_kdes(cols[start:start + 16], nrows=4, ncols=4, figsize=(20, 15))
plot_feature_kdes(cols[96:100], nrows=2, ncols=2, figsize=(12, 8))

Kesimpulan dari visualisasi distribusi:
* **Dalam diskusi**:
  - Keberadaan *outlier*
  - *Bimodality* (*multimodal distribution*)
  - Langkah selanjutnya

In [None]:
%%time
train_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Count duplicated rows, ignoring the `id` column when comparing rows.
train_df.drop(columns=['id']).duplicated().sum()

Tidak terdapat *missing values* dan duplikasi.

[List Highest Correlation Pairs from a Large Correlation Matrix in Pandas?](https://stackoverflow.com/questions/17778394/list-highest-correlation-pairs-from-a-large-correlation-matrix-in-pandas)

In [None]:
def get_redundant_pairs(df):
    """Return the diagonal and lower-triangular label pairs of ``df``'s columns.

    For every column, the result contains a ``(column, earlier_column)`` tuple
    for each column at or before it in ``df.columns`` order, so the pair of a
    column with itself is included.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are paired up.

    Returns
    -------
    set of tuple
        The ``(later, earlier)`` label pairs described above.
    """
    labels = list(df.columns)
    return {
        (later, earlier)
        for position, later in enumerate(labels)
        for earlier in labels[: position + 1]
    }

def get_top_abs_correlations(df, n=10):
    """Return the ``n`` largest absolute pairwise correlations in ``df``.

    Each unordered pair of columns is reported once and the correlation of a
    column with itself is excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr``.
    n : int, default 10
        Maximum number of pairs to return.

    Returns
    -------
    pandas.Series
        Absolute correlations sorted in descending order, indexed by a
        two-level index of column labels ``(first, second)`` where ``first``
        comes before ``second`` in the correlation matrix.
    """
    corr_abs = df.corr().abs()
    size = corr_abs.shape[0]

    # unstack() lays the matrix out column by column, so entry (c, r) sits at
    # position c * size + r. A strictly upper-triangular mask, flattened in row
    # order, therefore keeps exactly the entries with r > c: every pair once,
    # without the diagonal. Deriving the mask from the correlation matrix
    # itself keeps it in step with whatever columns corr() produced.
    keep = np.triu(np.ones((size, size), dtype=bool), k=1).ravel()

    au_corr = corr_abs.unstack()
    return au_corr[keep].sort_values(ascending=False).iloc[:n]

# Rank correlations among the features and the target only. The `id` column is
# dropped first, as it is for the duplicate check and before prediction
# elsewhere in this notebook (presumably it is just a row identifier), so it
# cannot show up in the ranking.
corr_input = train_df.drop(columns=["id"])
print("Top Absolute Correlations")
print(get_top_abs_correlations(corr_input, 10))

Korelasi antarfitur maupun korelasi fitur dengan target sama-sama lemah.

---

Rencana *splitting* dataset:
1. train test split 90:10
2. 10-fold cross validation

In [None]:
# from sklearn.model_selection import train_test_split

In [None]:
# X = train_df.drop(columns=['id', 'target'])
# y = train_df['target']

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=101)

Gunakan [FLAML](https://github.com/microsoft/FLAML) untuk *hyperparameter tuning* dan menggunakan [LightGBM](https://lightgbm.readthedocs.io/en/latest/index.html) untuk *training* model.

In [None]:
# install FLAML
!pip install FLAML[notebook] -q

In [None]:
# from flaml import AutoML
# automl =  AutoML()

Terjadi *error* dikarenakan perbedaan versi scikit-learn dengan *dependencies package*nya. Namun tetap digunakan untuk memuat model hasil FLAML (dikerjakan di luar Kaggle notebook).

Model didapatkan dari proses training dan *model selection* menggunakan FLAML pada platform [Deepnote](https://deepnote.com/project/AutoML-with-FLAML-_u3I3ayRRPWBj4XF8V4iOA/%2Fnotebook.ipynb).

In [None]:
import pickle

# Load the pre-trained model from the attached dataset. The `with` block closes
# the file handle once the model has been read.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load artifacts from a source you trust.
with open("../input/automl1-lgbm/automl1_lgbm.pkl", "rb") as model_file:
    model = pickle.load(model_file)

## Memuat Dataset test.csv dan sample_submission.csv

In [None]:
# Read the competition test set and the sample submission file; both paths are
# relative to the notebook's working directory.
test_df = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv")
sample_submission = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")

In [None]:
# Display the sample submission frame to see the expected output columns.
sample_submission

In [None]:
# Display the test frame.
test_df

In [None]:
# Score the test rows on every column except `id`, keeping the second
# probability column (index 1) returned by predict_proba.
test_features = test_df.drop(columns=['id'])
predictions_proba = model.predict_proba(test_features)[:, 1]

# Pair each id with its predicted probability and write the submission file.
submission_columns = {'id': test_df['id'], 'target': predictions_proba}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")