In [None]:
import os
import random

import numpy as np
import pandas as pd

# Scaling
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer

# Model
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras import optimizers

# Metrics and validation
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold

# History
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
my_seed = 42


def seedAll(seed):
    """Seed every random number source this notebook touches.

    Covers the PYTHONHASHSEED environment variable, Python's `random` module,
    NumPy's global generator and TensorFlow's global generator.

    Args:
        seed: Integer seed applied to all of the sources above.
    """
    # Stored as a string because environment variables can only hold text.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


seedAll(my_seed)

In [None]:
# Load the training CSV of the Tabular Playground Series (Nov 2021) data set.
train = pd.read_csv("../input/tabular-playground-series-nov-2021/train.csv")

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

## Split train & valid

In [None]:
# Remove the 'id' column so it is not treated as a feature by the positional
# feature/label slicing further down (presumably it is just a row identifier).
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column has already been dropped.
train = train.drop(columns='id', errors='ignore')

In [None]:
# Fraction of the rows held out for validation, and the matching row count
# (truncated towards zero).
valid_ratio = 0.1
valid_len = int(len(train) * valid_ratio)

In [None]:
# Shuffle all rows. The explicit random_state makes the permutation independent
# of how much of the global NumPy random stream earlier cells have consumed.
train = train.sample(frac=1, random_state=my_seed)

# The first `valid_len` shuffled rows become the hold-out set, the rest stay in
# `train`. NOTE: `train` is rebound here, so re-running this cell without
# reloading the data would split the already reduced frame again.
valid = train.iloc[:valid_len]
train = train.iloc[valid_len:]

# The last column is used as the label; every other column is a feature.
valid_X, valid_Y = valid.iloc[:, :-1], valid.iloc[:, -1]
train_X, train_Y = train.iloc[:, :-1], train.iloc[:, -1]
print('valid_len : {} / train_len : {}'.format(len(valid_X), len(train_X)))

In [None]:
# Preview the validation features to check the split kept only feature columns.
valid_X.head()

In [None]:
def f1_score(y, pred):
    """Compute the binary F1 score of `pred` against the true labels `y`.

    Labels are expected to be 0 (negative) and 1 (positive). The confusion
    counts and the derived recall / precision are printed as a side effect.

    Args:
        y: True labels (list, NumPy array or pandas Series). Values are read
            positionally, so a Series with a shuffled index is handled
            correctly.
        pred: Predicted labels, same number of elements as `y`.

    Returns:
        The F1 score as a float. Recall, precision and the F1 score itself
        fall back to 0.0 when their denominator is zero.

    Raises:
        ValueError: If `y` and `pred` do not contain the same number of
            elements.
    """
    # np.asarray drops any pandas index so the comparison below is positional.
    y_true = np.asarray(y).ravel()
    y_pred = np.asarray(pred).ravel()

    if y_true.size != y_pred.size:
        # Raising a plain string is itself a TypeError in Python 3.
        raise ValueError(
            "Difference size: len(y)={} / len(pred)={}".format(y_true.size, y_pred.size)
        )

    agree = y_true == y_pred
    TN = int(np.sum(agree & (y_pred == 0)))
    TP = int(np.sum(agree & (y_pred == 1)))
    FN = int(np.sum(~agree & (y_pred == 0)))
    # Everything that is not TN / TP / FN is counted as a false positive,
    # mirroring the catch-all `else` branch of the original loop.
    FP = int(y_true.size) - TN - TP - FN

    # Guard each ratio: no positives in `y` (recall) or in `pred` (precision)
    # would otherwise raise ZeroDivisionError.
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    print('TN : {} / TP : {} / FN : {} / FP : {}'.format(TN, TP, FN, FP))
    print('recall : {} / precision : {}'.format(recall, precision))

    if precision + recall == 0:
        return 0.0
    return (2 * precision * recall) / (precision + recall)

## Model (Random Forest)

In [None]:
# Random-forest baseline.
# random_state pins the bootstrap samples and feature sub-sampling, so re-running
# this cell yields the same model instead of depending on the global RNG state.
# n_jobs=-1 trains the trees on all available cores.
forest = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    min_samples_split=3,
    random_state=my_seed,
    n_jobs=-1,
)
forest.fit(train_X, train_Y)

In [None]:
features = train_X.columns
importances = forest.feature_importances_

# np.argsort sorts ascending, so the last ten positions point at the ten
# largest importances (smallest of the ten first).
indices = np.argsort(importances)[-10:]

In [None]:
# Bar chart of the ten largest feature importances of the fitted forest.
plt.figure(figsize=(8,8))
plt.title('Feature Importances TOP 10')
# x / y must be passed by keyword: seaborn 0.12+ no longer accepts the data
# vectors as positional arguments and raises TypeError.
sns.barplot(x=importances[indices], y=[features[i] for i in indices], palette="YlOrRd")
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# Hold-out accuracy of the forest, as a percentage rounded to two decimals.
# (accuracy_score is imported with the other metrics in the import cell.)
pred_Y = forest.predict(valid_X)
round(accuracy_score(valid_Y.values, pred_Y) * 100, 2)

In [None]:
# Score the forest's hard label predictions on the hold-out set with the
# notebook's own F1 helper; the labels are passed as a plain array.
pred_Y = forest.predict(valid_X)
f1_forest = f1_score(valid_Y.to_numpy(), pred_Y)
print('f1 forest : ', f1_forest)

## Model (NN)

In [None]:
# def build_model():
#     early_stopping = callbacks.EarlyStopping(
#         patience=20,
#         min_delta=0,
#         monitor='val_loss',
#         restore_best_weights=True,
#         verbose=0,
#         mode='min', 
#         baseline=None,
#     )

#     plateau = callbacks.ReduceLROnPlateau(
#             monitor='val_loss', 
#             factor=0.5, 
#             patience=5, 
#             verbose=0,
#             mode='min')
    
#     model = keras.Sequential([
#         layers.Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(100, 1)),
#         layers.MaxPooling1D(pool_size=2),
#         layers.Flatten(),
#         layers.Dropout(0.3),
#         layers.Dense(100, activation='relu'),
#         layers.Dense(32, activation='relu'),
#         layers.Dense(1, activation='sigmoid')
#     ])
    
#     optimizer = optimizers.Adam(learning_rate=1e-3)
    
#     model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['AUC'])
#     return model, early_stopping, plateau

In [None]:
# ## from sklearn.model_selection import StratifiedKFold
# kfold = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
# models = []
# histories = []

# for i, (train_idx, valid_idx) in enumerate(kfold.split(train_X.values, train_Y.values)):
#     X_train, X_valid = train_X.iloc[train_idx], train_X.iloc[valid_idx]
#     Y_train, Y_valid = train_Y.iloc[train_idx], train_Y.iloc[valid_idx]
# #     print(X_train.shape)
#     X_train = np.expand_dims(X_train, axis=2)
#     X_valid = np.expand_dims(X_valid, axis=2)
#     print(X_train.shape)
    
#     # fit
#     model, early_stopping, plateau = build_model()
#     history = model.fit(X_train, Y_train,
#             validation_data = (X_valid, Y_valid),
#             batch_size = 64, 
#             epochs = 30,
#             callbacks = [early_stopping, plateau],
#             shuffle = True,
#             verbose = 2
#             )
#     print('='*15 + 'Fold {} end '.format(i+1) + '='*10)
    
#     # history
#     models.append(model)
#     histories.append(history)
    

## History

In [None]:
# fig, ax = plt.subplots(3, 2, tight_layout=True, figsize=(15, 10))
# for idx, history in enumerate(histories):
#     ax[idx][0].plot(history.history['auc'], 'b', label='train acc')
#     ax[idx][0].plot(history.history['val_auc'], 'g', label='valid auc')
#     ax[idx][0].legend(loc='upper left')
#     ax[idx][0].set_title('epoch {} acc'.format(idx+1))
    
#     ax[idx][1].plot(history.history['loss'], 'r', label='train loss')
#     ax[idx][1].plot(history.history['val_loss'], 'y', label='valid loss')
#     ax[idx][1].legend(loc='upper right')
#     ax[idx][1].set_title('epoch {} loss'.format(idx+1))

## Predict

In [None]:
# Load the test set to predict on and the sample submission file, whose
# 'target' column is overwritten with the model's predictions below.
test = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv")
sub = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")

In [None]:
# Select exactly the columns the model was trained on, by name and in the same
# order. The former positional slice `test.iloc[:, :-1]` assumed the last column
# was a label to discard; if test.csv carries `id` plus the features and no label
# (usual for a submission set - TODO confirm), that slice keeps `id` and drops
# the last real feature, so every column reaches the model shifted by one.
# A missing feature column now fails loudly with KeyError.
sub_X = test[train_X.columns]
print('test_len : {}'.format(len(test)))

In [None]:
# forest_Y = forest.predict(sub_X)
# models_Y = [forest_Y]

# for model in models:
#     pred = model.predict(np.expand_dims(sub_X, axis=2))
#     pred = pred.reshape(1, len(pred))
#     models_Y.append(pred)

# models_Y = np.array(models_Y)

In [None]:
# sub['target'] = np.mean(models_Y, axis = 0)[0]
# sub.to_csv('submission.csv', index=False)
# sub.head()

In [None]:
# Write the predicted probability of class 1 instead of hard 0/1 labels.
# A ranking metric such as ROC AUC (imported above, and the metric tracked by the
# disabled NN cell) needs scores; hard labels collapse every row to two values.
# NOTE(review): confirm the competition is scored on a probability-based metric.
positive_col = list(forest.classes_).index(1)  # column of predict_proba that belongs to label 1
forest_Y = forest.predict_proba(sub_X)[:, positive_col]
sub['target'] = forest_Y
sub.to_csv('submission.csv', index=False)
sub.head()