# Preparation

In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib                # For plotting and visualization
import matplotlib.pyplot as plt  
from pandas.plotting import parallel_coordinates
import seaborn as sns            # For statistical data visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# For machine learning

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, roc_auc_score, median_absolute_error, matthews_corrcoef,
                             f1_score, confusion_matrix, classification_report)
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier

from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense,Dropout,Input,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import optuna

In [None]:
# Read the competition's training split and preview its first rows
TRAIN_CSV = '/kaggle/input/playground-series-s4e8/train.csv'
df_train = pd.read_csv(TRAIN_CSV)
df_train.head()

In [None]:
# Read the competition's test split and preview its first rows
TEST_CSV = '/kaggle/input/playground-series-s4e8/test.csv'
df_test = pd.read_csv(TEST_CSV)
df_test.head()

# Data Cleaning

In [None]:
def fill_missing_with_mode(frame):
    """Return a copy of ``frame`` with missing values replaced by column modes.

    Only columns that actually contain missing values are processed. A column
    whose values are all missing has no mode and is left untouched.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to impute; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``frame``.
    """
    filled = frame.copy()
    for column in filled.columns[filled.isna().any()]:
        modes = filled[column].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to impute with
            continue
        # Assign the result back to the frame: calling fillna(inplace=True) on
        # a column selection does not update the parent frame under pandas
        # Copy-on-Write
        filled[column] = filled[column].fillna(modes.iloc[0])
    return filled


# Each frame is imputed with its own column modes
df_train = fill_missing_with_mode(df_train)
df_test = fill_missing_with_mode(df_test)


In [None]:
def replace_rare_values(frame, min_count=20, skip=('id',)):
    """Return a copy of ``frame`` with rare values replaced by the column mode.

    For every processed column, any value that occurs fewer than ``min_count``
    times is replaced with that column's most frequent value. Missing values
    are left as they are.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to clean; it is not modified.
    min_count : int, default 20
        Values occurring fewer than this many times are treated as rare.
    skip : iterable of str, default ('id',)
        Column names to leave untouched. Names absent from ``frame`` are
        ignored. The row identifier is skipped by default because each id
        occurs once, so it would otherwise be overwritten with a single value.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``frame``.
    """
    skipped = set(skip)
    cleaned = frame.copy()
    for column in cleaned.columns:
        if column in skipped:
            continue
        # Frequency of each distinct value (missing values are not counted)
        value_counts = cleaned[column].value_counts()
        low_frequency_values = value_counts[value_counts < min_count].index
        if low_frequency_values.empty:
            continue
        # The mode is taken before any replacement happens
        mode_value = cleaned[column].mode().iloc[0]
        # Vectorised replacement instead of a per-element Python lambda
        is_rare = cleaned[column].isin(low_frequency_values)
        cleaned[column] = cleaned[column].mask(is_rare, mode_value)
    return cleaned


# Each frame is cleaned using its own value frequencies
df_train = replace_rare_values(df_train)
df_test = replace_rare_values(df_test)

# Preprocessing

In [None]:
# Remove the 'id' column from both frames before building features
df_train, df_test = (frame.drop(columns=['id']) for frame in (df_train, df_test))

In [None]:
# Separate the 'class' target column from the feature matrix
y = df_train['class']
X = df_train.drop(columns=['class'])

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Add a ``<column>_freq`` feature holding each value's training-set count.

    Parameters
    ----------
    columns : list of str or None, default None
        Columns to encode. When None, every object- or category-dtype column
        of the frame passed to ``fit`` is encoded.

    Attributes
    ----------
    frequency_maps : dict
        Maps each encoded column name to the ``value_counts()`` Series learned
        in ``fit``. Empty until ``fit`` has been called.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.frequency_maps = {}

    def fit(self, X, y=None):
        """Learn the value counts of every encoded column of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        if self.columns is None:
            # Previously columns=None raised a TypeError when iterated
            columns = list(X.select_dtypes(include=['object', 'category']).columns)
        else:
            columns = list(self.columns)
        # Rebuild the dict so a refit never keeps maps from an earlier fit
        self.frequency_maps = {column: X[column].value_counts() for column in columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one ``<column>_freq`` column per encoded column.

        Values that were not seen during ``fit`` (including missing values)
        get a frequency of 0 rather than NaN.
        """
        X_transformed = X.copy()
        columns = self.frequency_maps.keys() if self.columns is None else self.columns
        for column in columns:
            # Raises KeyError if this column was never fitted
            frequency = self.frequency_maps[column]
            encoded = X_transformed[column].map(frequency)
            X_transformed[column + '_freq'] = encoded.fillna(0).astype('int64')
        return X_transformed

In [None]:
# Object-dtype features are frequency encoded; the 'class' target is excluded
freq_cols = [name for name in df_train.select_dtypes(include='object') if name != 'class']

# Every remaining feature column is treated as numerical
freq_col_set = set(freq_cols)
num_cols = [name for name in X.columns if name not in freq_col_set]

# Learn the value frequencies on the training features, then append the
# '<column>_freq' columns to both the training and the test frame
freq_encoder = FrequencyEncoder(columns=freq_cols)
X_freq_encoded = freq_encoder.fit(X).transform(X)
df_test_freq_encoded = freq_encoder.transform(df_test)

# Standardise the numerical columns and pass the frequency columns through
# unchanged; the original categorical columns are not listed, so they are dropped
freq_feature_cols = [col + '_freq' for col in freq_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('freq', 'passthrough', freq_feature_cols),
    ])

# Fit on the training data, reuse the fitted transformer for the test data,
# and wrap both resulting arrays in DataFrames with the generated feature names
X_processed = pd.DataFrame(
    preprocessor.fit_transform(X_freq_encoded),
    columns=preprocessor.get_feature_names_out(),
)
df_test_processed = pd.DataFrame(
    preprocessor.transform(df_test_freq_encoded),
    columns=preprocessor.get_feature_names_out(),
)

# From here on X and df_test refer to the processed frames
X, df_test = X_processed, df_test_processed

# Deep Learning

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for validation, using a fixed seed for the split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=2)

# Encode the 'class' target as integers; the encoder is fitted on the
# training labels and then applied to both label sets
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_val = le.transform(y_val)

# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created so the weight initialisation and dropout start from the same state on
# every run (bit-exact repeatability may still depend on the hardware)
SEED = 2
tf.keras.utils.set_random_seed(SEED)

# One entry per hidden block: (units, activation, dropout rate).
# Each block is Dense -> BatchNormalization -> Dropout; a dropout rate of None
# means the block has no Dropout layer.
HIDDEN_BLOCKS = [
    (512, 'linear', 0.36),
    (256, 'relu', 0.3),
    (128, 'relu', 0.24),
    (64, 'relu', 0.2),
    (32, 'relu', 0.16),
    (16, 'relu', 0.12),
    (8, 'relu', None),
]

# Define model: the input width follows the number of processed feature columns
model_layers = [Input(shape=(X_train.shape[1],))]
for units, activation, dropout_rate in HIDDEN_BLOCKS:
    model_layers.append(Dense(units, activation=activation))
    model_layers.append(BatchNormalization())
    if dropout_rate is not None:
        model_layers.append(Dropout(dropout_rate))
# A single sigmoid unit outputs the probability of the class encoded as 1
model_layers.append(Dense(1, activation='sigmoid'))

model = Sequential(model_layers)

# Compile model for binary classification
little_adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=little_adam, metrics=['accuracy'])

# The learning-rate patience must be shorter than the early-stopping patience.
# With both at 25 the rate was only reduced on the very epoch training stopped,
# so the reduction never had any effect.
EARLY_STOPPING_PATIENCE = 25
REDUCE_LR_PATIENCE = 8

# Callbacks for early stopping and learning rate reduction, both watching val_loss
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=EARLY_STOPPING_PATIENCE,
                               restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.15,
                              patience=REDUCE_LR_PATIENCE,
                              min_lr=1e-6)

# Train model; the held-out split is used for the monitored validation metrics
history = model.fit(X_train, y_train,
                    epochs=50,
                    batch_size=128,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])



# Plot training & validation curves side by side: loss on the left, accuracy on the right
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (loss_ax, 'loss', 'val_loss', 'Model Loss', 'Loss'),
    (accuracy_ax, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
)
for ax, train_key, val_key, title, y_label in panels:
    ax.plot(history.history[train_key])
    ax.plot(history.history[val_key])
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend(['Train', 'Validation'])

plt.show()

# Submission

In [None]:
# Threshold the predicted probabilities at 0.5 to get integer class codes
test_probabilities = model.predict(df_test_processed).ravel()
test_predictions = (test_probabilities > 0.5).astype(int)

df_sub = pd.read_csv('/kaggle/input/playground-series-s4e8/sample_submission.csv')
# Decode with the fitted LabelEncoder rather than a hard-coded 0/1 -> 'e'/'p'
# mapping, so the labels stay correct if the encoding ever changes
df_sub['class'] = le.inverse_transform(test_predictions)
df_sub.to_csv('submission_nn.csv', index=False)