**<h2 style="color:#0d1a75;"> Import Libraries</h2>**

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input/'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import scipy
import scipy.stats

from matplotlib import pyplot as plt
import seaborn as sns

import warnings

# Notebook-wide display settings:
#   * show every column when a DataFrame is rendered,
#   * silence all warnings (this also hides deprecation notices from the
#     plotting libraries, so re-enable them when debugging).
pd.set_option('display.max_columns', None)
warnings.simplefilter('ignore')

In [None]:
# Locations of the competition files inside the Kaggle input directory.
TRAIN = '/kaggle/input/tabular-playground-series-may-2022/train.csv'
TEST = '/kaggle/input/tabular-playground-series-may-2022/test.csv'
SUBMISSION = '/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv'

**<h2 style="color:#0d1a75"> Load Data</h2>**

In [None]:
# Read the three competition CSVs into DataFrames in one pass.
train, test, sub = (pd.read_csv(csv_path) for csv_path in (TRAIN, TEST, SUBMISSION))

In [None]:
# Stack train and test into one frame so EDA and feature engineering can be
# applied to both at once.
#   * `target` is unknown for the test rows, so it is filled with NaN.
#     (`np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.)
#   * `train_test` remembers which split each row came from.
test['target'] = np.nan

train['train_test'] = 'train'
test['train_test'] = 'test'

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of `train` and `test` would be repeated in `df`.
df = pd.concat((train, test), axis=0, ignore_index=True)

In [None]:
# Preview five random training rows; the fixed seed keeps the preview
# identical across "Restart & Run All".
train.sample(5, random_state=101)

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Missing-value counts per column.
# `test` already carries the placeholder `target` column (all NaN) that was
# added before the frames were combined; it is excluded here so the report
# reflects only the columns that actually came from the test file.
print("Nulls in train data\n")
display(train.isnull().sum())
print("\n\nNulls in test data\n")
display(test.drop(columns=['target'], errors='ignore').isnull().sum())

* <font size=3>There are no nulls in our dataset</font>

In [None]:
# Count duplicated training rows on the data columns only.
# `id` (presumably a unique row identifier — confirm) and the helper column
# `train_test` are left out: with a unique `id` included, no two rows could
# ever compare equal and the check would always report 0.
train_data_cols = train.drop(columns=['id', 'train_test'], errors='ignore')
print(f'Duplicate rows in train data set: {train_data_cols.duplicated().sum()}')

In [None]:
# Class balance of the label. Rows that came from the test file carry a NaN
# target in the combined frame `df`.
sns.countplot(x='target', data=df)

* <font size=3>Classes are almost balanced</font>

* <font size=3>There are no duplicated rows in our training set</font><br>
We can start the data analysis process.

**<h2 style="color: #0d1a75;"> EDA </h2>**

In [None]:
# All feature columns (names containing "f_").
f_columns = df.filter(regex='f_').columns

# Violin plots need numeric data. Non-numeric feature columns (the string
# column f_27) are filtered out explicitly instead of being swallowed by a
# bare `except`, so genuine plotting errors are no longer hidden and no axis
# is wasted on a column that cannot be drawn.
numeric_f_columns = [col for col in f_columns
                     if pd.api.types.is_numeric_dtype(df[col])]

fig, axs = plt.subplots(6, 5, figsize=(20, 10))
axes = axs.flatten()

# Compare each feature's distribution between the train and test rows.
for ax, col in zip(axes, numeric_f_columns):
    sns.violinplot(x='train_test', y=col, data=df, ax=ax)
    ax.set_title(col)

# Hide any axes left over when there are fewer columns than grid cells.
for ax in axes[len(numeric_f_columns):]:
    ax.set_visible(False)

fig.tight_layout()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the string column f_27 only for this plot, to compare its
# distribution between the train and test rows.
OE = OrdinalEncoder()
f_27_codes = OE.fit_transform(df.f_27.values.reshape(-1, 1)).ravel()

# The codes go into a small throw-away frame instead of being added to and
# then dropped from `df`: if the plot raises, `df` is not left with a stray
# `f_27_en` column that would leak into later cells.
f_27_plot_data = pd.DataFrame({
    'train_test': df['train_test'].values,
    'f_27_en': f_27_codes,
})

sns.boxplot(x='train_test', y='f_27_en', data=f_27_plot_data);

In [None]:
%%time

# Frequency tables are computed once; the original recomputed the test
# value counts on every loop iteration.
train_f_27_counts = train.f_27.value_counts()
test_f_27_counts = test.f_27.value_counts()

# The 20 most frequent f_27 strings in train that also occur in test.
comman_idx = [idx for idx in train_f_27_counts.index[:20]
              if idx in test_f_27_counts.index]

# Side-by-side counts; `keys` labels the two columns so they can be told apart.
pd.concat([train_f_27_counts[comman_idx], test_f_27_counts[comman_idx]],
          axis=1, keys=['train', 'test'])

In [None]:
# Per-feature distributions split by target class.
# Only numeric feature columns can be drawn; selecting them explicitly
# replaces the bare `except` that silently skipped the string column f_27
# (and would have hidden any other plotting error as well).
numeric_f_columns = [col for col in f_columns
                     if pd.api.types.is_numeric_dtype(train[col])]

fig, axs = plt.subplots(6, 5, figsize=(20, 10))
axes = axs.flatten()

for ax, col in zip(axes, numeric_f_columns):
    # `histplot` replaces the deprecated `distplot`. stat='density' with
    # common_norm=False normalises each class separately; bins=50 mirrors
    # the bin cap `distplot` applied. The hue legend is added by seaborn.
    sns.histplot(data=train, x=col, hue='target', stat='density',
                 common_norm=False, kde=True, bins=50, ax=ax)
    ax.set_title(col)

# Hide any axes left over when there are fewer columns than grid cells.
for ax in axes[len(numeric_f_columns):]:
    ax.set_visible(False)

fig.tight_layout()

In [None]:
# Probability plots (via scipy.stats.probplot) for three hand-picked features,
# drawn side by side, one axis per feature.
non_normal_col = ['f_23', 'f_26', 'f_28']

fig, axs = plt.subplots(1, 3, figsize=(20, 8))
for ax, col in zip(axs.flatten(), non_normal_col):
    scipy.stats.probplot(df[col], plot=ax)
    ax.set_title(col)

<h1 style="color:#900C3F;">Feature Engineering</h1>

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder

def mode(s):
    """Return the most frequent element of ``s``.

    Ties are broken deterministically in favour of the element that appears
    first in ``s``. (The previous implementation iterated over ``set(s)``,
    whose order for strings can change between interpreter runs, so tied
    inputs could yield different results from one run to the next.)

    Parameters
    ----------
    s : str or other iterable of hashable items
        The sequence to inspect.

    Returns
    -------
    The element of ``s`` with the highest count.

    Raises
    ------
    ValueError
        If ``s`` is empty.
    """
    # Counter keeps keys in first-seen order and counts in a single pass.
    counts = Counter(s)
    if not counts:
        raise ValueError("mode() arg is an empty sequence")
    # max() returns the first maximal key it meets, i.e. the earliest-seen
    # element among those sharing the highest count.
    return max(counts, key=counts.get)

def Feature_Engineering(df, n_chars=10):
    """Add features derived from the string column ``f_27`` to ``df``.

    ``df`` is modified in place and also returned, so the usual call is
    ``df = Feature_Engineering(df)``.

    Columns added, in this order:

    * ``list_27`` – each ``f_27`` string as a NumPy array of characters
      (kept because later cells drop this column by name);
    * ``f_27_1`` … ``f_27_<n_chars>`` – the character at each position,
      encoded as an integer with ``'A'`` -> 0 … ``'Z'`` -> 25;
    * ``nunique_chars`` – number of distinct characters in the string;
    * ``f_27_mode`` – most frequent character of the string (see ``mode``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column ``f_27``.
    n_chars : int, default 10
        Number of leading character positions to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns.

    Raises
    ------
    ValueError
        If any string is shorter than ``n_chars`` or holds a character
        outside ``A``–``Z`` at one of the encoded positions.
    """
    f_27 = df['f_27']

    df['list_27'] = f_27.apply(lambda x: np.array(list(x)))

    # Same mapping a LabelEncoder fitted on the upper-case alphabet produces
    # (classes are sorted, so 'A' -> 0 ... 'Z' -> 25).
    letter_codes = {letter: code
                    for code, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ')}

    for i in range(n_chars):
        # Vectorised positional access instead of a per-row Python lambda;
        # yields NaN where the string is too short.
        codes = f_27.str[i].map(letter_codes)
        if codes.isna().any():
            bad_values = f_27[codes.isna()].unique()[:5].tolist()
            raise ValueError(
                f"f_27 has no character in A-Z at position {i + 1} "
                f"for values such as {bad_values}"
            )
        df[f'f_27_{i+1}'] = codes.astype('int64')

    df['nunique_chars'] = f_27.apply(lambda x: len(set(str(x))))

    df['f_27_mode'] = f_27.apply(mode)

    return df

# Add the f_27-derived columns to the combined train+test frame.
df = Feature_Engineering(df)

<h1 style="color:#900C3F;"> EDA After Feature Engineering </h1>

In [None]:
# Number of distinct characters in f_27, split by target.
# Columns are passed by keyword (`data=`, `x=`, `hue=`): seaborn >= 0.12
# treats the first positional argument as `data`, so the former positional
# Series would no longer be used as the x variable.
plt.figure(figsize=(16, 6))
sns.countplot(data=df, x='nunique_chars', hue='target')
plt.title('Distribution of No. Unique Characters in f_27', size=20)
plt.ylabel('Count', size=15)
plt.xlabel('No. Characters', size=15);

In [None]:
# Distribution of the encoded character at each of the first ten f_27
# positions.
fig, axs = plt.subplots(2, 5, figsize=(20, 12))
cols = df.filter(regex='f_27_').columns
for ax, col in zip(axs.flatten(), cols[:10]):
    # `histplot` replaces the deprecated `distplot`. The values are integer
    # letter codes, so one bar per code (discrete=True) is drawn instead of
    # a smoothed density estimate.
    sns.histplot(data=df, x=col, discrete=True, stat='density', ax=ax)
    ax.set_title(col)

fig.suptitle("Distribution of Characters in each position");

In [None]:
# Most frequent f_27 character per row, split by target.
# Keyword arguments are used because seaborn >= 0.12 treats the first
# positional argument as `data`; bars are shown in alphabetical order.
plt.figure(figsize=(20, 6))
sns.countplot(data=df, x='f_27_mode', hue='target',
              order=sorted(df['f_27_mode'].unique()))
plt.legend(loc='upper right', title='target');

<h2 style="color: #900C3F ;"> Train Model</h2>

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the most-frequent-character feature.
OHE = OneHotEncoder()
ohe_cols = OHE.fit_transform(df['f_27_mode'].values.reshape(-1, 1)).toarray()

# Name each indicator column after its category instead of the default
# integer labels 0..k-1, so `df` keeps string-only, self-describing columns.
ohe_names = [f'f_27_mode_{category}' for category in OHE.categories_[0]]

# Dropping any indicator columns left by a previous run keeps this cell
# idempotent (re-running it no longer appends duplicate columns).
df = pd.concat(
    (df.drop(columns=ohe_names, errors='ignore'),
     pd.DataFrame(ohe_cols, index=df.index, columns=ohe_names)),
    axis=1,
)

In [None]:
# Columns that identify rows or hold raw/intermediate f_27 data and must not
# be fed to the model.
non_feature_cols = ['id', 'train_test', 'f_27', 'list_27', 'f_27_mode']

# Split the combined frame back into its two parts and strip those columns.
train = df.loc[df.train_test == 'train'].drop(labels=non_feature_cols, axis=1)
test = df.loc[df.train_test == 'test'].drop(labels=non_feature_cols, axis=1)

# Plain NumPy arrays for Keras: features without the label, and the label.
X_train = train.drop(['target'], axis=1).values
y_train = train['target'].values

X_test = test.drop(['target'], axis=1).values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation (fixed seed).
# NOTE: X_train / y_train are overwritten with the reduced training part, so
# running this cell twice shrinks the training set again.
holdout_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=101)
X_train, X_val, y_train, y_val = holdout_parts

In [None]:
import tensorflow as tf
from tensorflow import keras as keras
from keras import layers

In [None]:
# Fully connected binary classifier: four hidden stages, each
# Dense(relu) -> BatchNormalization -> Dropout(0.2), followed by a single
# sigmoid output unit.
# NOTE(review): 255 looks like it may be a typo for 256 — confirm.
hidden_units = (512, 255, 128, 32)

model_layers = [layers.Input(X_train.shape[1:])]
for width in hidden_units:
    model_layers.extend([
        layers.Dense(units=width, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(rate=0.2),
    ])
model_layers.append(layers.Dense(units=1, activation='sigmoid'))

model = keras.Sequential(model_layers)

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Early stopping on the validation loss: stop after 10 epochs without
# improvement and roll back to the best weights seen.
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Adam with a 1e-3 learning rate, binary cross-entropy loss, accuracy metric.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Train with the hold-out set as validation data and the early-stopping
# callback attached.
# NOTE(review): only 13 epochs are run while `es` is created with
# patience=10, so early stopping has little room to trigger — confirm the
# intended epoch budget.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=256,
    epochs=13,
    callbacks=[es],
)

In [None]:
# Predict on the test matrix and store the model outputs in the submission
# frame's `target` column.
y_pred = model.predict(X_test)
sub = sub.assign(target=y_pred)

In [None]:
# Write the submission file; index=False leaves the DataFrame index out.
sub.to_csv('submission.csv', index=False)