# Importing Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the competition's train and test CSV files, then preview the training frame.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2022/train.csv',
        '../input/tabular-playground-series-may-2022/test.csv',
    )
)
train.head()

# EDA

In [None]:
# Missing-value count per column: test set first, then train.
print(test.isna().sum(), train.isna().sum(), sep='\n')

In [None]:
# Summary statistics of the training frame (pandas `describe` defaults).
train.describe()

In [None]:
# Same summary for the test frame, for side-by-side comparison with train.
test.describe()

In [None]:
# Drop the `id` column from both frames.
# The frames are rebound instead of mutated in place, and errors='ignore'
# makes the cell safe to re-run: a second run no longer raises KeyError
# because `id` is already gone.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')


In [None]:
# Preview the training frame again to confirm the `id` column is gone.
train.head()

In [None]:
# Number of rows per target class.
train.target.value_counts()

In [None]:
# Pairwise correlation of the numeric columns.
# `train` still contains the string column f_27 at this point; on pandas >= 2.0
# `DataFrame.corr()` no longer drops non-numeric columns silently and raises
# instead, so restrict the frame to numeric dtypes explicitly.
train.select_dtypes(include='number').corr()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Histogram of every float64 feature, split by target class.
float_cols = [f for f in train.columns if train[f].dtype == 'float64']

# plt.figure (not plt.subplots) is used on purpose: plt.subplots(figsize=...)
# also creates one full-figure Axes, which recent Matplotlib versions leave
# in place behind the 4x4 grid built below.
plt.figure(figsize=(30,35))
for i, column in enumerate(float_cols):
    ax = plt.subplot(4,4,i+1)
    sns.histplot(data=train, x=column, hue='target', ax=ax)
    ax.set_title(column)

In [None]:
# Annotated correlation heatmap of the float features together with the target.
corr_matrix = train[float_cols + ['target']].corr()

plt.figure(figsize=(12, 12))
sns.heatmap(corr_matrix, center=0, annot=True, fmt='.2f')
plt.show()

Correlation matrix insights:
1. f_00 to f_06 are correlated with f_28.
2. f_19 to f_26 are slightly correlated with each other.

In [None]:
# Bar chart of the value frequencies of every int64 column.
int_cols = [f for f in train.columns if train[f].dtype == 'int64']

# plt.figure (not plt.subplots) is used on purpose: plt.subplots(figsize=...)
# also creates one full-figure Axes, which recent Matplotlib versions leave
# in place behind the 5x3 grid built below.
plt.figure(figsize=(30,35))
for i, f in enumerate(int_cols):
    val_count = train[f].value_counts()
    ax = plt.subplot(5,3,i+1)
    ax.bar(val_count.index, val_count.values)
    ax.set_title(f)

Looking at the bar charts of the integer features, the first 12 features all take values between 0 and 16. Of the last two features, f_29 is binary and f_30 is ternary.
> Ignore the final panel: `target` is the label, not a feature.

# f_27 - String Feature

f_27 is a string feature that cannot be fed to the model as it is. Let's find a workaround.

In [None]:
# Shortest and longest f_27 string length in train, then in test.
(
    train['f_27'].str.len().min(),
    train['f_27'].str.len().max(),
    test['f_27'].str.len().min(),
    test['f_27'].str.len().max(),
)

Every f_27 string has a fixed length of 10 characters.

In [None]:
# Frequency of each distinct f_27 string in the training set.
train['f_27'].value_counts()

In [None]:
import string

# Candidate characters: the 26 upper-case ASCII letters.
alpha_upper = list(string.ascii_uppercase)

# Total number of occurrences of each letter across the whole f_27 column.
char_counts = [train['f_27'].str.count(letter).sum() for letter in alpha_upper]

char_counts_df = pd.DataFrame({'Character': alpha_upper, 'Character Count': char_counts})
# Keep only the letters that actually occur in f_27.
occurs = char_counts_df['Character Count'] > 0
char_counts_df = char_counts_df.loc[occurs]
print(np.sum(char_counts))

plt.subplots(figsize=(20,10))
sns.barplot(data=char_counts_df, y='Character Count', x='Character')
plt.title("Total Number of characters in train[f_27]");

Next, count how often each character appears at each of the 10 positions of the string.

In [None]:
# Index by character so the per-position counts assigned below align on the letter.
char_counts_df = char_counts_df.set_index("Character", drop=False)
for i in range(10):
    char_counts_df['character'+str(i+1)] = train['f_27'].str[i].value_counts()
# A letter that never occurs at a position has no entry in value_counts(); show it as 0.
char_counts_df = char_counts_df.fillna(0)

# plt.figure (not plt.subplots) is used on purpose: plt.subplots(figsize=...)
# also creates one full-figure Axes, which recent Matplotlib versions leave
# in place behind the 2x5 grid built below.
plt.figure(figsize=(30,12))
# Lower-case 'character' selects the per-position columns only
# ('Character' and 'Character Count' start with an upper-case C).
characters_cols = [i for i in char_counts_df.columns if 'character' in i]
for i, f in enumerate(characters_cols):
    plt.subplot(2,5,i+1)
    sns.barplot(data=char_counts_df, x='Character', y=f, color='blue')
    plt.title('Charaacter value counts in position: '+str(i+1));
    

In [None]:
# Helper column for the scatterplots: element-wise sum of f_00 and f_01.
train['f_00 + f_01'] = train['f_00'].add(train['f_01'])

In [None]:
# Scatterplots of the three feature pairs, coloured by target.
# plt.figure (not plt.subplots) is used on purpose: plt.subplots(figsize=...)
# also creates one full-figure Axes, which recent Matplotlib versions leave
# in place behind the panels drawn below.
plt.figure(figsize=(30,30))

# (x column, y column) for each panel, in plotting order.
scatter_pairs = [
    ('f_26', 'f_00 + f_01'),
    ('f_21', 'f_02'),
    ('f_22', 'f_05'),
]
for panel, (x_col, y_col) in enumerate(scatter_pairs, start=1):
    # Same 3x3 grid as before; only the first row is populated.
    plt.subplot(3,3,panel)
    sns.scatterplot(data=train, y=y_col, x=x_col, hue='target', s=2)

We need to create a feature that takes the value 1 in the region where only target 1 occurs, 0 in the region where both targets 0 and 1 occur, and -1 in the region where only target 0 occurs.

ref - [https://www.kaggle.com/competitions/tabular-playground-series-may-2022/discussion/323892](https://www.kaggle.com/competitions/tabular-playground-series-may-2022/discussion/323892)

# Feature Engineering

Creating a function to feature-engineer both the train and test data.

In [None]:
# Keep the labels in their own variable, separate from the feature frame.
y_train = train.target

In [None]:

# Remove the label and the plotting-only helper column from the feature frame.
train = train.drop(['target', 'f_00 + f_01'], axis=1)
train.info()

In [None]:
# (rows, columns) of train and test before feature engineering.
(train.shape, test.shape)

In [None]:
def fe(df):
    """Return a feature-engineered copy of `df`; the input frame is not modified.

    Added columns:
      * i_02_21, i_05_22, i_00_01_26 -- ternary flags (+1 / 0 / -1) that are
        +1 when the sum of the named float features exceeds an upper
        threshold and -1 when it falls below a lower threshold.
      * f_27_0 .. f_27_9 -- ordinal code (A -> 0, B -> 1, ...) of the character
        at each of the first ten positions of f_27.
      * unique_charaacters -- number of distinct characters in f_27.

    The raw f_27 column is dropped from the result.
    """
    def _ternary_flag(total, upper, lower):
        # +1 above `upper`, -1 below `lower`, 0 in between.
        return (total > upper).astype(int) - (total < lower).astype(int)

    engineered = df.copy()

    # Interaction flags for the three feature combinations shown in the scatterplots.
    engineered['i_02_21'] = _ternary_flag(df.f_21 + df.f_02, 5.2, -5.3)
    engineered['i_05_22'] = _ternary_flag(df.f_22 + df.f_05, 5.2, -5.3)
    engineered['i_00_01_26'] = _ternary_flag(df.f_00 + df.f_01 + df.f_26, 5.0, -5.3)

    # Ordinal encoding of each character position, relative to 'A'.
    base = ord('A')
    text = engineered['f_27']
    for position in range(10):
        engineered['f_27_' + str(position)] = text.str[position].apply(lambda ch: ord(ch) - base)

    # Number of distinct characters in each string.
    engineered['unique_charaacters'] = text.apply(lambda value: len(set(value)))

    return engineered.drop('f_27', axis=1)

In [None]:
%%time

# Apply the same feature engineering to both splits so they end up with identical columns.
# NOTE: `fe` drops f_27, so this cell cannot be run twice on the same frames.
train = fe(train)
test = fe(test)

In [None]:
# (rows, columns) of train and test after feature engineering.
(train.shape, test.shape)

# Neural Network Model

In [None]:
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, BatchNormalization, Input
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import regularizers, callbacks

In [None]:
# Remember the column labels so they can be restored after scaling.
train_columns, test_columns = train.columns, test.columns

In [None]:
scaler = StandardScaler()

# Fit the scaler on the training data only, then apply the SAME mean and
# standard deviation to the test data. Calling fit_transform on the test set
# would refit the scaler, standardizing train and test with different
# statistics, so the model would see differently scaled inputs at prediction time.
train = scaler.fit_transform(train)
test = scaler.transform(test)

In [None]:
# Wrap the scaled arrays back into DataFrames with their original column labels.
train = pd.DataFrame(train, columns=train_columns)
test = pd.DataFrame(test, columns=test_columns)

train.head()

In [None]:
# Width of the model input, derived from the engineered training frame instead
# of a hard-coded 44, so it stays correct if the feature set changes.
input_shape = (train.shape[1],)

In [None]:
# Model input columns: every test column except the identifier and the raw string.
features = [col for col in test.columns if col not in ('id', 'f_27')]

In [None]:
def my_model():
    """Build a fully connected binary classifier with four hidden layers.

    The network stacks Dense layers of 64, 64, 64 and 16 units (swish
    activation, L2 kernel regularization of 40e-6), followed by a single
    sigmoid output unit. The input width is the length of the notebook-level
    `features` list.

    Returns:
        An uncompiled Keras `Model`.
    """
    activation = 'swish'
    # `shape` must be a tuple: `(n)` without the trailing comma is a plain int.
    inputs = Input(shape=(len(features),))
    x = inputs
    for units in (64, 64, 64, 16):
        x = Dense(units,
                  kernel_regularizer=tf.keras.regularizers.l2(40e-6),
                  activation=activation,
                 )(x)
    # No kernel regularizer on the output layer (the original had one commented out).
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs, x)
    return model

model = my_model()
model.summary()

In [None]:
# Binary cross-entropy objective, Adam optimizer, AUC reported during training.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['AUC'])

In [None]:
# Stop training once the validation loss has not improved for 5 epochs.
# The original monitored the training loss, which ignores the held-out split
# created by `validation_split` and so cannot detect overfitting.
# restore_best_weights=True rolls the model back to the best validation epoch.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                            patience=5,
                                            restore_best_weights=True)

history = model.fit(train,
                    y_train,
                    validation_split=0.35,  # fraction of the data held out for validation
                    shuffle=True,
                    epochs=100,
                    batch_size=2048,
                    callbacks=[callback])

In [None]:
# Predict on the scaled test features with the trained model (the output layer is a single sigmoid unit).
preds = model.predict(test)


In [None]:
# Load the submission template and fill its target column with the predictions.
sample_sub = pd.read_csv(
    '../input/tabular-playground-series-may-2022/sample_submission.csv'
).assign(target=preds)

In [None]:
# Write the submission file without the DataFrame index column.
sample_sub.to_csv('submission.csv', index=False)