In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer, RobustScaler, PolynomialFeatures, MinMaxScaler
from sklearn import metrics
from xgboost import XGBClassifier
import gc
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.callbacks import ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping


In [None]:
def plot_feature_importance(importance, names, model_type, max_features = 10):
    """Draw a horizontal bar chart of the highest-scoring features.

    Parameters
    ----------
    importance : array-like
        Importance score of each feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Text placed directly in front of 'FEATURE IMPORTANCE' in the plot
        title (concatenated as-is, without a separator).
    max_features : int, default 10
        Number of top-ranked features to display.
    """
    # Pair names with scores, rank by score (highest first), keep the top rows
    ranked = (
        pd.DataFrame({
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        })
        .sort_values(by=['feature_importance'], ascending=False)
        .head(max_features)
    )

    plt.figure(figsize=(15,15))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])
    plt.title(model_type + 'FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
# Load the training set from the competition input directory.
df = pd.read_csv("../input/tabular-playground-series-may-2022/train.csv")
print(df.shape)
# Last expression of the cell: renders the first rows as the cell output.
df.head()

In [None]:
def data_optimize(df, object_option=False):
    """Downcast the columns of ``df`` to smaller dtypes to save memory.

    The frame is modified in place and also returned.

    * Columns whose dtype compares equal to ``'int'`` are cast to the smallest
      of uint8/uint16/uint32 (when the column minimum is >= 0) or
      int8/int16/int32 (otherwise) that can hold both the column minimum and
      maximum. Columns that fit none of these are left unchanged.
    * Columns whose dtype compares equal to ``'float'`` are cast to float32
      when their min and max lie strictly inside the float32 range.
    * When ``object_option`` is true, object columns whose number of distinct
      values is below half the row count are converted to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise.
    object_option : bool, default False
        Whether to also convert low-cardinality object columns to category.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    # Candidate dtypes, ordered from narrowest to widest
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    signed_types = (np.int8, np.int16, np.int32)

    # loop columns in the dataframe to downcast the dtype
    for col in df.columns:
        # process the int columns
        if df[col].dtype == 'int':
            col_min = df[col].min()
            col_max = df[col].max()
            # if all are non-negative, change to uint
            candidates = unsigned_types if col_min >= 0 else signed_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a value equal to the dtype's limit
                # (e.g. 255 for uint8, -128 for int8) still fits.
                if bounds.min <= col_min and col_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break

        # process the float columns
        elif df[col].dtype == 'float':
            col_min = df[col].min()
            col_max = df[col].max()
            # downcast based on the min and max
            if col_min > np.finfo(np.float32).min and col_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)

        if object_option:
            if df[col].dtype == 'object':
                # value_counts() has one entry per distinct non-null value
                if len(df[col].value_counts()) < 0.5 * df.shape[0]:
                    df[col] = df[col].astype('category')

    return df

In [None]:
def add_letters_count(data):
    """Add one column per uppercase letter A-Z holding how many times that
    letter occurs in each row's ``f_27`` string.

    The frame is modified in place and returned.
    """
    source = data['f_27']
    for code in range(ord('A'), ord('Z') + 1):
        letter = chr(code)
        data[letter] = source.str.count(letter)
    return data

In [None]:
def remove_zero(data):
    """Drop each of the columns 'U'..'Z' whose values sum to zero.

    Returns the input frame itself when nothing is dropped, otherwise a new
    frame without the zero-sum columns.
    """
    zero_sum_letters = [letter for letter in "UVWXYZ" if data[letter].sum() == 0]
    if not zero_sum_letters:
        return data
    return data.drop(columns=zero_sum_letters)

In [None]:
def add_pos(data):
    """Add columns ``pos0``..``pos9``: the code point of the character at each
    of the first ten positions of ``f_27``, minus 75 (the code point of 'K').

    The frame is modified in place and returned.
    """
    for idx in range(10):
        char_at_idx = data['f_27'].str.get(idx)
        data[f'pos{idx}'] = char_at_idx.map(ord) - 75
    return data


In [None]:
# Downcast the numeric columns of the training frame to smaller dtypes.
df = data_optimize(df)

In [None]:
# Quantile-based outlier filter, computed over the numeric columns only.
# DataFrame.quantile does not skip non-numeric columns by default in
# pandas >= 2.0, and the string column f_27 can neither be ranked nor compared
# against numeric fences, so the numeric subset is selected explicitly.
numeric_cols = df.select_dtypes(include='number').columns
Q1 = df[numeric_cols].quantile(0.15)
Q3 = df[numeric_cols].quantile(0.85)
IQR = Q3 - Q1
# A row is an outlier when any numeric value lies more than 1.5 * IQR outside
# the 15th-85th percentile band.
outlier_rows = ((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
# .copy() gives an independent frame, so columns added afterwards are not
# written to a slice of the unfiltered data.
df = df[~outlier_rows].copy()
df.shape

In [None]:
# Row-wise means over three groups of raw feature columns
df['mean_0'] = df[['f_00', 'f_01', 'f_02','f_03', 'f_04','f_05', 'f_06']].mean(axis=1)
df['mean_1'] = df[['f_07', 'f_08', 'f_09','f_10', 'f_11','f_12', 'f_13', 'f_14', 'f_15', 'f_16','f_17','f_18']].mean(axis=1)
df['mean_2'] = df[['f_19', 'f_20', 'f_21','f_22', 'f_23','f_24', 'f_25', 'f_26']].mean(axis=1)

# Pairwise sums / differences of selected columns
df['f_sum_2']  = (df['f_21']+df['f_22'])
df['f_sum_3']  = (df['f_23']-df['f_20'])
df['f_sum_4']  = (df['f_25']-df['f_28']/100)
df['f_sum_5']  = (df['f_00']+df['f_01'])
# data_optimize() may have downcast these integer columns to an unsigned
# dtype, in which case a negative difference would silently wrap around
# (e.g. 2 - 5 -> 253 for uint8). Subtract in int64 so the result is signed.
df['f_sum_10'] = (df['f_07'].astype(np.int64)-df['f_10'].astype(np.int64))
df['f_sum_13'] = (df['f_08'].astype(np.int64)-df['f_10'].astype(np.int64))

# String features derived from f_27
df['f_27'] = df['f_27'].str.upper()
df['length'] = df['f_27'].str.len()
df = add_letters_count(df)
df = remove_zero(df)
df = add_pos(df)
col_list= ['pos0', 'pos1', 'pos2','pos3', 'pos4', 'pos5','pos6', 'pos7', 'pos8','pos9']
# Sum of the ten per-position character codes
df['score'] = df[col_list].sum(axis=1)

# Number of distinct characters in the string
df['unique_chars_cnt'] = df['f_27'].map(lambda x: len(set(x)))

# Relative frequency of each exact f_27 string within this frame
df['value_frequency'] = df['f_27'].map(df['f_27'].value_counts() / len(df))

print(df.shape)
# Remove the identifier and the raw string so only numeric features remain
df.pop('id')
df.pop('f_27')

# Separate the label from the feature columns
target = df.pop('target')





In [None]:
categ_cols = ['f_29','f_30','f_13', 'f_18','f_17','f_14','f_11','f_10','f_09','f_15','f_07','f_12','f_16','f_08','f_27']
continuous_feat = ['f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28']

def stat_features(df, cols = continuous_feat):
    """Append row-wise summary statistics of ``cols`` to ``df``.

    Adds the columns ``f_sum``, ``f_min``, ``f_max``, ``f_std``, ``f_mad``
    (mean absolute deviation around the row mean), ``f_mean``, ``f_kurt`` and
    ``f_count_pos`` (number of values in the row that are > 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols``; modified in place.
    cols : list of str, default ``continuous_feat``
        Columns the statistics are computed over.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the statistic columns added.
    """
    # Snapshot of the source columns, taken before any statistic is appended
    block = df[cols]
    row_mean = block.mean(axis=1)

    df['f_sum']  = block.sum(axis=1)
    df['f_min']  = block.min(axis=1)
    df['f_max']  = block.max(axis=1)
    df['f_std']  = block.std(axis=1)
    # DataFrame.mad() was removed in pandas 2.0; compute the mean absolute
    # deviation around the row mean explicitly.
    df['f_mad']  = block.sub(row_mean, axis=0).abs().mean(axis=1)
    df['f_mean'] = row_mean
    df['f_kurt'] = block.kurt(axis=1)
    # .sum() counts the True cells; .count() would count every non-null cell
    # and therefore return the same number for each row.
    df['f_count_pos']  = block.gt(0).sum(axis=1)
    return df

# Append the row-wise summary statistics to the training features.
df = stat_features(df)

print(df.shape)

In [None]:
# Keep an independent copy of the engineered training features.
features = df.copy()

# Hold out 10% of the rows for validation (fixed seed for a repeatable split).
X_train, X_val, Y_train, Y_val = train_test_split(features, target, test_size = 0.1, random_state=10)

# Fit the scaler on the training part only, then apply it to the validation part.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
# Number of feature columns; used as the network's input width.
sha = features.shape[1]
sha

In [None]:
# Training callbacks, all driven by the validation loss:
#   lr  - multiply the learning rate by 0.7 after 4 epochs without improvement
#   es  - stop after 12 epochs without improvement and restore the best weights
#   chp - save the model to disk whenever the validation loss improves
#   tm  - abort training if the loss becomes NaN
lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.7, patience = 4, verbose = 1)
es = EarlyStopping(monitor = 'val_loss',patience = 12, verbose = 1, mode = 'min', restore_best_weights = True)
chp = ModelCheckpoint("model_1.hdf5", monitor='val_loss', verbose=1,save_best_only=True, mode='auto', save_freq='epoch')
tm = tf.keras.callbacks.TerminateOnNaN()


# Fully connected binary classifier: four ReLU blocks, each followed by batch
# normalisation, and a single sigmoid output unit.
model_2 = keras.Sequential([
  # The input shape is passed as a tuple (excluding the batch dimension)
  layers.Input(shape=(sha,)),

  layers.Dense(64, activation='relu'),
  layers.BatchNormalization(),

  layers.Dense(64, activation='relu'),
  layers.BatchNormalization(),

  layers.Dense(64, activation='relu'),
  layers.BatchNormalization(),

  layers.Dense(16, activation='relu'),
  layers.BatchNormalization(),

  layers.Dense(1, activation='sigmoid')
])

model_2.compile(
    optimizer = 'adam',
    loss='binary_crossentropy',
    # Explicit metric name so the history always exposes 'auc' / 'val_auc',
    # even when this cell is re-run in the same kernel.
    metrics = [keras.metrics.AUC(name='auc')]
)

model_2.summary()

history_2 = model_2.fit(X_train, Y_train,
                    verbose = 1,
                    batch_size = 4096,
                    epochs = 300,
                    validation_data = (X_val, Y_val),
                    # Flat list of callback instances
                    callbacks=[lr, es, tm, chp])

In [None]:
# Per-epoch training history as a frame (one column per logged quantity).
history_df_2 = pd.DataFrame(history_2.history)
# Training vs. validation loss
history_df_2.loc[:,['loss','val_loss']].plot()
# Training vs. validation AUC
history_df_2.loc[:,['auc','val_auc']].plot()
plt.show()

In [None]:
# Load the test set and apply the same feature engineering as for training.
df_test = pd.read_csv("../input/tabular-playground-series-may-2022/test.csv")

# Row-wise means over three groups of raw feature columns
df_test['mean_0'] = df_test[['f_00', 'f_01', 'f_02','f_03', 'f_04','f_05', 'f_06']].mean(axis=1)
df_test['mean_1'] = df_test[['f_07', 'f_08', 'f_09','f_10', 'f_11','f_12', 'f_13', 'f_14', 'f_15', 'f_16','f_17','f_18']].mean(axis=1)
df_test['mean_2'] = df_test[['f_19', 'f_20', 'f_21','f_22', 'f_23','f_24', 'f_25', 'f_26']].mean(axis=1)

# Pairwise sums / differences of selected columns
df_test['f_sum_2']  = (df_test['f_21']+df_test['f_22'])
df_test['f_sum_3']  = (df_test['f_23']-df_test['f_20'])
df_test['f_sum_4']  = (df_test['f_25']-df_test['f_28']/100)
df_test['f_sum_5']  = (df_test['f_00']+df_test['f_01'])
df_test['f_sum_10'] = (df_test['f_07']-df_test['f_10'])
df_test['f_sum_13'] = (df_test['f_08']-df_test['f_10'])

# String features derived from f_27
df_test['f_27'] = df_test['f_27'].str.upper()
df_test['length'] = df_test['f_27'].str.len()
df_test = add_letters_count(df_test)
df_test = remove_zero(df_test)
df_test = add_pos(df_test)
col_list= ['pos0', 'pos1', 'pos2','pos3', 'pos4', 'pos5','pos6', 'pos7', 'pos8','pos9']
df_test['score'] = df_test[col_list].sum(axis=1)
df_test['unique_chars_cnt'] = df_test['f_27'].map(lambda x: len(set(x)))
df_test['value_frequency'] = df_test['f_27'].map(df_test['f_27'].value_counts() / len(df_test))

print(df_test.shape)

# Remove the identifier and the raw string so only numeric features remain
df_test.pop('id')
df_test.pop('f_27')

df_test = stat_features(df_test)

# remove_zero() decides which letter-count columns to drop from the test data
# alone, so the resulting column set can differ from the training features the
# scaler and model were fitted on. Re-align to the training layout (same
# columns, same order). A letter column dropped here had an all-zero count,
# so filling it back with 0 restores its true value.
df_test = df_test.reindex(columns=features.columns, fill_value=0)

print(df_test.shape)



In [None]:
# Scale the test features with the scaler fitted on the training split.
test = scaler.transform(df_test)

# Sigmoid outputs of the trained network for every test row.
predictions = model_2.predict(test)
predictions_df = pd.DataFrame(predictions)

In [None]:
ss = pd.read_csv('../input/tabular-playground-series-may-2022/sample_submission.csv')
# Temporarily store hard 0/1 labels (threshold 0.5) just to plot the class balance.
ss['target'] = np.where(predictions>0.5, 1, 0)
ss['target'].value_counts().plot.bar()
plt.show()
# Overwrite with the raw model outputs; these are what gets written to the file.
ss['target'] = predictions
ss.to_csv('Submission5.csv', index=False)
ss.head()

In [None]:
# Kernel density estimate of the values in the submission's target column.
sns.displot(ss, x="target", kind="kde")