In [None]:

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import KFold 
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.inspection import PartialDependenceDisplay
from lightgbm import LGBMClassifier
import warnings, gc, string, random
warnings.filterwarnings("ignore")
import plotly.figure_factory as ff

# Enable inline Plotly rendering for this notebook session.
init_notebook_mode(connected=True)

# Default qualitative colour cycle.
color = px.colors.qualitative.Plotly

# Layout template passed as `template=temp` to the figures below.
temp = {
    "layout": go.Layout(
        font={"family": "Franklin Gothic", "size": 12},
        height=500,
        width=1000,
    )
}

In [None]:
# Every competition file is read from the same input directory.
_input_dir = '../input/tabular-playground-series-may-2022/'

# `id` becomes the index for train/test; the sample submission keeps it as a column.
train = pd.read_csv(_input_dir + 'train.csv', index_col='id')
test = pd.read_csv(_input_dir + 'test.csv', index_col='id')
sub = pd.read_csv(_input_dir + 'sample_submission.csv')

In [None]:
# (rows, columns) of the training split, then of the test split.
for frame in (train, test):
    print(frame.shape)

# First rows of the training data, rendered as the cell output.
train.head()

In [None]:
# Summary statistics of the training data, shown as a shaded table.
df = train.describe()

# Shade every statistic except the first row of the describe() output.
shaded_rows = (df.index[1:], df.columns[:])
styled = df.style.format('{:,.3f}').background_gradient(subset=shaded_rows, cmap='GnBu')
display(styled)

In [None]:
# Relative frequency of each target value, listed in reverse order of frequency.
class_share = train['target'].value_counts(normalize=True)
target = class_share.iloc[::-1]
target

In [None]:
# Correlation of each numeric feature with the target, as a bar chart.
# `train` also holds the string column `f_27`; select numeric columns
# explicitly because pandas >= 2.0 raises on non-numeric data in `corr()`
# instead of silently dropping it.
corr = train.select_dtypes(include='number').corr().round(2)

# Keep the last column (assumed to be `target` -- it is the last column of
# train.csv) and drop the final row, i.e. its correlation with itself.
corr = corr.iloc[:-1, -1].sort_values(ascending=False)

# 'f_05' -> 'Feature 05'
titles = ['Feature ' + str(name.split('_')[1]) for name in corr.index]
corr.index = titles

# Diverging palette with the two middle shades (indices 14 and 15) removed.
pal = sns.color_palette("RdYlBu", 32).as_hex()
pal = [shade for pos, shade in enumerate(pal) if pos not in (14, 15)]
# Same colours at 80% opacity for the bar fill; the solid colour outlines the bar.
rgb = ['rgba' + str(matplotlib.colors.to_rgba(shade, 0.8)) for shade in pal]

fig = go.Figure()
fig.add_trace(go.Bar(x=corr.index, y=corr, marker_color=rgb,
                     marker_line=dict(color=pal, width=2),
                     hovertemplate='%{x} correlation with Target = %{y}',
                     showlegend=False, name=''))
fig.update_layout(template=temp, title='Feature Correlations with Target',
                  yaxis_title='Correlation', xaxis_tickangle=45, width=800)
fig.show()


In [None]:
# Pairwise correlations between features, drawn as a lower-triangle heatmap.
# The last column of `train` is excluded. Numeric columns are selected
# explicitly because `f_27` is a string column and pandas >= 2.0 raises on
# non-numeric data in `corr()` instead of silently dropping it.
corr = train.iloc[:, :-1].select_dtypes(include='number').corr().round(2)

# Row i of the strict lower triangle holds the i values left of the diagonal;
# the rows are then listed bottom-up to match the reversed `y` labels.
lower_rows = [corr.iloc[i, :i].tolist() for i in range(1, len(corr))]
cor = lower_rows[::-1]

x = corr.index.tolist()[:-1]
y = corr.columns.tolist()[1:][::-1]
fig = ff.create_annotated_heatmap(z=cor, x=x, y=y,
                                  hovertemplate='Correlation between %{x} and %{y}= %{z}',
                                  colorscale='emrld', reversescale=True, name='')
fig.update_layout(template=temp, title='Correlations between Features',
                  yaxis=dict(showgrid=False, autorange="reversed"),
                  xaxis=dict(showgrid=False), height=1000, width=1000)
fig.show()

In [None]:
# How often each distinct `f_27` string occurs in the training data.
train['f_27'].value_counts()

In [None]:
# Notebook-level scikit-learn OrdinalEncoder instance.
enc = OrdinalEncoder()
def feature_eng(df, n_chars=None):
    """Expand the string column ``f_27`` into numeric features.

    Adds ``char_unique`` (number of distinct characters in each string) and
    one ``f_27_char{k}`` column per character position (``k`` is 1-based),
    then drops ``f_27``.

    Each character is encoded as its offset from ``'A'`` (``'A'`` -> 0.0,
    ``'B'`` -> 1.0, ...). The mapping is fixed, so a letter receives the same
    code whichever frame is passed in. Fitting a fresh ordinal encoder on
    every call would number only the letters present in that frame, so the
    train and test encodings could silently disagree.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column ``f_27``. It is not modified.
    n_chars : int, optional
        Number of leading character positions to expand. Defaults to the
        length of the longest string in ``df`` (0 for an empty frame).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new columns added and ``f_27`` removed.
    """
    df = df.copy()
    text = df['f_27']
    df['char_unique'] = text.apply(lambda s: len(set(s)))

    if n_chars is None:
        longest = text.str.len().max()
        # max() of an empty column is NaN; treat that as "no positions".
        n_chars = 0 if pd.isna(longest) else int(longest)

    base = ord('A')
    for pos in range(n_chars):
        # Strings shorter than `pos + 1` yield NaN here and stay NaN below.
        chars = text.str.get(pos)
        codes = chars.map(lambda ch: ord(ch) - base, na_action='ignore')
        df['f_27_char{}'.format(pos + 1)] = codes.astype(float)
    return df.drop(['f_27'], axis=1)

# Build the engineered frames for both splits with the same function.
train_df=feature_eng(df=train)
test_df=feature_eng(df=test)

In [None]:
# Show the columns at positions 31-43 of the engineered training frame.
train_df.iloc[:,31:44]

In [None]:

# Total number of occurrences of each uppercase letter across all `f_27` strings.
# Work on an explicit copy: adding columns to a slice of `train` triggers
# SettingWithCopyWarning, and the result of writing to such a slice is not
# guaranteed.
df = train[['f_27']].copy()
for letter in string.ascii_uppercase:
    df[letter] = df['f_27'].str.count(letter)

# Skip the first column (`f_27` itself) and sum each letter column.
df_sum = df.iloc[:, 1:].sum(axis=0).rename('sum').reset_index()

# Spectral palette with the middle shade (index 14) removed.
pal = sns.color_palette("Spectral_r", 28).as_hex()
pal = [shade for pos, shade in enumerate(pal) if pos != 14]
# Same colours at 80% opacity for the bar fill; the solid colour outlines the bar.
rgb = ['rgba' + str(matplotlib.colors.to_rgba(shade, 0.8)) for shade in pal]

fig = go.Figure()
fig.add_trace(go.Bar(x=df_sum['index'], y=df_sum['sum'], marker_color=rgb,
                     marker_line=dict(color=pal, width=2), name='',
                     hovertemplate='Letter: %{x}, Frequency: %{y}',
                     showlegend=False))
fig.update_layout(template=temp, title="Most Common Letters",
                  yaxis_title="Frequency", width=800)
fig.show()

In [None]:
# Standardise the features: fit the scaler on the training split only and
# reuse the fitted statistics for the test split.
scaler = StandardScaler()
y = train_df['target']
X = train_df.drop(['target'], axis=1)

# Keep column names and the original index on both frames so that X stays
# aligned with `y` and X_test carries the same feature names as X
# (previously X_test ended up with integer column labels).
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_test = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns,
                      index=test_df.index)


In [None]:

# 5-fold cross-validation of a LightGBM classifier.
# Collected per fold: validation labels and probabilities, test
# probabilities, feature importances and calibration-curve points.
y_valid, gbm_val_preds, gbm_test_preds = [], [], []
cal_true, cal_pred = [], []
feat_importance = pd.DataFrame(index=X.columns)
k_fold = KFold(n_splits=5, shuffle=True, random_state=21)

# The hyper-parameters are the same for every fold, so build them once.
params = {'boosting_type': 'gbdt',
          'n_estimators': 250,
          'num_leaves': 50,
          'learning_rate': 0.1,
          'colsample_bytree': 0.9,
          'subsample': 0.8,
          'reg_alpha': 0.1,
          'objective': 'binary',
          'metric': 'auc',
          'random_state': 21}

for fold, (train_idx, val_idx) in enumerate(k_fold.split(X, y)):

    print("\nFold {}".format(fold+1))
    # KFold yields positional indices, so index both X and y with .iloc.
    # Plain `y[train_idx]` is a label lookup and is only correct when the
    # index labels happen to equal 0..n-1.
    X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]
    print("Train shape: {}, {}, Valid shape: {}, {}".format(
        X_train.shape, y_train.shape, X_val.shape, y_val.shape))

    gbm = LGBMClassifier(**params).fit(X_train, y_train,
                                       eval_set=[(X_train, y_train), (X_val, y_val)],
                                       verbose=100,
                                       eval_metric=['binary_logloss', 'auc'])

    # Probability of the positive class on the held-out fold and on the test set.
    gbm_prob = gbm.predict_proba(X_val)[:, 1]
    y_valid.append(y_val)
    gbm_val_preds.append(gbm_prob)
    gbm_test_preds.append(gbm.predict_proba(X_test)[:, 1])
    feat_importance["Importance_Fold"+str(fold)] = gbm.feature_importances_

    # NOTE(review): the calibrator is fitted on the same rows the booster was
    # trained on. With cv="prefit", scikit-learn expects calibration data that
    # is disjoint from the training data -- consider a separate hold-out split.
    calibrated_gbm = CalibratedClassifierCV(base_estimator=gbm, cv="prefit")
    cal_fit = calibrated_gbm.fit(X_train, y_train)
    cal_probs = calibrated_gbm.predict_proba(X_val)[:, 1]
    prob_true, prob_pred = calibration_curve(y_val, cal_probs, n_bins=10)
    cal_true.append(prob_true)
    cal_pred.append(prob_pred)

    # AUC is computed from the uncalibrated probabilities.
    auc_score = roc_auc_score(y_val, gbm_prob)
    print("Validation AUC = {:.4f}".format(auc_score))

    # Release the fold's frames before the next iteration.
    del X_train, y_train, X_val, y_val
    gc.collect()

In [None]:
# Lollipop chart of the importance of each feature, averaged over the folds.
# Average only the per-fold columns so that re-running the cell never folds
# an existing `avg` column back into the mean.
fold_cols = feat_importance.filter(like='Importance_Fold')
feat_importance['avg'] = fold_cols.mean(axis=1)
feat_importance = feat_importance.sort_values(by='avg', ascending=True)

# At least 52 shades (the original fixed size); more if there are more features.
pal = sns.color_palette("YlGnBu", max(52, len(feat_importance))).as_hex()
pal_rev = pal[::-1]  # reversed once, not on every loop iteration

fig = go.Figure()
# One horizontal stem per feature. Values are read by position, not through
# `Series[int]`, which is a deprecated positional fallback on a label index.
for i, avg_importance in enumerate(feat_importance['avg'].to_numpy()):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=avg_importance,
                       line_color=pal_rev[i], opacity=0.8, line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['avg'], y=feat_importance.index, mode='markers',
                         marker_color=pal_rev, marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.0f}<extra></extra>'))
fig.update_layout(template=temp, title='Feature Importance',
                  xaxis=dict(title='Average Importance', zeroline=False),
                  yaxis_showgrid=False, height=900, width=800)
fig.show()

In [None]:
# Average the per-fold test probabilities into one prediction per row and
# write them out in the sample-submission layout.
gbm_mean_pred = np.mean(gbm_test_preds, axis=0)
sub_gbm = sub.assign(target=gbm_mean_pred)
sub_gbm.to_csv("sample_submission.csv", index=False)

In [None]:
# Display the LightGBM submission frame.
sub_gbm

# **Trying a Neural Network**

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, InputLayer, Add
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import metrics, regularizers
from tensorflow.keras.utils import plot_model

# Use a TPU when one is attached; otherwise fall back to TensorFlow's default
# distribution strategy so the notebook still runs on CPU/GPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    # `tf.distribute.experimental.TPUStrategy` is the deprecated alias of this class.
    tpu_strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # The resolver is expected to raise ValueError when no TPU can be found
    # (TODO confirm for the TensorFlow version in use).
    tpu = None
    tpu_strategy = tf.distribute.get_strategy()

In [None]:
def nn_model(n_features=41, hidden_units=(512, 384, 256, 128, 64), l2_penalty=1e-5):
    """Build an uncompiled fully connected binary classifier.

    The network is a stack of swish-activated Dense layers, each with an L2
    kernel regulariser, followed by a single sigmoid output unit.

    Parameters
    ----------
    n_features : int, default 41
        Width of the input vector.
    hidden_units : sequence of int, default (512, 384, 256, 128, 64)
        Size of each hidden layer, in order.
    l2_penalty : float, default 1e-5
        L2 regularisation factor applied to every hidden layer's kernel.

    Returns
    -------
    tensorflow.keras.Model
        Model mapping ``(batch, n_features)`` inputs to ``(batch, 1)`` outputs.
    """
    # `shape` must be a tuple; `(41)` without the trailing comma is a plain int.
    x_input = Input(shape=(n_features,))

    x = x_input
    for units in hidden_units:
        x = Dense(units, kernel_regularizer=regularizers.l2(l2_penalty),
                  activation='swish')(x)
    output = Dense(1, activation='sigmoid')(x)

    return Model(inputs=x_input, outputs=output)

# def nn_model():
#     activation = 'swish'
#     inputs = Input(shape=(41))
#     x = Dense(64, kernel_regularizer=tf.keras.regularizers.l2(40e-6),
#               activation=activation,
#              )(inputs)
#     x = Dense(64, kernel_regularizer=tf.keras.regularizers.l2(40e-6),
#               activation=activation,
#              )(x)
#     x = Dense(64, kernel_regularizer=tf.keras.regularizers.l2(40e-6),
#               activation=activation,
#              )(x)
#     x = Dense(16, kernel_regularizer=tf.keras.regularizers.l2(40e-6),
#               activation=activation,
#              )(x)
#     x = Dense(1, #kernel_regularizer=tf.keras.regularizers.l2(1e-6),
#               activation='sigmoid',
#              )(x)
#     model = Model(inputs, x)
#     return model

In [None]:
# Build one model instance with nn_model()'s default arguments.
model = nn_model()

In [None]:
# Draw the model graph with tensor shapes shown and layer names hidden.
plot_model(model, show_layer_names=False, show_shapes=True)

In [None]:
# 5-fold cross-validation of the neural network on standardised features.
y = train_df['target']
X = train_df.drop(['target'], axis=1)
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
# Give X_test the same feature names as X instead of integer column labels.
X_test = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

y_valid, nn_val_preds, nn_test_preds = [], [], []
cal_true, cal_pred = [], []
k_fold = KFold(n_splits=5, shuffle=True, random_state=21)

# Seed NumPy, Python's `random` and TensorFlow before any model is built.
np.random.seed(1)
random.seed(1)
tf.random.set_seed(1)

for fold, (train_idx, val_idx) in enumerate(k_fold.split(X, y)):

    print("\n*****Fold {}*****".format(fold+1))
    # KFold yields positional indices, so index both X and y with .iloc.
    # Plain `y[train_idx]` is a label lookup and is only correct when the
    # index labels happen to equal 0..n-1.
    X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]
    print("Train shape: {}, {}, Valid shape: {}, {}".format(
        X_train.shape, y_train.shape, X_val.shape, y_val.shape))

    with tpu_strategy.scope():

        # A fresh model for every fold.
        model = nn_model()

        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                      loss=tf.keras.losses.BinaryCrossentropy(),
                      metrics=[metrics.AUC(name='auc')])

        # AUC is higher-is-better, so the scheduler needs mode='max'. In the
        # default 'auto' mode Keras only infers 'max' for monitors containing
        # 'acc'; 'val_auc' would be treated as a quantity to minimise and the
        # learning rate would be cut while the model is still improving.
        lr = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.5,
                               patience=3, verbose=True)
        es = EarlyStopping(monitor='val_auc', mode='max', patience=5,
                           restore_best_weights=True, verbose=True)

        model.fit(X_train, y_train,
                  validation_data=(X_val, y_val),
                  epochs=50, batch_size=4096,
                  callbacks=[es, lr], verbose=True, shuffle=True)

        # Predictions on the held-out fold and on the test set, squeezed to 1-D.
        nn_preds = model.predict(X_val).squeeze()
        y_valid.append(y_val)
        nn_val_preds.append(nn_preds)
        nn_test_preds.append(model.predict(X_test).squeeze())

        prob_true, prob_pred = calibration_curve(y_val, nn_preds, n_bins=10)
        cal_true.append(prob_true)
        cal_pred.append(prob_pred)

    # Release the fold's frames before the next iteration.
    del X_train, y_train, X_val, y_val
    gc.collect()

In [None]:
# Average the per-fold neural-network test predictions and save them in the
# sample-submission layout.
nn_mean_pred = np.mean(nn_test_preds, axis=0)
sub_nn = sub.assign(target=nn_mean_pred)
sub_nn.to_csv("sub_nn.csv", index=False)