In [None]:
import h5py
from keras.models import Model   
from keras.layers import * 
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import plot_model
from keras.callbacks import EarlyStopping # Early Stopping Callback in the NN
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from operator import itemgetter

import os
# Show every file shipped with the competition dataset so the exact input
# paths used by the following cells are visible in the output.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input/shopee-code-league-20/_DA_Marketing_Analytics')
    for name in names
)
for input_path in input_paths:
    print(input_path)


In [None]:
# The sample submission's column order is reused later when previewing the
# prediction frames.
sample_submission = pd.read_csv(
    '/kaggle/input/shopee-code-league-20/_DA_Marketing_Analytics/sample_submission_0_1.csv'
)
sample_submission.head()

In [None]:
# Per-user attributes; `preprocessing` left-joins this frame onto the event
# data by `user_id`.
users = pd.read_csv(
    '/kaggle/input/shopee-code-league-20/_DA_Marketing_Analytics/users.csv'
)
users.head()

In [None]:
def preprocessing(df, plot=True):
    """Join user attributes onto an event frame and make it model-ready.

    Reads the module-level ``users`` frame. The steps, in order:

    1. Left-join ``users`` on ``user_id`` and parse ``grass_date`` as datetime.
    2. For each of ``last_open_day`` / ``last_login_day`` /
       ``last_checkout_day``: turn the ``'Never <x>'`` sentinel into NaN and
       record it in a 0/1 ``never_<x>`` indicator column.
    3. Fill every remaining missing value with its column's mode.
    4. Cast the three ``last_*_day`` columns to int.
    5. Optionally plot the distribution of every column.
    6. One-hot encode ``domain`` and ``country_code`` and print the dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``user_id``, ``grass_date``, ``country_code`` and the
        three ``last_*_day`` columns. ``domain`` must exist after the merge
        (presumably it comes from ``users`` -- confirm against the data).
    plot : bool, default True
        When True (the original behaviour) one seaborn distribution plot is
        shown per column. Pass False to skip plotting.

    Returns
    -------
    pandas.DataFrame
        The transformed frame; the input frame is not modified.
    """
    activity_kinds = ('open', 'login', 'checkout')

    df = df.merge(users, how='left', on='user_id')
    df['grass_date'] = pd.to_datetime(df['grass_date'])

    # The indicator has to be derived before the mode fill below, otherwise
    # the "never" information would be overwritten and lost.
    for item in activity_kinds:
        day_col = 'last_' + item + '_day'
        df[day_col] = df[day_col].replace('Never ' + item, np.nan)
        df['never_' + item] = df[day_col].isnull().astype(int)

    # fill na with mode
    df = df.fillna(df.mode().iloc[0])

    # Only safe after the fill: NaN cannot be cast to int.
    for item in activity_kinds:
        day_col = 'last_' + item + '_day'
        df[day_col] = df[day_col].astype(int)

    if plot:
        for col in df.columns:
            sns.displot(df[col])
            plt.xlabel(str(col))
            plt.show()

    # Name the encoded column explicitly. Without `columns=` get_dummies
    # encodes *every* object column and the one-element prefix list only
    # works while exactly one such column happens to exist.
    df = pd.get_dummies(df, columns=['domain'], prefix='domain')

    # Country codes are categorical labels, not magnitudes, so they are
    # stringified and one-hot encoded as well.
    df['country_code'] = df['country_code'].astype(str)
    df = pd.get_dummies(df, columns=['country_code'], prefix='country_code')
    print(df.dtypes)
    return df
    

In [None]:
def perf_measure(y_actual, y_pred):
    """Print the Matthews correlation coefficient (MCC) of binary predictions.

    MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)), rounded to
    five decimals. When the denominator is zero (for example every prediction
    falls in one class) 0.0 is printed instead of raising ZeroDivisionError.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth labels (0 or 1). Values are compared by position, so a
        pandas Series with an arbitrary index is fine.
    y_pred : array-like
        Predicted labels (0 or 1), same length as ``y_actual``.

    Returns
    -------
    None
        The score is printed, not returned.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    actual = np.asarray(y_actual).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            'y_actual and y_pred must have the same length, got %d and %d'
            % (actual.size, predicted.size)
        )

    predicted_pos = predicted == 1
    predicted_neg = predicted == 0

    # Plain Python ints keep the four-way product below from overflowing.
    TP = int(np.count_nonzero(predicted_pos & (actual == 1)))
    FP = int(np.count_nonzero(predicted_pos & (actual != 1)))
    TN = int(np.count_nonzero(predicted_neg & (actual == 0)))
    FN = int(np.count_nonzero(predicted_neg & (actual != 0)))

    denominator = ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) ** 0.5
    # By convention MCC is 0 when any confusion-matrix margin is empty.
    mcc = ((TP * TN) - (FP * FN)) / denominator if denominator else 0.0

    print(np.round(mcc, 5))

In [None]:
# Read the labelled events and run them straight through the shared
# preprocessing pipeline.
train = preprocessing(
    pd.read_csv('/kaggle/input/shopee-code-league-20/_DA_Marketing_Analytics/train.csv')
)
train.head()

In [None]:
# The competition's test.csv is not loaded here. Instead 10% of the labelled
# data is held out so that predictions can be scored against known
# `open_flag` values further down.
# NOTE: this rebinds `train`; re-running the cell without re-running the
# loading cell shrinks the training set again.
train, test = train_test_split(
    train,
    random_state=42,
    test_size=0.1,
)
test.head()

In [None]:
# Scale every feature to [0, 1]. The scaler is fitted on the training rows
# only and then applied unchanged to the hold-out rows.
# `user_id`, `grass_date` and the target `open_flag` are left out of the
# feature matrix; Index.difference returns the remaining columns sorted, so
# train and test share the same column order.
normalize = MinMaxScaler()
num_train = normalize.fit_transform(
    train.loc[:, train.columns.difference(['user_id','grass_date','open_flag'])].to_numpy()
)
num_test = normalize.transform(
    test.loc[:, test.columns.difference(['user_id','grass_date','open_flag'])].to_numpy()
)
y_train = train['open_flag'].to_numpy()

# Neural Network Model

In [None]:
import tensorflow as tf
from keras import backend as K
def matthews_correlation_coefficient(y_true, y_pred):
    """Matthews correlation coefficient written with Keras backend ops.

    Each confusion-matrix cell is the sum of an element-wise product of
    ``y_true`` / ``1 - y_true`` with ``y_pred`` / ``1 - y_pred``, clipped to
    [0, 1] and rounded. ``K.epsilon()`` is added under the square root so the
    result stays finite when the denominator would be zero.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels.
    y_pred : tensor
        Model outputs (the model here ends in a sigmoid).

    Returns
    -------
    tensor
        Scalar MCC for the values passed in.
    """
    def _rounded_count(product):
        # Clip first so rounding can only ever yield 0 or 1 per element.
        return K.sum(K.round(K.clip(product, 0, 1)))

    not_true = 1 - y_true
    not_pred = 1 - y_pred

    true_pos = _rounded_count(y_true * y_pred)
    true_neg = _rounded_count(not_true * not_pred)
    false_pos = _rounded_count(not_true * y_pred)
    false_neg = _rounded_count(y_true * not_pred)

    numerator = true_pos * true_neg - false_pos * false_neg
    denominator = (
        (true_pos + false_pos)
        * (true_pos + false_neg)
        * (true_neg + false_pos)
        * (true_neg + false_neg)
    )
    return numerator / K.sqrt(denominator + K.epsilon())

In [None]:
# Fully connected binary classifier over the scaled feature vector.
inputNum = Input(shape=(num_train.shape[1],))

# (units, dropout rate) for each hidden layer, in order; None means the layer
# is not followed by dropout.
hidden_layers = [
    (128, 0.1),
    (128, 0.1),
    (64, 0.1),
    (32, None),
    (16, None),
    (8, None),
]

output = inputNum
for layer_units, dropout_rate in hidden_layers:
    output = Dense(units=layer_units, activation='relu')(output)
    if dropout_rate is not None:
        output = Dropout(dropout_rate)(output)

# A single sigmoid unit gives the predicted probability of `open_flag == 1`.
output = Dense(units=1, activation='sigmoid')(output)

model = Model(inputNum, output)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[matthews_correlation_coefficient],
)
model.summary()

In [None]:
# Render the architecture diagram to model_plot.png, including tensor shapes
# and layer names.
plot_model(
    model,
    to_file='model_plot.png',
    show_shapes=True,
    show_layer_names=True,
)

In [None]:
# Training stops after 5 epochs without improvement in validation loss.
# (Stopping on 'val_matthews_correlation_coefficient' with mode='max' was the
# alternative that was tried and disabled.)
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    verbose=1,
)

# Separately, keep on disk the weights of the epoch with the best validation
# MCC, which is the score this notebook evaluates with.
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_matthews_correlation_coefficient',
    mode='max',
    save_best_only=True,
    verbose=1,
)

hist = model.fit(
    num_train,
    y_train,
    epochs=50,
    batch_size=10,
    shuffle=True,
    validation_split=0.1,
    callbacks=[early_stop, checkpoint],
)

In [None]:
# Plot the per-epoch training/validation curves.
# Read them from `hist` (the History object returned by `fit`) rather than
# `model.history`: `model` is rebound or re-fit by a later cell, after which
# `model.history` no longer holds this training run.
losses = pd.DataFrame(hist.history)
losses.plot()
plt.show()

In [None]:
# 1-based epoch number with the best validation MCC (first one on ties).
# Taken from `hist` rather than `model.history`, which is lost or overwritten
# once `model` is reloaded or re-fit by a later cell.
val_mcc_per_epoch = hist.history['val_matthews_correlation_coefficient']
epoch_with_highest_val_mcc = max(enumerate(val_mcc_per_epoch), key=itemgetter(1))[0] + 1
epoch_with_highest_val_mcc

In [None]:
# 1-based epoch number with the lowest validation loss (first one on ties).
# Taken from `hist` rather than `model.history`, which is lost or overwritten
# once `model` is reloaded or re-fit by a later cell.
val_loss_per_epoch = hist.history['val_loss']
epoch_with_lowest_val_loss = min(enumerate(val_loss_per_epoch), key=itemgetter(1))[0] + 1
epoch_with_lowest_val_loss

In [None]:
# Keras resolves custom objects by the name stored in the saved model, which
# is the metric function's own name. The 'val_' prefix only appears on the
# logged validation key, so registering the function under that name meant
# the metric was never recognised and the checkpoint could not be loaded.
dependencies = {'matthews_correlation_coefficient': matthews_correlation_coefficient}

try:
    model = load_model('best_model.h5', custom_objects=dependencies)
    print('best model loaded')
except Exception as err:
    # Catch Exception rather than everything, so an interrupt still stops
    # the cell, and show the reason instead of hiding it.
    print('could not load best model')
    print(repr(err))
    # Fallback: train the in-memory model for as many epochs as the best
    # validation-MCC run needed. NOTE(review): this continues from the
    # already-trained weights rather than from a fresh model -- confirm that
    # is intended.
    model.fit(num_train,y_train,epochs=epoch_with_highest_val_mcc ,batch_size=10,shuffle=True)

# Predict outside the try block (not in `finally`), so a failure in the
# fallback fit is not followed by predictions from a half-trained model.
pred = (model.predict(num_test) > 0.5).astype("int32")

In [None]:
# `pred` has one column (the model ends in a single sigmoid unit). Flatten it
# directly instead of going through nested lists and comma-joined strings.
# reshape(len(pred)) raises if `pred` ever has more than one column, just as
# the old string-to-int conversion did.
result = pd.DataFrame({'open_flag': pred.reshape(len(pred)).astype(int)})
result['row_id'] = result.index
result[sample_submission.columns].head()

In [None]:
# MCC of the neural network on the held-out rows.
perf_measure(
    y_actual=test['open_flag'],
    y_pred=result['open_flag'],
)

# Logistic Regression

In [None]:
# Linear baseline trained on the same scaled features as the neural network.
clf = LogisticRegression(random_state=0)
clf.fit(num_train, y_train)
pred2 = clf.predict(num_test)

In [None]:
# Prediction frame for the logistic-regression baseline.
result2 = pd.DataFrame({'open_flag': pred2})
# Use this frame's own index for row_id. It used to be copied from the
# neural-network `result` frame, which only worked while that cell had
# already run and happened to have the same length.
result2['row_id'] = result2.index
result2[sample_submission.columns].head()

In [None]:
# MCC of the logistic-regression baseline on the held-out rows.
perf_measure(
    y_actual=test['open_flag'],
    y_pred=result2['open_flag'],
)