In [None]:
### Imports

import time

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import numpy as np 
import pandas as pd 

import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

import category_encoders as ce

In [None]:
### Settings

SEED = 21          # random seed: used by train_test_split and for the global RNG seeding below
TEST_SIZE = 0.33   # passed to train_test_split as test_size (validation fraction)
EPOCHS = 10        # presumably consumed by the model-fitting code, which is not in this view — confirm
BATCH_SIZE = 128   # presumably consumed by the model-fitting code, which is not in this view — confirm

# SEED was previously only handed to train_test_split, leaving network weight
# initialisation and dropout unseeded. Seed the global NumPy and TensorFlow
# generators as well so a Restart & Run All is as repeatable as possible.
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [None]:
### Create dataframes

TEST_FEATURES_PATH = "/kaggle/input/lish-moa/test_features.csv"
TRAIN_FEATURES_PATH = "/kaggle/input/lish-moa/train_features.csv"
TRAIN_TARGETS_PATH = "/kaggle/input/lish-moa/train_targets_scored.csv"
TRAIN_TARGETS_NONSCORED_PATH = "/kaggle/input/lish-moa/train_targets_nonscored.csv"
SAMPLE_SUB_PATH = "/kaggle/input/lish-moa/sample_submission.csv"

# Every frame is sorted by 'sig_id' so that frames which are later combined
# by row position (features vs. targets, test features vs. submission) line up.
test_features_df = pd.read_csv(TEST_FEATURES_PATH).sort_values(by='sig_id')
train_features_df = pd.read_csv(TRAIN_FEATURES_PATH).sort_values(by='sig_id')
train_targets_df = pd.read_csv(TRAIN_TARGETS_PATH).sort_values(by='sig_id')
# Sorted like the others for consistency (it was the only frame left in file order).
train_targets_nonscored_df = pd.read_csv(TRAIN_TARGETS_NONSCORED_PATH).sort_values(by='sig_id')
sample_sub_df = pd.read_csv(SAMPLE_SUB_PATH).sort_values(by='sig_id')

# The split cell pairs train features with train targets by position, and the
# output cell writes test predictions into sample_sub_df by position. Fail
# loudly here instead of silently training on / submitting misaligned rows.
assert np.array_equal(train_features_df['sig_id'].values, train_targets_df['sig_id'].values), \
    'train features and train targets do not share the same sig_id order'
assert np.array_equal(test_features_df['sig_id'].values, sample_sub_df['sig_id'].values), \
    'test features and sample submission do not share the same sig_id order'

In [None]:
# Encode categorical features
#
# One encoder is fit on the TRAINING features and then reused for the test
# features. Fitting a second encoder on the test set (as was done before)
# lets it learn its own category-to-bit mapping, which is not guaranteed to
# match the mapping the model is trained on.
enc = ce.BinaryEncoder(cols=['cp_type', 'cp_dose','cp_time']).fit(train_features_df)

# 'sig_id' is an identifier, not a feature, so it is dropped after encoding.
train_features_enc_df = enc.transform(train_features_df).drop(columns=['sig_id'])

# Same fitted encoder -> same columns and same bit patterns as the training data.
test_features_enc_df = enc.transform(test_features_df).drop(columns=['sig_id'])

In [None]:
### Verify: preview the first rows of the encoded training features

train_features_enc_df.head()


In [None]:
### check how many 1's are in each class

# Count the entries equal to 1 in every target column, then sort ascending.
# 'sig_id' is excluded: it is an identifier, and indexing its value_counts()
# with [1] does not yield a count of ones. Counting via eq(1).sum() also
# returns 0 for a class without positives, where value_counts()[1] would
# raise KeyError.
value_counts_arr = np.sort(
    train_targets_df.drop(columns=['sig_id']).eq(1).sum(axis=0).values
)

print(value_counts_arr)

# Summary statistics (count/mean/std/quartiles) of the per-class positive counts
print(pd.Series(value_counts_arr).describe())

In [None]:
### Plot histogram of 1s counts in classes

# Global default figure size; applies to the figure created below
matplotlib.rcParams['figure.figsize'] = [10, 5]

# Explicit Figure/Axes handles instead of the implicit pyplot state machine
fig, ax = plt.subplots()
ax.hist(value_counts_arr, bins=50, facecolor='g', alpha=0.75)
ax.set_xlabel('Number of 1\'s')
ax.set_ylabel('Number of classes')
ax.set_title('Value Counts of 1\'s in classes')
plt.show()

In [None]:
### Split training data into train/valid

# Features and targets are split together; 'sig_id' is removed from the targets first.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_features_enc_df,
    train_targets_df.drop(columns=['sig_id']),
    test_size=TEST_SIZE,
    random_state=SEED,
)

# Target columns that hold a single distinct value in the training fold
constant_cols = [
    name for name in y_train.columns
    if len(np.unique(y_train[name])) == 1
]

for name in constant_cols:
    print('Class {} only contains zeros'.format(name))

if not constant_cols:
    print('No classes have all zeros!')

In [None]:
### Verify: preview the first rows of the training-fold features
x_train.head()

In [None]:
def get_tf_model():
    """Build, compile and summarise the dense Keras classifier.

    Architecture: a Flatten input layer, three ReLU Dense layers
    (2000, 1000, 1000 units), each followed by BatchNormalization and
    Dropout(0.4), and a 206-unit sigmoid output layer. The model is compiled
    with the Adam optimizer, binary cross-entropy loss and an accuracy metric.

    Side effects:
        Prints the model summary.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # The layers were previously referenced through an alias `L` that no
    # visible cell defines (NameError on call); bind the namespace locally.
    layers = tf.keras.layers

    model = tf.keras.Sequential([
        # NOTE(review): input_shape is declared as (1, 879) while get_preds
        # passes 2-D feature frames to predict — confirm this is intended.
        layers.Flatten(input_shape=(1,879)),
        layers.Dense(2000, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(.4),
        layers.Dense(1000, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(.4),
        layers.Dense(1000, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(.4),
        # One sigmoid unit per output column
        layers.Dense(206, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss = 'binary_crossentropy',
        metrics=['accuracy']
    )

    model.summary()

    return model

In [None]:
### Make Predictions

def get_preds(model,final=False):
    """Return predicted probabilities from ``model`` as a 2-D NumPy array.

    Args:
        model: Either a Keras model (uses ``predict``) or an estimator that
            exposes ``predict_proba``.
        final: When true, predict on ``test_features_enc_df``; otherwise
            predict on the validation features ``x_valid``. Both are read
            from the notebook's global scope.

    Returns:
        For Keras models, the ``predict`` output cast to float64. For other
        models, the ``predict_proba`` output stacked into a 3-D array, reduced
        to the entry at index 1 of the last axis, and transposed.
    """
    features = test_features_enc_df if final else x_valid

    # Detect Keras by type rather than only by the substring 'tensorflow' in
    # the class path: Keras model classes can live under a 'keras.' module
    # path, which the substring test misses. The substring test is kept as a
    # fallback so anything that matched before still matches.
    is_keras = isinstance(model, tf.keras.Model) or 'tensorflow' in str(type(model))

    if is_keras:
        return np.array(model.predict(features).astype("float64"))

    # The stacked predict_proba result is indexed as a 3-D array; presumably
    # one (n_samples, n_classes) array per output, as a multi-output sklearn
    # wrapper returns — confirm against the models actually passed in.
    preds = np.array(model.predict_proba(features))

    # Keep the column at index 1 (presumably the positive class) and
    # transpose so the first two axes are swapped.
    return preds[:,:,1].T

In [None]:
### Calculate validation score

def calc_loss(vals,preds):
    """Print the log loss between flattened targets and flattened predictions.

    Args:
        vals: True target values; flattened with ``np.ravel`` before scoring.
        preds: Predicted probabilities; flattened the same way.

    Returns:
        None. The score is only printed.
    """
    flat_targets = np.ravel(vals)
    flat_preds = np.ravel(preds)

    print('Validation log loss score: {}'.format(log_loss(flat_targets, flat_preds)))
def run_model(model):
    """Fit ``model``, report its validation loss, and record its predictions.

    Validation predictions are appended to the global list ``val_preds`` and
    test predictions to the global list ``final_preds``.

    NOTE(review): ``fit_model`` is not defined in the cells visible here —
    confirm it is defined before this function is called.
    """
    # Train first; everything below uses the fitted model.
    fit_model(model)

    print('Getting validation predictions...')
    valid_predictions = get_preds(model, final=False)
    calc_loss(y_valid, valid_predictions)
    val_preds.append(valid_predictions)

    print('Calculating final predictions...')
    test_predictions = get_preds(model, final=True)
    final_preds.append(test_predictions)

    print('Done')

In [None]:
# Accumulators filled by run_model: one entry per model that is run.
# Re-running this cell resets both lists.
val_preds = []    # validation-split predictions
final_preds = []  # test-set predictions


In [None]:
# Fit and evaluate one model; its predictions are appended to val_preds / final_preds.
# NOTE(review): `model_2` is not defined in any cell visible here — confirm it is
# created (e.g. via get_tf_model()) before this cell runs on a fresh kernel.
run_model(model_2) 

In [None]:
### Ensemble validation predictions

# Stack the per-model predictions and average across models (axis 0)
print('Ensembling validation predictions')
val_preds_avg = np.array(val_preds).mean(axis=0)

print('Ensembling final predictions')
final_predictions = np.array(final_preds).mean(axis=0)

In [None]:
### Calculate ensemble validation loss

print('Calculating ensemble validation loss...')

# Score the model-averaged validation predictions against the validation targets
calc_loss(y_valid,val_preds_avg)

In [None]:
### Insight into validation predictions

# Overall minimum and maximum of the averaged predictions
for reducer in (np.min, np.max):
    print(reducer(val_preds_avg))

# Per-column summary statistics
val_preds_summary = pd.DataFrame(val_preds_avg).describe()
print(val_preds_summary)

In [None]:
### Output final predictions

# Overwrite every column after the first with the averaged test predictions.
# The assignment is positional, so it relies on sample_sub_df and
# test_features_df both having been sorted by 'sig_id' when they were loaded.
# The first column (presumably 'sig_id' — confirm) is left untouched.
sample_sub_df.iloc[:,1:] = final_predictions
# Write the submission file without the DataFrame index column
sample_sub_df.to_csv('submission.csv',index=False)


In [None]:
### Insight into final predictions

# Summary statistics of the submission frame after the predictions were written into it
sample_sub_df.describe()