# About this notebook
This notebook is my first contact with this competition. That is, before reading the notebooks that have already been published, I want to do a quick exploration and submission on my own to get an idea of what this dataset looks like and what the main challenges are.

## 1. Import Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Load the competition tables: training inputs and scored targets first,
# then the test inputs and the submission template.
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
train_targets_scores = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')
# train_targets_nonscored.csv is intentionally not loaded: it is optional.

## 2. Quick EDA

In [None]:
# Preview the first rows of the submission template.
sample_submission.head()

In [None]:
# Preview the first rows of the test features.
test_features.head()

In [None]:
# Print the DataFrame.info() summary of the test features.
test_features.info()

In [None]:
# Show the column labels of the test features.
test_features.columns

In [None]:
# Print the DataFrame.info() summary of the training features.
train_features.info()

In [None]:
# Print the DataFrame.info() summary of the scored training targets.
train_targets_scores.info()

In [None]:
# Descriptive statistics of the training features.
train_features.describe()

In [None]:
# Descriptive statistics of the scored training targets.
train_targets_scores.describe()

In [None]:
# Preview the first rows of the training features.
train_features.head()

In [None]:
# Most frequent sig_id values; a top count of 1 means every id is unique.
train_features.sig_id.value_counts().head()

In [None]:
# Frequency table for each categorical treatment column, each followed by a
# horizontal rule so the tables are easy to tell apart in the output.
for col in ('cp_type', 'cp_time', 'cp_dose'):
    frequencies = train_features[col].value_counts()
    print(frequencies)
    print('=' * 80)

## 3. Preparing the Data

In [None]:
# Features and targets must carry the same sig_id on every row; only then is
# it safe to drop the id column from both and match rows by position.
assert (train_features.sig_id == train_targets_scores.sig_id).all()

In [None]:
# Model inputs: every training column except the row identifier.
X_train = train_features.drop(columns='sig_id')

In [None]:
# Cast cp_time to strings so it is one-hot encoded with the other
# categorical columns instead of being treated as a number.
X_train['cp_time'] = X_train['cp_time'].astype('str')

In [None]:
# Check the first two rows after the cast.
X_train.head(2)

In [None]:
# Columns that are one-hot encoded rather than scaled.
cat_cols=['cp_type', 'cp_time', 'cp_dose']

In [None]:
# Every remaining column is numeric and will be scaled.
num_cols = X_train.columns.drop(cat_cols).tolist()

In [None]:
from sklearn.preprocessing import RobustScaler
def prepare_X(X, cat_cols, num_cols, scaler=None):
    """Return a model-ready frame: one-hot categoricals plus scaled numerics.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame. It is NOT modified; a new frame is returned.
    cat_cols : list of str
        Columns to one-hot encode with ``pd.get_dummies``.
    num_cols : list of str
        Columns to scale.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. Pass the scaler fitted on the training data
        when preparing validation/test data so both are scaled with the same
        statistics. When omitted, a new ``RobustScaler`` is fitted on ``X``
        itself, which is the original behaviour of this function.

    Returns
    -------
    pandas.DataFrame
        The dummy columns followed by the scaled numeric columns, indexed
        like ``X``.
    """
    # Pin the dummy dtype: pandas >= 2.0 defaults to bool, and a frame mixing
    # bool and float columns converts to an object array instead of a numeric
    # one. uint8 is what older pandas versions produced by default.
    X_cat = pd.get_dummies(X[cat_cols], dtype=np.uint8)

    if scaler is None:
        scaled = RobustScaler().fit_transform(X[num_cols])
    else:
        scaled = scaler.transform(X[num_cols])

    # Build a fresh frame instead of writing the scaled values back into X,
    # so the caller's data is left untouched.
    X_num = pd.DataFrame(scaled, columns=num_cols, index=X.index)
    return pd.concat([X_cat, X_num], axis=1)

# Replace the raw training frame with its encoded and scaled version.
X_train=prepare_X(X_train, cat_cols, num_cols)
X_train.head()

In [None]:
# Targets: every scored label column, without the row identifier.
y_train = train_targets_scores.drop(columns='sig_id')
y_train.head()

In [None]:
# Same alignment check for the test features and the submission template:
# predictions are written into the template by row position.
assert (test_features.sig_id == sample_submission.sig_id).all()

In [None]:
# Build the test design matrix with the same preprocessing as the training
# data. Calling prepare_X here would fit a second RobustScaler on the test set
# itself, so test features would be scaled with different medians/IQRs than
# the ones the model was trained on.
X_test = test_features.drop('sig_id', axis=1)
X_test.cp_time = X_test.cp_time.astype('str')

# train_features still holds the raw, unscaled training values, so fitting on
# it gives the same scaling statistics that were used for X_train.
train_scaler = RobustScaler().fit(train_features[num_cols])
X_test_num = pd.DataFrame(
    train_scaler.transform(X_test[num_cols]),
    columns=num_cols,
    index=X_test.index,
)
# uint8 keeps the dummies numeric on every pandas version (bool since 2.0).
X_test_cat = pd.get_dummies(X_test[cat_cols], dtype=np.uint8)
X_test = pd.concat([X_test_cat, X_test_num], axis=1)

# Force the exact training column set and order; a category absent from the
# test set becomes an all-zero dummy column.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.head()

In [None]:
# Look at the prepared training frame again, to compare with X_test.
X_train.head()

## 4. Importing and Training Baseline Model

In [None]:
# Width of the network input: one unit per prepared feature column.
input_units=X_train.shape[1]

In [None]:
# Width of the network output: one unit per target column.
output_units=y_train.shape[1]

In [None]:
# Batch size giving roughly 150 batches over the full training frame. With the
# 20% validation split used when fitting, that is about 120 steps per epoch.
batch_size=X_train.shape[0]//150

In [None]:
# Show the resulting batch size.
batch_size

In [None]:
# Import keras dependencies here
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

# Checkpoint callback: keep only the best model seen so far, judged by
# validation loss, in best_model.hdf5.
mc = ModelCheckpoint(filepath='best_model.hdf5', save_best_only=True,
                     verbose=1, monitor='val_loss')

# Baseline network: one hidden ReLU layer half as wide as the input, then one
# sigmoid unit per target so each label gets an independent probability.
model = Sequential([
    Dense(units=input_units//2, input_shape=(input_units,), activation='relu'),
    Dense(output_units, activation='sigmoid'),
])

# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by current Keras releases.
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss='binary_crossentropy')

In [None]:
# Train the baseline for 50 epochs, holding out 20% of the rows for
# validation; the checkpoint callback keeps the best weights on disk.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=50,
    validation_split=0.2,
    callbacks=[mc],
)

In [None]:
# Restore the weights saved by the checkpoint callback.
model.load_weights('best_model.hdf5')

In [None]:
# Fine-tune from the restored weights with a 10x smaller learning rate.
# Recompiling installs a new optimizer with the new rate.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by current Keras releases.
model.compile(optimizer=Adam(learning_rate=1e-5),
              loss='binary_crossentropy')
model.fit(X_train, y_train, validation_split=0.2, batch_size=batch_size,
          epochs=50, callbacks=[mc])

In [None]:
# Restore the checkpointed weights again after the fine-tuning run.
model.load_weights('best_model.hdf5')

In [None]:
# Sanity check: predict on the first five training rows.
y_pred = model.predict(X_train.head(5))

In [None]:
# Inspect the raw predictions for those rows.
y_pred

In [None]:
# Shape of the sample predictions.
y_pred.shape

In [None]:
# Shape of the prepared test frame.
X_test.shape

In [None]:
# NOTE(review): duplicate of the previous cell; safe to delete.
X_test.shape

In [None]:
# Predict on the full test set.
y_pred_test=model.predict(X_test)

## 5. Prepare Submission

In [None]:
# Every template column except the id receives a prediction.
pred_cols = sample_submission.columns.drop('sig_id')

In [None]:
# Overwrite the template's prediction columns with the model output. Rows are
# matched by position, which is why the sig_id alignment was asserted earlier.
sample_submission[pred_cols]=y_pred_test

In [None]:
# Preview the filled-in submission.
sample_submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)