In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import random
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.ensemble import *
from sklearn.metrics import *
from scipy.stats import pearsonr
%matplotlib inline

We first read the csv data.

In [None]:
# Read the competition CSV files; every frame is indexed by the sample id `sig_id`.
DATA_DIR = '/kaggle/input/lish-moa'
df_features = pd.read_csv(f'{DATA_DIR}/train_features.csv', index_col='sig_id')
df_test_features = pd.read_csv(f'{DATA_DIR}/test_features.csv', index_col='sig_id')
df_sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv', index_col='sig_id')
df_targets = pd.read_csv(f'{DATA_DIR}/train_targets_scored.csv', index_col='sig_id')

# Join features and targets side by side on the shared sig_id index.
# The default axis=0 would stack the two frames vertically and fill the
# non-overlapping columns with NaN.
df_features_targets = pd.concat([df_features, df_targets], axis=1)

In [None]:
# Preview the first five rows of the training features.
df_features.head(n=5)

In [None]:
# Preview the first five rows of the test features.
df_test_features.head(n=5)

In [None]:
# look at columns
df_features.columns

# 875 columns: [cp_type, cp_time, cp_dose, g-0 ... g-771, c-0 ... c-99]

In [None]:
# Compare how often each category of cp_type / cp_dose occurs in the training and test sets.
fig, (cp_type_bar, cp_dose_bar) = plt.subplots(nrows=1, ncols=2, figsize=[12, 6])
bar_width = 1.0

for axis, column in ((cp_type_bar, 'cp_type'), (cp_dose_bar, 'cp_dose')):
    training_count = df_features[column].value_counts()
    # value_counts() orders by frequency, which may differ between the two sets;
    # reindex so that each tick label describes both of the bars drawn at that tick.
    test_count = df_test_features[column].value_counts().reindex(training_count.index, fill_value=0)

    # One group of two adjacent bars per category, with a one-bar gap between groups.
    training_positions = np.arange(len(training_count)) * 3
    axis.bar(training_positions, training_count, width=bar_width, label='training')
    axis.bar(training_positions + 1, test_count, width=bar_width, label='test')
    axis.set_xticks(training_positions + 0.5)
    axis.set_xticklabels(training_count.index)
    axis.set_ylabel('Number of samples')
    # Each panel is titled with its own column (the cp_type panel used to carry
    # the cp_dose title) and mentions both data sets that are plotted.
    axis.set_title(f'Frequency of {column} in training and test')
    axis.legend()

plt.show()

In [None]:
# Plot the density-normalised histogram (empirical pdf) of 9 randomly chosen g- features.
nrows, ncols = 3, 3
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=[36,24])
fig.tight_layout(pad=12.0)
# plt.get_cmap is the supported accessor; matplotlib.cm.get_cmap was removed in matplotlib 3.9.
cmap = plt.get_cmap("tab10")
colors = cmap.colors
for i in range(nrows):
    for j in range(ncols):
        feature = random.randint(0, 771)  # randint is inclusive: g-0 ... g-771
        axis = ax[i][j]
        # Row-major panel index, so every panel gets its own colour
        # (2 * i + j repeats colours on a 3-column grid).
        axis.hist(df_features[f'g-{feature}'], bins=100, density=True, color=colors[ncols * i + j])
        axis.set_title(f'pdf for g-{feature}', {'fontsize': 32})
        axis.set_xlabel("Numerical value in training set", {'fontsize': 18})
        axis.set_ylabel("Probability density", {'fontsize': 18})

plt.show()

In [None]:
# Some slightly skewed data for features in g
# (skewed candidates noted so far: g-744, g-123, g-489, g-644, g-23, g-413, g-307, g-238)
nrows, ncols = 2, 2
fig, ax = plt.subplots(figsize=[24, 18], nrows=nrows, ncols=ncols)
skewed_g = [['g-744', 'g-123'], ['g-489', 'g-644']]
# plt.get_cmap is the supported accessor; matplotlib.cm.get_cmap was removed in matplotlib 3.9.
cmap = plt.get_cmap("Set2")
colors = cmap.colors
for i in range(nrows):
    for j in range(ncols):
        axis = ax[i][j]
        # Row-major panel index keeps the colours distinct for any grid width.
        axis.hist(df_features[skewed_g[i][j]], bins=100, density=True, color=colors[ncols * i + j])
        # Explicit title size so the panels stay readable on this 24x18 figure.
        axis.set_title(f'pdf for {skewed_g[i][j]}', {'fontsize': 24})
        axis.set_xlabel("Numerical value in training set", {'fontsize': 18})
        axis.set_ylabel("Probability density", {'fontsize': 18})
plt.show()

In [None]:
# Some more slightly skewed data for features in g
nrows, ncols = 2, 2
fig, ax = plt.subplots(figsize=[24, 18], nrows=nrows, ncols=ncols)
skewed_g = [['g-23', 'g-413'], ['g-307', 'g-238']]
# plt.get_cmap is the supported accessor; matplotlib.cm.get_cmap was removed in matplotlib 3.9.
cmap = plt.get_cmap("Set2")
colors = cmap.colors
for i in range(nrows):
    for j in range(ncols):
        axis = ax[i][j]
        # Row-major panel index keeps the colours distinct for any grid width.
        axis.hist(df_features[skewed_g[i][j]], bins=100, density=True, color=colors[ncols * i + j])
        axis.set_title(f'pdf for {skewed_g[i][j]}', {'fontsize': 24})
        axis.set_xlabel("Numerical value in training set", {'fontsize': 16})
        axis.set_ylabel("Probability density", {'fontsize': 16})
plt.show()

In [None]:
# Peer into frequency count for g-307: cut its value range into 100 intervals
# and count the training samples that fall into each one.
g_307_bins = pd.cut(df_features['g-307'], 100)
# observed=False keeps empty intervals in the result and states the behaviour
# explicitly instead of relying on the (changing) default for categorical groupers.
g_307_series = df_features.groupby(g_307_bins, observed=False)['g-307'].count()
g_307_series.sort_values(ascending=False)

In [None]:
# Bar chart of the ten g-307 intervals that contain the most training samples.
top_k = 10
bar_positions = range(top_k)
g_307_nlargest_series = g_307_series.nlargest(top_k)

fig, ax = plt.subplots(figsize=[18, 10])
ax.bar(bar_positions, g_307_nlargest_series)
# Label each bar with the interval it represents.
ax.set_xticks(bar_positions)
ax.set_xticklabels(g_307_nlargest_series.index)
ax.set_title("Intervals with highest frequency in training for g-307", {'fontsize': 18})

plt.show()

In [None]:
# Plot the density-normalised histogram (empirical pdf) of 9 randomly chosen c- features.
nrows, ncols = 3, 3
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=[36,24])
fig.tight_layout(pad=12.0)
# plt.get_cmap is the supported accessor; matplotlib.cm.get_cmap was removed in matplotlib 3.9.
cmap = plt.get_cmap("tab10")
colors = cmap.colors
for i in range(nrows):
    for j in range(ncols):
        feature = random.randint(0, 99)  # randint is inclusive: c-0 ... c-99
        axis = ax[i][j]
        # Row-major panel index, so every panel gets its own colour
        # (i * 2 + j repeats colours on a 3-column grid).
        axis.hist(df_features[f'c-{feature}'], bins=100, density=True, color=colors[ncols * i + j])
        axis.set_title(f'pdf for c-{feature}', {'fontsize': 32})
        axis.set_xlabel("Numerical value in training set", {'fontsize': 18})
        axis.set_ylabel("Probability density", {'fontsize': 18})

plt.show()

In [None]:
# RobustScaler transforms each feature by subtracting its median and then dividing by its interquartile range (25% - 75%)
df_copy = df_features.copy(deep=True)
# Encode the two string-valued columns as 0/1 integers so that every column is numeric.
df_copy['cp_type'] = (df_copy['cp_type'] == "ctl_vehicle").astype(int)
df_copy['cp_dose'] = (df_copy['cp_dose'] == "D2").astype(int)
scaler = RobustScaler()
X = scaler.fit_transform(df_copy.values)

# Keep the sig_id index so that rows of df_X stay aligned with df_features / df_targets.
df_X = pd.DataFrame(X, index=df_copy.index, columns=df_copy.columns)

In [None]:
# New pdf after applying RobustScaler, for one random g- feature.
# g- columns run from g-0 to g-771; the previous lower bound of 3 could never select g-0, g-1 or g-2.
feature = random.randint(0, 771)
column_name = f'g-{feature}'

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[12,10])
ax.hist(df_features[column_name], density=True, bins=100)
ax.hist(df_X[column_name], density=True, bins=100, alpha=0.5, color='red')
# The feature is random, so the figure has to say which one it shows.
ax.set_title(f'pdf for {column_name}')
ax.legend(['Before', 'After'])
plt.show()

In [None]:
# New pdf for 9 random features, before and after RobustScaler
nrows, ncols = 3, 3

fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=[36,24])

for i in range(nrows):
    for j in range(ncols):
        axis = ax[i][j]
        # randrange excludes its upper bound, so the position is always a valid column.
        # random.randint(0, 875) is inclusive and could return 875, one past the last column.
        feature = random.randrange(len(df_copy.columns))
        column_name = df_copy.columns[feature]
        # "Before" comes from df_copy (same values as df_features, but with cp_type and
        # cp_dose encoded as 0/1) so that a string column is never histogrammed
        # against the numeric scaled column.
        axis.hist(df_copy.iloc[:, feature], density=True, bins=100)
        axis.hist(df_X.iloc[:, feature], density=True, bins=100, alpha=0.5, color='red')
        axis.set_title(f'pdf for {column_name}', {'fontsize': 30})
        axis.legend(['Before', 'After'])

plt.show()

In [None]:
# RobustScaler on slightly skewed data: original distribution vs. scaled distribution
nrows, ncols = 2, 2
fig, ax = plt.subplots(figsize=[24, 18], nrows=nrows, ncols=ncols)
skewed_g = [['g-23', 'g-413'], ['g-307', 'g-238']]
for i in range(nrows):
    for j in range(ncols):
        axis = ax[i][j]
        axis.hist(df_features[skewed_g[i][j]], bins=100, density=True)
        axis.hist(df_X[skewed_g[i][j]], bins=100, density=True, color='red', alpha=0.5)
        axis.set_title(f'pdf for {skewed_g[i][j]}', {'fontsize': 24})
        axis.set_xlabel("Numerical value in training set", {'fontsize': 16})
        axis.set_ylabel("Probability density", {'fontsize': 16})
        # Two histograms are overlaid, so say which is which.
        axis.legend(['Before', 'After'])
plt.show()

In [None]:
# List the name of every target column, one per line.
for target_name in df_targets.columns.tolist():
    print(target_name)

In [None]:
# Number of targets per training sample: the row sum of df_targets
# (assumes the target columns hold 0/1 flags - TODO confirm).
targets_per_sample = df_targets.sum(axis=1)
# How many samples have each distinct number of targets, ordered by that number.
num_targets_count = targets_per_sample.value_counts().sort_index()
num_targets_dict = num_targets_count.to_dict()

fig, ax = plt.subplots(figsize=[12,10])
ax.bar(num_targets_count.index, num_targets_count.values)
ax.set_xticks(num_targets_count.index)
ax.set_xlabel("Number of targets", {'fontsize': 14})
ax.set_ylabel("Number of training instances", {'fontsize': 14})
# df_targets is loaded from train_targets_scored.csv, so these are training instances.
ax.set_title("Frequency of training instances for x number of targets", {'fontsize': 18})
plt.show()

In [None]:
# Pearson correlation between every (numerically encoded) feature column and every target column.
n_features = df_copy.shape[1]
# rowvar=False: each column is a variable and each row an observation.
correlation_matrix = np.corrcoef(df_copy, df_targets, rowvar=False)
# remove instances where feature is correlated to feature, and target is correlated to target:
# keep the feature rows and the target columns only (n_features x n_targets)
correlation_features = correlation_matrix[:n_features, n_features:]
df_correlation = pd.DataFrame(correlation_features, index=df_features.columns, columns=df_targets.columns)

In [None]:
# plot correlation matrix for a random window of 10 consecutive features and 10 consecutive targets
window = 10
n_feature_rows, n_target_cols = correlation_features.shape
# randint is inclusive; the largest start that still leaves a full window is size - window.
feature = random.randint(0, n_feature_rows - window)
target = random.randint(0, n_target_cols - window)

correlation_features_submatrix = correlation_features[feature:feature + window, target:target + window]
fig, ax = plt.subplots(figsize=[10,10])
ax.imshow(correlation_features_submatrix)

# Loop over data dimensions and create text annotations.
for i in range(window):
    for j in range(window):
        ax.text(j, i, round(correlation_features_submatrix[i, j], 4),
                ha="center", va="center", color="w")

# Exactly one label per tick: the previous `+ 11` produced 11 labels for 10 ticks and
# could index one past the last column when the window touched the end.
ax.set_yticks(range(window))
ax.set_yticklabels(df_features.columns[feature:feature + window])
ax.set_xticks(range(window))
ax.set_xticklabels(df_targets.columns[target:target + window])
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Only the last expression of a cell is rendered automatically, so the preview of the
# correlation table has to be displayed explicitly (display is provided by IPython).
display(df_correlation.head())
# The 20 features with the largest correlation to the hdac_inhibitor target.
df_correlation['hdac_inhibitor'].nlargest(20)

In [None]:
# # %% [code]
# def change_df(df):
#     df['cp_type'] = df['cp_type'].apply(lambda x: "1" if x == "ctl_vehicle" else 0)
#     df['cp_dose'] = df['cp_dose'].apply(lambda x: "1" if x == "D2" else 0)
#     return df

# # transform training instances of 'cp_type' and 'cp_dose' to boolean attribute
# df_features = change_df(df_features)
# # transform test instances of 'cp_type' and 'cp_dose' to boolean attribute
# df_test_features = change_df(df_test_features)

# # %% [code]
# df_features

# # %% [code]
# from sklearn.model_selection import *
# from sklearn.preprocessing import *
# from sklearn.ensemble import *
# from sklearn.metrics import *

# # %% [code]
# # change from numpy data frame to array
# X = df_features.values
# y = df_targets.values
# X_test = df_test_features.values

# # %% [code]
# #scaler = MinMaxScaler(feature_range=(1,2))

# # RobustScaler transforms the feature vector by subtracting the median and then dividing by the interquartile range (25% - 75%)
# scaler = RobustScaler()
# X = scaler.fit_transform(X)
# X_test = scaler.transform(X_test)

# # %% [code]
# # Generate a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree
# poly = PolynomialFeatures(2)
# poly.fit(X)

# # %% [code]
# import keras
# import tensorflow as tf
# import tensorflow_addons as tfa
# from keras.models import *
# from keras.layers import *
# from keras.backend import *
# from keras.callbacks import *
# from keras.optimizers import *

# # %% [code]
# batch_size = 512
# class FeatureGenerator(keras.utils.Sequence):
#     'Generates data for Keras'
#     def __init__(self, X, y, batch_size=32, shuffle=True):
#         'Initialization'
#         self.X = X
#         self.y = y
#         self.batch_size = batch_size
#         self.shuffle = shuffle
#         self.on_epoch_end()

#     def __len__(self):
#         'Denotes the number of batches per epoch'
#         return int(np.floor(len(self.X) / self.batch_size)) + 1

#     def __getitem__(self, index):
#         'Generate one batch of data'
#         # Generate indexes of the batch

#         indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        
#         X_batch = self.X[indexes]
#         y_batch = self.y[indexes]
        
#         #X_batch = poly.transform(X_batch)
#         #X_batch = np.concatenate([X_batch, np.log(X_batch), np.exp(X_batch)], axis=1)
#         y_batch = y_batch.astype('float')
#         return X_batch, y_batch

#     def on_epoch_end(self):
#         'Updates indexes after each epoch'
#         self.indexes = np.arange(len(self.X))
#         if self.shuffle == True:
#             np.random.shuffle(self.indexes)

# # %% [code]
# def get_model():
#     model_input = Input(shape=(len(train_generator.__getitem__(0)[0][0]), ))

#     #x = GaussianNoise(0.4)(model_input)
#     """x = Dense(256, activation='swish')(model_input)
#     x = BatchNormalization()(x)
#     x = Dropout(0.5)(x)

#     x = expand_dims(x, axis=1)
#     x_1 = Conv1D(256, 1, padding='same', use_bias=False, kernel_initializer='he_normal', activation='swish')(x)
#     x_2 = Conv1D(256, 1, padding='same', use_bias=False, kernel_initializer='he_normal', activation='swish')(x)
#     x_att = Multiply()([x_1, x_2])
#     x_att = Activation('softmax')(x_att)

#     x_g = Conv1D(256, 1, padding='same', use_bias=False, kernel_initializer='he_normal', activation='swish')(x)
#     x_y = Multiply()([x_g, x_att])

#     x = Add()([x, x_y])
#     x = squeeze(x, axis=1)

#     """
#     x = Dense(1024, activation='swish')(model_input)
#     x = BatchNormalization()(x)
#     x = Dropout(0.5)(x)
#     x_1 = x

#     x = Dense(512, activation='swish')(x)
#     x = BatchNormalization()(x)
#     x = Dropout(0.5)(x)
#     x_2 = x

#     x = Concatenate()([x, x_1])
#     x = Dense(256, activation='swish')(x)
#     x = BatchNormalization()(x)
#     x = Dropout(0.5)(x)

#     x = Concatenate()([x, x_2])
#     model_output = Dense(len(y_train[0]), activation='sigmoid')(x)
#     model = Model(inputs = model_input,outputs = model_output)
#     model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
#     model.summary()
#     return model

# """
# model = Sequential()
# model.add(GaussianNoise(0.01, input_dim=len(train_generator.__getitem__(0)[0][0])))
# model.add(Dense(512, activation='swish'))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))
# model.add(Dense(256, activation='swish'))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))
# model.add(Dense(len(y_train[0]), activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])
# #tfa.losses.SigmoidFocalCrossEntropy
# model.summary()"""


# # %% [code]
# kf = KFold(n_splits=5)
# models = []
# losses = []
# for train_index, valid_index in kf.split(X):
#     X_train, y_train = X[train_index], y[train_index]
#     X_valid, y_valid = X[valid_index], y[valid_index]
#     train_generator = FeatureGenerator(X_train, y_train, batch_size=batch_size)
#     print("HERE")
#     print(len(train_generator))
#     print("AFTER")
#     valid_generator = FeatureGenerator(X_valid, y_valid, batch_size=batch_size, shuffle=False)
#     test_generator = FeatureGenerator(X_test, np.zeros(len(X_test)), batch_size=batch_size, shuffle=False)
    
#     model = get_model()
    
#     reducelr = ReduceLROnPlateau(patience=3,verbose=1)
#     checkpoint = ModelCheckpoint('/checkpoint', monitor='val_loss', verbose=1, save_best_only=True,save_weights_only=True)
#     earlystopping = EarlyStopping(patience=10)
    
#     #model.fit(train_generator, epochs=200, batch_size=batch_size, validation_data=valid_generator, callbacks=[reducelr, checkpoint, earlystopping])
    
#     #models.append(model)
    
#     #print(log_loss(y_valid, model.predict(valid_generator)) / len(y_valid[0]))
#     #losses.append(log_loss(y_valid, model.predict(valid_generator)) / len(y_valid[0]))

# # %% [code]
# print(losses)

# # %% [code]
# test_generator = FeatureGenerator(X_test, np.zeros(len(X_test)), batch_size=batch_size, shuffle=False)
# y_test = np.mean([model.predict(test_generator) for model in models], axis=0)

# # %% [code]
# df_sample_submission[:] = y_test
# df_sample_submission

# # %% [code]
# df_sample_submission.to_csv('submission.csv')

# # %% [code]