In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file shipped under the Kaggle input directory, one full path per line.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

In [None]:
train_features = pd.read_csv("/kaggle/input/lish-moa/train_features.csv")

In [None]:
train_features.head()

In [None]:
train_targets_scored = pd.read_csv("/kaggle/input/lish-moa/train_targets_scored.csv")

None of the ctl_vehicle rows have an MoA because there was no treatment (mentioned in the data description). I am going to use all of them for training (I should also have taken the ones from the test data), because I can use them to normalize the treatments in the different dose and time conditions.

I calculated 50 principal components (PCs) each for the scaled gene-expression values and the scaled cell-viability values. The mean vector of PC values for the ctl_vehicle rows in each dose+time condition was then subtracted from the treatments in that condition, and the results were converted to unit vectors so that the magnitude of a change would matter less than its direction. These 100 PCA dimensions per treatment (50 gene + 50 cell viability) were then used to train a fully connected neural network.

In [None]:
train_features.shape

In [None]:
train_features['cp_type'].value_counts()

In [None]:
train_features['sig_id'].value_counts() # all different IDs

In [None]:
# Sanity check: total the target labels of every vehicle-control row and tabulate the totals.
# numeric_only=True keeps the non-numeric `sig_id` column out of the row sum; newer pandas
# versions raise instead of silently skipping it.
train_targets_scored[train_features['cp_type']=="ctl_vehicle"].sum(axis=1, numeric_only=True).value_counts()

In [None]:
ctl_train_features = train_features[train_features['cp_type']=="ctl_vehicle"]

In [None]:
# Keep only the target rows that belong to real treatments. Rows are matched on `sig_id`
# rather than with a positional boolean mask, so the cell gives the same result when it is
# re-run after `train_features` itself has already been filtered.
treated_ids = train_features.loc[train_features['cp_type'] != "ctl_vehicle", 'sig_id']
train_targets_scored = train_targets_scored[train_targets_scored['sig_id'].isin(treated_ids)]

In [None]:
train_features = train_features[train_features['cp_type'] != "ctl_vehicle"]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the treated samples as a validation split; features and targets are split
# together so their rows stay paired, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(train_features, train_targets_scored, test_size=0.20, random_state=11)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
X_train_with_ctl = pd.concat([X_train,ctl_train_features],axis=0)

In [None]:
X_train.shape

In [None]:
X_train_with_ctl.shape

In [None]:
# Standardise the feature columns of the treated training rows plus all vehicle controls.
# The first four columns are skipped (presumably sig_id, cp_type, cp_time, cp_dose —
# confirm against train_features.head()).
feature_block = X_train_with_ctl.iloc[:, 4:]
scaler = StandardScaler().fit(feature_block)
X_train_scaled = scaler.transform(feature_block)

In [None]:
# Compress the scaled gene block into 50 principal components. Every column except the
# last 100 is treated as a gene feature (the final 100 columns are assumed to be the
# cell-viability readings — TODO confirm).
# random_state pins the randomized SVD solver that scikit-learn may select for a matrix
# of this size, so the components are reproducible across kernel restarts.
pca_genes = PCA(n_components=50, random_state=11)
pca_genes.fit(X_train_scaled[:,0:-100])

In [None]:
pca_genes.explained_variance_ratio_

In [None]:
# Cumulative share of variance captured by the gene PCA as components are added.
fig, ax = plt.subplots()
ax.plot(np.cumsum(pca_genes.explained_variance_ratio_))
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance');

In [None]:
# Scree plot: variance share explained by each of the 50 gene components.
fig, ax = plt.subplots()
ax.plot(np.arange(50), pca_genes.explained_variance_ratio_)
ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Proportion of Variance Explained');

In [None]:
pca_genes_matrix = pca_genes.transform(X_train_scaled[:,0:-100])

In [None]:
pca_genes_matrix.shape

In [None]:
# Compress the last 100 scaled columns (assumed to be the cell-viability readings —
# TODO confirm) into 50 principal components.
# random_state pins the randomized SVD solver that scikit-learn may select here, so the
# components are reproducible across kernel restarts.
pca_viability = PCA(n_components=50, random_state=11)
pca_viability.fit(X_train_scaled[:,-100:])

In [None]:
pca_viability.explained_variance_ratio_

In [None]:
# Cumulative share of variance captured by the viability PCA as components are added.
fig, ax = plt.subplots()
ax.plot(np.cumsum(pca_viability.explained_variance_ratio_))
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance');

In [None]:
# Scree plot: variance share explained by each of the 50 viability components.
fig, ax = plt.subplots()
ax.plot(np.arange(50), pca_viability.explained_variance_ratio_)
ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Proportion of Variance Explained');

In [None]:
pca_viability_matrix = pca_viability.transform(X_train_scaled[:,-100:])

In [None]:
# How the vehicle-control rows are distributed over the dose levels.
X_train_with_ctl.loc[X_train_with_ctl["cp_type"] == "ctl_vehicle", 'cp_dose'].value_counts()

In [None]:
# How the vehicle-control rows are distributed over the cp_time values.
X_train_with_ctl.loc[X_train_with_ctl["cp_type"] == "ctl_vehicle", 'cp_time'].value_counts()

In [None]:
# How the non-control (treated) rows are distributed over the dose levels.
X_train_with_ctl.loc[X_train_with_ctl["cp_type"] != "ctl_vehicle", 'cp_dose'].value_counts()

In [None]:
# How the non-control (treated) rows are distributed over the cp_time values.
X_train_with_ctl.loc[X_train_with_ctl["cp_type"] != "ctl_vehicle", 'cp_time'].value_counts()

In [None]:
# Replace the row labels with a fresh 0..n-1 range so that label-based selections made
# below can double as positional indices into the PCA matrices (whose rows follow this
# frame's order). The old labels are kept as a new leading `index` column, which later
# cells rely on (`index_x` / `index_y` after the merges).
# NOTE(review): this mutates the frame in place, so the cell is not idempotent —
# re-running it inserts another column and shifts every positional `iloc` slice,
# including the `iloc[:,4:]` used for scaling.
X_train_with_ctl.reset_index(inplace=True)

In [None]:
# Row labels of the vehicle controls recorded with cp_dose D1 and cp_time 48.
D1_48 = X_train_with_ctl.index[
    X_train_with_ctl["cp_type"].eq("ctl_vehicle")
    & X_train_with_ctl["cp_time"].eq(48)
    & X_train_with_ctl['cp_dose'].eq('D1')
]

In [None]:
# Row labels of the vehicle controls recorded with cp_dose D1 and cp_time 72.
D1_72 = X_train_with_ctl.index[
    X_train_with_ctl["cp_type"].eq("ctl_vehicle")
    & X_train_with_ctl["cp_time"].eq(72)
    & X_train_with_ctl['cp_dose'].eq('D1')
]

In [None]:
# Row labels of the vehicle controls recorded with cp_dose D1 and cp_time 24.
D1_24 = X_train_with_ctl.index[
    X_train_with_ctl["cp_type"].eq("ctl_vehicle")
    & X_train_with_ctl["cp_time"].eq(24)
    & X_train_with_ctl['cp_dose'].eq('D1')
]

In [None]:
# Row labels of the vehicle controls recorded with cp_dose D2 and cp_time 72.
D2_72 = X_train_with_ctl.index[
    X_train_with_ctl["cp_type"].eq("ctl_vehicle")
    & X_train_with_ctl["cp_time"].eq(72)
    & X_train_with_ctl['cp_dose'].eq('D2')
]

In [None]:
# Row labels of the vehicle controls recorded with cp_dose D2 and cp_time 48.
D2_48 = X_train_with_ctl.index[
    X_train_with_ctl["cp_type"].eq("ctl_vehicle")
    & X_train_with_ctl["cp_time"].eq(48)
    & X_train_with_ctl['cp_dose'].eq('D2')
]

In [None]:
# Row labels of the vehicle controls recorded with cp_dose D2 and cp_time 24.
D2_24 = X_train_with_ctl.index[
    X_train_with_ctl["cp_type"].eq("ctl_vehicle")
    & X_train_with_ctl["cp_time"].eq(24)
    & X_train_with_ctl['cp_dose'].eq('D2')
]

In [None]:
# Mean gene-PCA vector of the vehicle controls in each dose/time group; these serve as
# the per-condition baselines that are subtracted from the treated rows.
D1_72_gene_pc = np.mean(pca_genes_matrix[D1_72], axis=0)
D1_48_gene_pc = np.mean(pca_genes_matrix[D1_48], axis=0)
D1_24_gene_pc = np.mean(pca_genes_matrix[D1_24], axis=0)
D2_72_gene_pc = np.mean(pca_genes_matrix[D2_72], axis=0)
D2_48_gene_pc = np.mean(pca_genes_matrix[D2_48], axis=0)
D2_24_gene_pc = np.mean(pca_genes_matrix[D2_24], axis=0)

# The same baselines in the viability-PCA space.
D1_72_viability_pc = np.mean(pca_viability_matrix[D1_72], axis=0)
D1_48_viability_pc = np.mean(pca_viability_matrix[D1_48], axis=0)
D1_24_viability_pc = np.mean(pca_viability_matrix[D1_24], axis=0)
D2_72_viability_pc = np.mean(pca_viability_matrix[D2_72], axis=0)
D2_48_viability_pc = np.mean(pca_viability_matrix[D2_48], axis=0)
D2_24_viability_pc = np.mean(pca_viability_matrix[D2_24], axis=0)

In [None]:
def get_final_pcas(dose,time,control_gene_pc,control_viability_pc,
                   features=None,gene_pcs=None,viability_pcs=None):
    """Return control-corrected, unit-length PCA features for one dose/time group.

    Selects the non-control rows of ``features`` whose ``cp_dose`` / ``cp_time`` match,
    subtracts the supplied control vectors from their gene and viability PCA scores,
    and rescales every resulting row to unit Euclidean length.

    Parameters
    ----------
    dose : value compared against ``features['cp_dose']``.
    time : value compared against ``features['cp_time']``.
    control_gene_pc : vector subtracted from each selected gene-PCA row.
    control_viability_pc : vector subtracted from each selected viability-PCA row.
    features : DataFrame with ``cp_type``, ``cp_time`` and ``cp_dose`` columns.
        Defaults to the notebook-level ``X_train_with_ctl``.
    gene_pcs : 2-D array whose rows are in the same order as ``features``.
        Defaults to the notebook-level ``pca_genes_matrix``.
    viability_pcs : 2-D array whose rows are in the same order as ``features``.
        Defaults to the notebook-level ``pca_viability_matrix``.

    Returns
    -------
    pandas.DataFrame
        Gene columns followed by viability columns (both numbered from 0, so the
        labels repeat — use positional access), plus an ``index`` column holding the
        row labels of the selected rows in ``features``.

    Notes
    -----
    A row that coincides with its control vector has zero norm; it is returned as
    all zeros instead of NaN. Rows are located by position, so ``features`` does not
    need a 0..n-1 index for the lookup into the score matrices to be correct.
    """
    if features is None:
        features = X_train_with_ctl
    if gene_pcs is None:
        gene_pcs = pca_genes_matrix
    if viability_pcs is None:
        viability_pcs = pca_viability_matrix

    selected = ((features["cp_type"] != "ctl_vehicle")
                & (features["cp_time"] == time)
                & (features["cp_dose"] == dose)).to_numpy()
    indices = features.index[selected]   # labels, reported back to the caller
    rows = np.flatnonzero(selected)      # positions, used to index the numpy matrices

    def _unit_directions(scores, control):
        # Shift the selected rows by the control vector, then scale each to unit length.
        shifted = np.asarray(scores, dtype=float)[rows] - np.asarray(control, dtype=float)
        norms = np.linalg.norm(shifted, axis=1, keepdims=True)
        unit = np.zeros_like(shifted)
        # Zero-norm rows are skipped by `where` and therefore stay at zero.
        np.divide(shifted, norms, out=unit, where=norms > 0)
        return pd.DataFrame(unit)

    final_pcas = pd.concat([_unit_directions(gene_pcs, control_gene_pc),
                            _unit_directions(viability_pcs, control_viability_pc)],
                           axis=1)
    final_pcas['index'] = indices

    return final_pcas
    

In [None]:
# Build the control-corrected, unit-length PCA features of the treated training rows,
# one frame per dose/time group, each corrected with that group's control means.
D1_72_final_pcas=get_final_pcas('D1',72,D1_72_gene_pc,D1_72_viability_pc)
D1_48_final_pcas=get_final_pcas('D1',48,D1_48_gene_pc,D1_48_viability_pc)
D1_24_final_pcas=get_final_pcas('D1',24,D1_24_gene_pc,D1_24_viability_pc)
D2_72_final_pcas=get_final_pcas('D2',72,D2_72_gene_pc,D2_72_viability_pc)
D2_48_final_pcas=get_final_pcas('D2',48,D2_48_gene_pc,D2_48_viability_pc)
D2_24_final_pcas=get_final_pcas('D2',24,D2_24_gene_pc,D2_24_viability_pc)

In [None]:
final_pcas = pd.concat([D1_72_final_pcas,D1_48_final_pcas,D1_24_final_pcas,D2_72_final_pcas,D2_48_final_pcas,D2_24_final_pcas],axis=0)

In [None]:
final_pcas.shape

In [None]:
X_train_with_ctl.head(8)

In [None]:
X_train_with_ctl.index

In [None]:
X_train_final = pd.merge(final_pcas,X_train_with_ctl,how="left",left_on='index',right_index=True)

In [None]:
X_train_final.shape

In [None]:
y_train.shape

In [None]:
# Keep the first 103 columns of the merged frame: the 100 PCA features followed by
# `index_x`, `index_y` and (presumably) `sig_id`, which the later merge with the
# targets needs. The raw feature columns after that are discarded.
# NOTE(review): the name is rebound to a slice of itself, so running this cell out of
# order (e.g. after the drop below) selects different columns.
X_train_final = X_train_final.iloc[:,0:103]

In [None]:
# Discard the two bookkeeping columns produced by the merge (`index_x`, `index_y`).
# Rebinding the name instead of mutating in place avoids modifying an `iloc` slice of
# the merged frame, and errors='ignore' lets the cell be re-run without a KeyError.
X_train_final = X_train_final.drop(columns=['index_x','index_y'], errors='ignore')

In [None]:
X_train_y_final = pd.merge(X_train_final,y_train,how="inner",left_on="sig_id",right_on="sig_id")

In [None]:
X_train_y_final.iloc[:,0:103].head()

In [None]:
X_train_final = X_train_y_final.iloc[:,0:100]

In [None]:
# Everything after column 100 is kept as the training targets: columns 0-99 are the PCA
# features and column 100 is presumably `sig_id` (the merge key), which is skipped.
y_train_final = X_train_y_final.iloc[:,101:]

In [None]:
y_train_final.head()

In [None]:
X_test_scaled = scaler.transform(X_test.iloc[:,4:])

In [None]:
X_test_pca_viability = pca_viability.transform(X_test_scaled[:,-100:])

In [None]:
X_test_pca_gene = pca_genes.transform(X_test_scaled[:,:-100])

In [None]:
# Relabel the validation rows 0..n-1 so that row labels can double as positional indices
# into X_test_pca_gene / X_test_pca_viability, which were computed from X_test in this
# same row order. The old labels are kept as a new leading `index` column.
# NOTE(review): this mutates the frame in place and is not idempotent — it must run after
# the `X_test.iloc[:,4:]` scaling cell and must not be executed twice.
X_test.reset_index(inplace=True)

In [None]:
X_test.head()

In [None]:
def get_final_pcas_test(dose,time,control_gene_pc,control_viability_pc,
                        features=None,gene_pcs=None,viability_pcs=None):
    """Return control-corrected, unit-length PCA features for one validation group.

    Selects the non-control rows of ``features`` whose ``cp_dose`` / ``cp_time`` match,
    subtracts the supplied control vectors from their gene and viability PCA scores,
    and rescales every resulting row to unit Euclidean length.

    Parameters
    ----------
    dose : value compared against ``features['cp_dose']``.
    time : value compared against ``features['cp_time']``.
    control_gene_pc : vector subtracted from each selected gene-PCA row.
    control_viability_pc : vector subtracted from each selected viability-PCA row.
    features : DataFrame with ``cp_type``, ``cp_time`` and ``cp_dose`` columns.
        Defaults to the notebook-level ``X_test``.
    gene_pcs : 2-D array whose rows are in the same order as ``features``.
        Defaults to the notebook-level ``X_test_pca_gene``.
    viability_pcs : 2-D array whose rows are in the same order as ``features``.
        Defaults to the notebook-level ``X_test_pca_viability``.

    Returns
    -------
    pandas.DataFrame
        Gene columns followed by viability columns (both numbered from 0, so the
        labels repeat — use positional access), plus an ``index`` column holding the
        row labels of the selected rows in ``features``.

    Notes
    -----
    A row that coincides with its control vector has zero norm; it is returned as
    all zeros instead of NaN. Rows are located by position, so ``features`` does not
    need a 0..n-1 index for the lookup into the score matrices to be correct.
    """
    if features is None:
        features = X_test
    if gene_pcs is None:
        gene_pcs = X_test_pca_gene
    if viability_pcs is None:
        viability_pcs = X_test_pca_viability

    selected = ((features["cp_type"] != "ctl_vehicle")
                & (features["cp_time"] == time)
                & (features["cp_dose"] == dose)).to_numpy()
    indices = features.index[selected]   # labels, reported back to the caller
    rows = np.flatnonzero(selected)      # positions, used to index the numpy matrices

    def _unit_directions(scores, control):
        # Shift the selected rows by the control vector, then scale each to unit length.
        shifted = np.asarray(scores, dtype=float)[rows] - np.asarray(control, dtype=float)
        norms = np.linalg.norm(shifted, axis=1, keepdims=True)
        unit = np.zeros_like(shifted)
        # Zero-norm rows are skipped by `where` and therefore stay at zero.
        np.divide(shifted, norms, out=unit, where=norms > 0)
        return pd.DataFrame(unit)

    final_pcas = pd.concat([_unit_directions(gene_pcs, control_gene_pc),
                            _unit_directions(viability_pcs, control_viability_pc)],
                           axis=1)
    final_pcas['index'] = indices

    return final_pcas
    

In [None]:
# Apply the same correction to the held-out validation rows, one frame per dose/time
# group. The control means passed in are the ones computed from the training controls.
D1_72_final_pcas_test=get_final_pcas_test('D1',72,D1_72_gene_pc,D1_72_viability_pc)
D1_48_final_pcas_test=get_final_pcas_test('D1',48,D1_48_gene_pc,D1_48_viability_pc)
D1_24_final_pcas_test=get_final_pcas_test('D1',24,D1_24_gene_pc,D1_24_viability_pc)
D2_72_final_pcas_test=get_final_pcas_test('D2',72,D2_72_gene_pc,D2_72_viability_pc)
D2_48_final_pcas_test=get_final_pcas_test('D2',48,D2_48_gene_pc,D2_48_viability_pc)
D2_24_final_pcas_test=get_final_pcas_test('D2',24,D2_24_gene_pc,D2_24_viability_pc)

In [None]:
final_pcas_test = pd.concat([D1_72_final_pcas_test,D1_48_final_pcas_test,D1_24_final_pcas_test,D2_72_final_pcas_test,D2_48_final_pcas_test,D2_24_final_pcas_test],axis=0)

In [None]:
X_test_final = pd.merge(final_pcas_test,X_test,how="left",left_on='index',right_index=True)

In [None]:
# Discard the two bookkeeping columns produced by the merge (`index_x`, `index_y`).
# Rebinding the name instead of mutating in place keeps the cell re-runnable:
# errors='ignore' means a second execution is a no-op rather than a KeyError.
X_test_final = X_test_final.drop(columns=['index_x','index_y'], errors='ignore')

In [None]:
X_y_test_final = pd.merge(X_test_final,y_test,how="inner",left_on="sig_id",right_on="sig_id")

In [None]:
X_test_final = X_y_test_final.iloc[:,0:100]

In [None]:
y_test_final = X_y_test_final.iloc[:,101:]

In [None]:
# sig_id of every validation treatment, in the same row order as X_test_final and
# y_test_final. The column is selected by name: a bare `.iloc[100]` would return the
# single row at position 100, not the id column.
test_treat_ids = X_y_test_final["sig_id"]

In [None]:
# Keep only the last 206 columns (presumably the scored targets, matching the 206-unit
# output layer). Unlike the training frame, the validation merge was not trimmed to its
# first 103 columns, so the `iloc[:,101:]` slice still carries the raw feature columns.
y_test_final = y_test_final.iloc[:,-206:]

In [None]:
X_train_final.shape

In [None]:
X_test_final.shape

In [None]:
weights = dict(y_train_final.sum(axis=0))

In [None]:
# Turn the per-label positive counts into relative weights keyed by output position
# (0..n_labels-1), each expressed as a multiple of the smallest non-zero count.
# Building a fresh dict straight from the counts keeps the cell idempotent (swapping
# the keys of `weights` in place would empty it on a second run), computes the minimum
# once, and follows however many label columns `y_train_final` has.
label_counts = y_train_final.sum(axis=0)
# A label with no positives in the training split would make the minimum 0 and turn
# every weight into inf/nan, so only positive counts are considered for the scale.
min_positive_count = label_counts[label_counts > 0].min()
weights = {position: count / min_positive_count
           for position, count in enumerate(label_counts)}

In [None]:
weights

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

In [None]:
# Fully connected network: four ReLU hidden layers (100-50-100-100 units), each followed
# by 10% dropout, feeding a 206-unit sigmoid output layer.
model = Sequential()
for hidden_units in (100, 50, 100, 100):
    model.add(Dense(units=hidden_units,
                    activation="relu",
                    use_bias=True,
                    kernel_initializer="glorot_uniform"))
    model.add(Dropout(0.1))

model.add(Dense(units=206))
# Sigmoid scores every label independently, so a sample may activate several labels or
# none; softmax would force the 206 outputs to compete and sum to 1.
model.add(Activation('sigmoid'))

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Stop once the validation loss has failed to improve for 10 consecutive epochs.
# Validation data is passed to `model.fit`, so `val_loss` is available; watching the
# training loss instead would keep training while the model overfits.
callback = EarlyStopping(monitor='val_loss', patience=10)

In [None]:
from keras.optimizers import Adam

In [None]:
# The model only seemed to converge with a very low learning rate.
# `learning_rate` is the supported keyword; the older `lr` alias is deprecated and
# rejected by recent Keras releases.
optimizer = Adam(learning_rate=0.000001)

In [None]:
from keras.losses import KLDivergence

In [None]:
# Binary cross-entropy treats each of the sigmoid outputs as its own yes/no problem,
# which suits targets where a row can carry several labels or none.
# KL divergence expects each target row to be a probability distribution: its
# y_true * log(y_true / y_pred) terms vanish wherever the label is 0, so false
# positives are never penalised on multi-hot targets.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [None]:
# NOTE: open question from the author — how to keep the notebook web page from reloading
# during a long, resource-heavy training run?

# Train on the control-corrected PCA features, evaluating the held-out split after every
# epoch; `callback` may end training before the 300-epoch cap.
# NOTE(review): `weights` grows with label frequency, and Keras `class_weight` is aimed
# at single-label targets — confirm this weighting has the intended effect on
# multi-hot targets.
model.fit(X_train_final, y_train_final, epochs=300, batch_size=3,
              validation_data=(X_test_final, y_test_final),class_weight=weights,
         verbose=True, callbacks=[callback])

In [None]:
test_features = pd.read_csv("/kaggle/input/lish-moa/test_features.csv")

In [None]:
test_features.head()

In [None]:
test_scaled = scaler.transform(test_features.iloc[:,4:])

In [None]:
test_pca_viability = pca_viability.transform(test_scaled[:,-100:])

In [None]:
test_pca_gene = pca_genes.transform(test_scaled[:,0:-100])

In [None]:
test_features.reset_index(inplace=True)

In [None]:
def get_final_pcas_final_test(dose,time,control_gene_pc,control_viability_pc,
                              features=None,gene_pcs=None,viability_pcs=None):
    """Return control-corrected, unit-length PCA features for one submission-set group.

    Selects the non-control rows of ``features`` whose ``cp_dose`` / ``cp_time`` match,
    subtracts the supplied control vectors from their gene and viability PCA scores,
    and rescales every resulting row to unit Euclidean length.

    Parameters
    ----------
    dose : value compared against ``features['cp_dose']``.
    time : value compared against ``features['cp_time']``.
    control_gene_pc : vector subtracted from each selected gene-PCA row.
    control_viability_pc : vector subtracted from each selected viability-PCA row.
    features : DataFrame with ``cp_type``, ``cp_time`` and ``cp_dose`` columns.
        Defaults to the notebook-level ``test_features``.
    gene_pcs : 2-D array whose rows are in the same order as ``features``.
        Defaults to the notebook-level ``test_pca_gene``.
    viability_pcs : 2-D array whose rows are in the same order as ``features``.
        Defaults to the notebook-level ``test_pca_viability``.

    Returns
    -------
    pandas.DataFrame
        Gene columns followed by viability columns (both numbered from 0, so the
        labels repeat — use positional access), plus an ``index`` column holding the
        row labels of the selected rows in ``features``.

    Notes
    -----
    A row that coincides with its control vector has zero norm; it is returned as
    all zeros instead of NaN. Rows are located by position, so ``features`` does not
    need a 0..n-1 index for the lookup into the score matrices to be correct.
    """
    if features is None:
        features = test_features
    if gene_pcs is None:
        gene_pcs = test_pca_gene
    if viability_pcs is None:
        viability_pcs = test_pca_viability

    selected = ((features["cp_type"] != "ctl_vehicle")
                & (features["cp_time"] == time)
                & (features["cp_dose"] == dose)).to_numpy()
    indices = features.index[selected]   # labels, reported back to the caller
    rows = np.flatnonzero(selected)      # positions, used to index the numpy matrices

    def _unit_directions(scores, control):
        # Shift the selected rows by the control vector, then scale each to unit length.
        shifted = np.asarray(scores, dtype=float)[rows] - np.asarray(control, dtype=float)
        norms = np.linalg.norm(shifted, axis=1, keepdims=True)
        unit = np.zeros_like(shifted)
        # Zero-norm rows are skipped by `where` and therefore stay at zero.
        np.divide(shifted, norms, out=unit, where=norms > 0)
        return pd.DataFrame(unit)

    final_pcas = pd.concat([_unit_directions(gene_pcs, control_gene_pc),
                            _unit_directions(viability_pcs, control_viability_pc)],
                           axis=1)
    final_pcas['index'] = indices

    return final_pcas
    

In [None]:
# Apply the same correction to the competition test rows, one frame per dose/time group.
# The control means passed in are the ones computed from the training controls.
D1_72_final_pcas_val=get_final_pcas_final_test('D1',72,D1_72_gene_pc,D1_72_viability_pc)
D1_48_final_pcas_val=get_final_pcas_final_test('D1',48,D1_48_gene_pc,D1_48_viability_pc)
D1_24_final_pcas_val=get_final_pcas_final_test('D1',24,D1_24_gene_pc,D1_24_viability_pc)
D2_72_final_pcas_val=get_final_pcas_final_test('D2',72,D2_72_gene_pc,D2_72_viability_pc)
D2_48_final_pcas_val=get_final_pcas_final_test('D2',48,D2_48_gene_pc,D2_48_viability_pc)
D2_24_final_pcas_val=get_final_pcas_final_test('D2',24,D2_24_gene_pc,D2_24_viability_pc)

In [None]:
final_pcas_test = pd.concat([D1_72_final_pcas_val,D1_48_final_pcas_val,D1_24_final_pcas_val,D2_72_final_pcas_val,D2_48_final_pcas_val,D2_24_final_pcas_val],axis=0)

In [None]:
test_final = pd.merge(final_pcas_test,test_features,how="left",left_on='index',right_index=True)

In [None]:
test_final = test_final.iloc[:,0:100]

In [None]:
test_final.shape

In [None]:
preds = model.predict(test_final)

In [None]:
submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [None]:
x = pd.merge(final_pcas_test,test_features,how="left",left_on='index',right_index=True)

In [None]:
# sig_id of every predicted row, in prediction order (it sits right after the 100 PCA
# columns and the two merge bookkeeping columns).
ids = x["sig_id"]

In [None]:
preds = pd.DataFrame(preds)

In [None]:
preds.columns = y_train_final.columns

In [None]:
preds.sum(axis=1).sort_values()

In [None]:
preds.shape

In [None]:
len(ids)

In [None]:
preds.reset_index(inplace=True)

In [None]:
preds = preds.reindex(columns=submission.columns)

In [None]:
ids=list(ids)

In [None]:
preds["sig_id"]=ids

In [None]:
preds.head()

In [None]:
preds.iloc[:,1:].apply(np.argmax,axis=1).value_counts()

In [None]:
preds.iloc[:,1:].apply(np.max,axis=1).sort_values()

In [None]:
test_features.iloc[:,1]

In [None]:
submission2 = pd.merge(test_features.iloc[:,1],preds,how="left",left_on="sig_id",right_on="sig_id")

In [None]:
submission2.head()

In [None]:
submission = submission2.fillna(0.0)

In [None]:
# Write the submission file. index=False keeps pandas from prepending the row index as an
# extra unnamed column, so the file contains only `sig_id` and the target columns.
submission.to_csv('submission.csv', index=False)

In [None]:
# For every target, count the submission rows predicted above 0.25, smallest count first.
(submission.drop(columns=['sig_id']) > 0.25).sum(axis=0).sort_values()