In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pandas_profiling
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
import random
# Seed both the stdlib and the NumPy global generators. scikit-learn falls back to
# NumPy's global state when no random_state is passed, so seeding `random` alone
# does not make NumPy-driven steps repeatable.
random.seed(123)
np.random.seed(123)


In [None]:
# Load the competition tables.
# Feature frames use the first CSV column as the index so every remaining column is a
# model input; the target frames keep it as an ordinary column.
train_features = pd.read_csv('../input/lish-moa/train_features.csv', index_col=0)
train_target_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
# Fixed: this previously re-read the *scored* targets under the non-scored name.
train_target_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
# Fixed: this previously re-read train_features.csv.
# NOTE(review): assumes the dataset version in use ships train_drug.csv - confirm.
train_drug = pd.read_csv('../input/lish-moa/train_drug.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv', index_col=0)
sample_sub = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# Preview the first rows of the training feature table.
train_features.head()

In [None]:
# Preview the first rows of the test feature table.
test_features.head()

In [None]:
# Preview the first rows of the scored target table.
train_target_scored.head()

In [None]:
# Preview the first rows of the frame bound to train_target_nonscored.
train_target_nonscored.head()

In [None]:
# (rows, columns) of the training feature table.
print(train_features.shape)


In [None]:
# (rows, columns) of the scored target table; the column count includes sig_id.
print(train_target_scored.shape)


### data_analysis

In [None]:
# Split the feature table into its "g-" and "c-" column groups by looking at the
# first two characters of every column name.
column_prefixes = train_features.columns.map(lambda name: name[:2])
train_gs = train_features.iloc[:, column_prefixes == 'g-']
train_cs = train_features.iloc[:, column_prefixes == 'c-']
for preview in (train_gs, train_cs):
    print(preview.head())

print("Gene expression data Number of columns: " + str(train_gs.shape[1]))
print("cell viability data Number of columns: " + str(train_cs.shape[1]))

We observe that the dataset consists of 772 columns for gene expression data and 100 columns for cell viability data. In addition, cp_type indicates the type of treatment: whether the sample was treated with a compound (trt_cp) or with a control perturbation (ctl_vehicle). Control perturbations have no MoAs. cp_time and cp_dose indicate treatment duration (24, 48, 72 hours) and dose (high or low).

In [None]:
# Pool every gene-expression and cell-viability value once (the concat was previously
# rebuilt three times) and summarise the pooled distribution.
combined_values = pd.concat([train_gs, train_cs], axis=1).values
print("Mean:"+str(combined_values.mean()))
print("Std:"+str(combined_values.std()))
plt.figure(figsize=(5,5))
sns.distplot(combined_values)
plt.title('combined gene expression and cell viability')

# One panel per example feature. The title is taken from the plotted column itself,
# which fixes the panel that plotted 'c-1' under the title 'c-0'.
example_features = [(train_gs, 'g-0'), (train_gs, 'g-100'), (train_cs, 'c-1'), (train_cs, 'c-80')]
plt.figure(figsize=(12,12))
for panel, (frame, feature) in enumerate(example_features, start=1):
    plt.subplot(2,2,panel)
    sns.distplot(frame[feature],color='pink')
    plt.title(feature)

It is observed that all the gene expression and cell viability columns fit a normal distribution. Also, the cell viability data appears to be clipped at -10, and the gene expression data at -10 and 10. This is evident from the peaks observed in the distribution graph.

In [None]:
# Print mean / std / max / min over all values of each measurement family.
for heading, family in (("Gene expression data statistics: ", train_gs),
                        ('\nCell viability data statistics: ', train_cs)):
    family_values = family.values
    print(heading)
    print("  Mean: " + str(family_values.mean()))
    print("  Std: " + str(family_values.std()))
    print("  Max: " + str(family_values.max()))
    print("  Min: " + str(family_values.min()))

In [None]:
# Count plots for the three categorical treatment descriptors.
# The plotted column is passed with the explicit x= keyword: newer seaborn releases
# interpret the first positional argument as `data`, not as the x variable.
plt.figure(figsize=(5,12))
plt.subplot(3,1,1)
splot = sns.countplot(x=train_features["cp_type"])

# Write each bar's height just above the bar.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.1f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')
plt.title('cp_type')
plt.subplot(3,1,2)
sns.countplot(x=train_features['cp_time'],hue=train_features['cp_type'])
plt.title('cp_time vs cp_type')
plt.subplot(3,1,3)
sns.countplot(x=train_features['cp_dose'],hue=train_features['cp_type'])
plt.title('cp_dose vs cp_type')

# target_data_analysis

In [None]:
# sig_id is the sample identifier, not a target, so it is dropped before counting
# (shape[1] of the raw frame over-counted by one).
print("Number of scored targets: "+str(train_target_scored.drop('sig_id',axis=1).shape[1]))


39.3% of the training samples have 0 MoA activations. A majority of samples (52.6%) have one MoA activation. At most, a sample has 7 MoA activations.

In [None]:
# For every sample count how many target columns equal 1, then tally how many samples
# share each count. `out` maps "number of active targets" -> "number of samples".
arr = train_target_scored.drop('sig_id',axis=1).values==1
moa_counts, sample_counts = np.unique(arr.sum(axis=1), return_counts=True)
out = dict(zip(moa_counts.tolist(), sample_counts.tolist()))
# Number of samples, taken from the data instead of the previously hard-coded 23814.
length = len(arr)
plt.figure(figsize=(7,7))
splot = sns.barplot(x=list(out.keys()),y=list(out.values()))
# Label each bar with its share of all samples.
for p in splot.patches:
    splot.annotate(format(p.get_height()*100/length, '.1f')+'%', 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')
plt.xlabel('Number of MoAs in sample')
plt.ylabel('Count')
plt.title('Percentage of samples with MoA counts')

In [None]:
# Pairwise correlation between the target columns (sig_id removed), drawn as a heatmap.
scored_targets_only = train_target_scored.drop(columns='sig_id')
cor = scored_targets_only.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(cor)

It is observed that most MoA targets are not at all correlated, with the exception of a few pairs.


In [None]:
# List every ordered pair of distinct columns of `cor` whose correlation is >= 0.7.
# Built in one vectorised pass instead of a nested Python loop that called pd.concat
# once per hit (quadratic copying).
corr_values = cor.to_numpy()
strong = corr_values >= 0.7
# Exclude self-correlations by position; the old `!= 1.0` test also hid distinct
# columns that happen to be perfectly correlated.
np.fill_diagonal(strong, False)
# Transposing makes nonzero() walk column by column, preserving the original
# "outer loop over j, inner loop over i" row order.
col_idx, row_idx = np.nonzero(strong.T)
df = pd.DataFrame({
    'drug_a': cor.columns[col_idx],
    'drug_b': cor.columns[row_idx],
    'corr': corr_values[row_idx, col_idx],
})
df

### Feature Relationships

Looking at the plot for cell viability data of a sample, it does not appear to be completely random and thus might have interdependence. Thus sequential models may be used to process the cell viability data.

Note: The above conclusion is purely based on speculation. A verification from someone with domain knowledge is appreciated

In the case of the gene expression profile, in the work published in the American Society for Microbiology journal, Hutter et al. conclude that after treatment with a particular sample, a linear SVM may be used to classify the MoA based on the gene expression profile data. A linear SVM per class of MoA was used in their work. In our case, however, the number of scored MoA classes is large. Therefore, a deep neural net might perform well in predicting the MoA classes.

In [None]:
# Column-name lists for the two measurement families, selected by name prefix.
# Iterating a DataFrame yields its column labels.
genes = list(filter(lambda col: col.startswith('g-'), train_features))
cells = list(filter(lambda col: col.startswith('c-'), train_features))

# Genes correlation

In [None]:
# Heatmap of the pairwise correlations among the "g-" columns.
gene_corr = train_features[genes].corr()
plt.figure(figsize=(8, 8))
sns.heatmap(gene_corr, cmap='viridis')

# Cell correlations

In [None]:
# Heatmap of the pairwise correlations among the "c-" columns.
cell_corr = train_features[cells].corr()
plt.figure(figsize=(8, 8))
sns.heatmap(cell_corr, cmap='viridis')

Many cell viabilities have high correlation with each other

In [None]:
# Line plot of every "c-" value for the row at position 1 (the second sample).
second_sample_viability = train_cs.iloc[1]
plt.plot(second_sample_viability)
plt.title('cell viability data for second sample')

In [None]:
# Pairwise correlation between the "g-" columns.
# NOTE: this rebinds `cor`, which until this cell held the target-correlation matrix.
cor = train_gs.corr()

It is also observed that certain gene pairs have high negative correlation


In [None]:
# List every ordered pair of columns of `cor` whose correlation is <= -0.8.
# Built in one vectorised pass instead of a nested Python loop that called pd.concat
# once per hit. The old extra `!= 1.0` test could never matter for values <= -0.8.
corr_values = cor.to_numpy()
# Transposing makes nonzero() walk column by column, preserving the original
# "outer loop over j, inner loop over i" row order.
col_idx, row_idx = np.nonzero((corr_values <= -0.8).T)
df = pd.DataFrame({
    'gene_a': cor.columns[col_idx],
    'gene_b': cor.columns[row_idx],
    'corr': corr_values[row_idx, col_idx],
})
df

Some gene expressions coloured with respect to treatment type: cyan represents ctl_vehicle and orange trt_cp

In [None]:
# Compare a few "g-" feature distributions between compound-treated rows (trt_cp)
# and all remaining rows (the control group).
cp_1 = train_features[train_features['cp_type']=='trt_cp']
cp_2 = train_features[train_features['cp_type']!='trt_cp']
plt.figure(figsize=(10,10))
# One panel per feature (replaces four copy-pasted blocks); the legend makes each
# figure readable without the surrounding text.
for panel, feature in enumerate(['g-0', 'g-100', 'g-500', 'g-600'], start=1):
    plt.subplot(2,2,panel)
    sns.distplot(cp_1[feature],color='orange',hist=False,label='trt_cp')
    sns.distplot(cp_2[feature],color='cyan',hist=False,label='ctl_vehicle')
    plt.title(feature)
    plt.legend(title='cp_type')

Cell viability for some samples coloured with respect to treatment type: cyan represents ctl_vehicle and orange trt_cp

In [None]:
# Compare a few "c-" feature distributions between compound-treated rows (trt_cp)
# and all remaining rows (the control group).
# The subsets are derived here so the cell does not depend on cp_1/cp_2 still holding
# what an earlier cell assigned.
treatment_groups = [
    (train_features[train_features['cp_type']=='trt_cp'], 'orange', 'trt_cp'),
    (train_features[train_features['cp_type']!='trt_cp'], 'cyan', 'ctl_vehicle'),
]
plt.figure(figsize=(10,10))
for panel, feature in enumerate(['c-1', 'c-20', 'c-40', 'c-50'], start=1):
    plt.subplot(2,2,panel)
    for subset, colour, group_label in treatment_groups:
        sns.distplot(subset[feature],color=colour,hist=False,label=group_label)
    plt.title(feature)
    plt.legend(title='cp_type')

With respect to treatment type, the gene expression distributions show slightly more variance than the corresponding cell viability distributions.

Some gene expressions coloured with respect to treatment duration: cyan represents 24, orange 48 and blue 72

In [None]:
# Compare a few "g-" feature distributions across the three treatment durations.
# Fixed: the 48h and 72h subsets were selected with `!=`, so each actually contained
# the two *other* durations.
cp_1 = train_features[train_features['cp_time']==24]
cp_2 = train_features[train_features['cp_time']==48]
cp_3 = train_features[train_features['cp_time']==72]
# Colours follow the accompanying caption: cyan = 24, orange = 48, blue = 72.
duration_groups = [(cp_1, 'cyan', '24'), (cp_2, 'orange', '48'), (cp_3, 'blue', '72')]
plt.figure(figsize=(10,10))
for panel, feature in enumerate(['g-0', 'g-100', 'g-500', 'g-600'], start=1):
    plt.subplot(2,2,panel)
    for subset, colour, group_label in duration_groups:
        sns.distplot(subset[feature],color=colour,hist=False,label=group_label)
    plt.title(feature)
    plt.legend(title='cp_time')

cell viability of some samples coloured with respect to treatment duration: cyan represents 24, orange 48 and blue 72

In [None]:
# Compare a few "c-" feature distributions across the three treatment durations.
# Each subset is selected here by exact duration, so the figure does not depend on
# what cp_1/cp_2/cp_3 currently hold. Colours follow the accompanying caption:
# cyan = 24, orange = 48, blue = 72.
duration_colours = [(24, 'cyan'), (48, 'orange'), (72, 'blue')]
plt.figure(figsize=(10,10))
for panel, feature in enumerate(['c-1', 'c-20', 'c-40', 'c-50'], start=1):
    plt.subplot(2,2,panel)
    for duration, colour in duration_colours:
        subset = train_features[train_features['cp_time']==duration]
        sns.distplot(subset[feature],color=colour,hist=False,label=str(duration))
    plt.title(feature)
    plt.legend(title='cp_time')

Cell viability and gene expression of some samples coloured with respect to dose: cyan represents D1 and orange D2

In [None]:
# Compare a few "g-" feature distributions between the two dose levels.
# Fixed: the subsets were selected with 'D0'/'D1', but the dose labels used by this
# notebook's own encoding table are 'D1' and 'D2', so the first subset was empty.
cp_1 = train_features[train_features['cp_dose']=='D1']
cp_2 = train_features[train_features['cp_dose']=='D2']
dose_groups = [(cp_1, 'cyan', 'D1'), (cp_2, 'orange', 'D2')]
plt.figure(figsize=(10,10))
for panel, feature in enumerate(['g-0', 'g-100', 'g-500', 'g-600'], start=1):
    plt.subplot(2,2,panel)
    for subset, colour, group_label in dose_groups:
        sns.distplot(subset[feature],color=colour,hist=False,label=group_label)
    plt.title(feature)
    plt.legend(title='cp_dose')

In [None]:
# Compare a few "c-" feature distributions between the two dose levels.
# Each subset is selected here by its dose label ('D1' / 'D2', the labels used by this
# notebook's encoding table), so the figure does not depend on what cp_1/cp_2 hold.
dose_colours = [('D1', 'cyan'), ('D2', 'orange')]
plt.figure(figsize=(10,10))
for panel, feature in enumerate(['c-1', 'c-20', 'c-40', 'c-50'], start=1):
    plt.subplot(2,2,panel)
    for dose, colour in dose_colours:
        subset = train_features[train_features['cp_dose']==dose]
        sns.distplot(subset[feature],color=colour,hist=False,label=dose)
    plt.title(feature)
    plt.legend(title='cp_dose')

# data_preprocessing

In [None]:
# Hold out 20% of the row positions for validation.
# The row count comes from the data (it was hard-coded as 23814) and random_state pins
# the shuffle so the same split is produced on every run.
train_idxs, val_idxs = train_test_split(
    list(range(len(train_features))), test_size=0.2, random_state=123
)
len(train_idxs), len(val_idxs)

In [None]:
# Show only the first few positions; displaying the whole list floods the output.
train_idxs[:10]

In [None]:
# Show only the first few positions; displaying the whole list floods the output.
val_idxs[:10]

In [None]:
# Materialise the split as independent copies: later cells assign new column values
# into these frames, which must not alias (or warn about aliasing) the source frames.
tra_features = train_features.iloc[train_idxs, :].copy()
val_features = train_features.iloc[val_idxs, :].copy()

tra_labels = train_target_scored.iloc[train_idxs, :].copy()
val_labels = train_target_scored.iloc[val_idxs, :].copy()

# Features and labels are paired purely by row position, so check that both frames
# really describe the same samples (features are indexed by sig_id, labels carry it
# as a column).
assert np.array_equal(tra_features.index.to_numpy(), tra_labels['sig_id'].to_numpy()), \
    "training features and labels are not row-aligned"
assert np.array_equal(val_features.index.to_numpy(), val_labels['sig_id'].to_numpy()), \
    "validation features and labels are not row-aligned"

len(tra_features), len(val_features), len(tra_labels), len(val_labels)

In [None]:
# handling cp_type
cp_type_dict = {
    "trt_cp": 0,
    "ctl_vehicle": 1
}

# Fixed: tra_features was never encoded (only the full frame and the validation split
# were), so the training split kept raw strings in this column.
# The full frame is still encoded too, for any cell that reads it.
# Note: re-running this cell maps already-encoded values to NaN.
train_features["cp_type"] = train_features.cp_type.map(cp_type_dict)
tra_features["cp_type"] = tra_features.cp_type.map(cp_type_dict)
val_features["cp_type"] = val_features.cp_type.map(cp_type_dict)

In [None]:
# handling cp_dose
cp_dose_dict = {
    "D1": 1,
    "D2": 2
}

# Fixed: tra_features was never encoded (only the full frame and the validation split
# were), so the training split kept raw strings in this column.
# The full frame is still encoded too, for any cell that reads it.
# Note: re-running this cell maps already-encoded values to NaN.
train_features["cp_dose"] = train_features.cp_dose.map(cp_dose_dict)
tra_features["cp_dose"] = tra_features.cp_dose.map(cp_dose_dict)
val_features["cp_dose"] = val_features.cp_dose.map(cp_dose_dict)

In [None]:
# handling cp_time
cp_time_dict = {
    24: 1,
    48: 2,
    72: 3
}

# Fixed: tra_features was never encoded (only the full frame and the validation split
# were), so the training split kept the raw 24/48/72 values while validation used 1/2/3.
# The full frame is still encoded too, for any cell that reads it.
# Note: re-running this cell maps already-encoded values to NaN.
train_features["cp_time"] = train_features.cp_time.map(cp_time_dict)
tra_features["cp_time"] = tra_features.cp_time.map(cp_time_dict)
val_features["cp_time"] = val_features.cp_time.map(cp_time_dict)
# Preview the encoded validation split (the former bare `train_features` line was a
# no-op: only a cell's last expression is displayed).
val_features.head()

# ML Model

In [None]:
# Every column other than the three treatment descriptors (and sig_id, should it be
# present) is treated as continuous.
non_continuous = ("cp_type", "cp_dose", "cp_time", "sig_id")
continuous_columns = [name for name in train_features.columns if name not in non_continuous]
print("Number of continuous columns are {}".format(len(continuous_columns)))

# Fit the scaler on the training split only, then apply it to both splits.
train_continuous_columns_df = tra_features[continuous_columns].copy()
standard_scaler_object = StandardScaler()
standard_scaler_object.fit(train_continuous_columns_df.values)
train_continuous_columns_df = standard_scaler_object.transform(train_continuous_columns_df.values)

val_continuous_columns_df = standard_scaler_object.transform(
    val_features[continuous_columns].copy().values
)

# Write the scaled arrays back over the original columns.
tra_features[continuous_columns] = train_continuous_columns_df
val_features[continuous_columns] = val_continuous_columns_df

In [None]:
# Names of the target columns to model. tra_labels also carries the sig_id identifier
# column, which is not a target and must not get a classifier of its own.
all_categories = [column for column in tra_labels.columns if column != "sig_id"]
len(all_categories)

In [None]:
# Number of missing values per column in the training split.
missing_tra = tra_features.isna().sum()
missing_tra

In [None]:
# Number of missing values per column in the validation split.
missing_val = val_features.isna().sum()
missing_val

In [None]:
# Training split with every column that contains a missing value removed.
tra_without_missing_values = tra_features.dropna(axis='columns')
tra_without_missing_values

In [None]:
# Validation split with every column that contains a missing value removed.
val_without_missing_values = val_features.dropna(axis='columns')
val_without_missing_values

In [None]:
 # Registry of fitted models, keyed by target column name (filled by the training loop).
 model_dict={}

In [None]:
# Train one independent logistic-regression classifier per target column.
# Fixed: the models were fitted on train_features, the full frame, whose rows neither
# match tra_labels in number nor exclude the validation samples. tra_features is the
# split that lines up row-for-row with tra_labels.
for category in tqdm(all_categories):
    if category == "sig_id":
        # Identifier column, not a target.
        continue
    # NOTE(review): LogisticRegression needs both classes present in the training
    # labels; a target with no positive sample in this split would raise - confirm.
    logistic_model = LogisticRegression(max_iter=5000)
    logistic_model.fit(tra_features, tra_labels[category])

    # saving model
    model_dict[category] = logistic_model

In [None]:
def calculate_score(models_dict, val_features, val_labels, all_categories):
    """Return the mean log loss over ``all_categories``.

    For every category the matching model in ``models_dict`` predicts class
    probabilities for ``val_features``; those are scored against the
    ``val_labels[category]`` column with ``log_loss`` using labels ``[0, 1]``.
    The per-category losses are then averaged with equal weight.

    Raises ``ZeroDivisionError`` when ``all_categories`` is empty.
    """
    def _category_loss(category):
        # Look up this category's fitted model and score its probabilities.
        probabilities = models_dict[category].predict_proba(val_features)
        return log_loss(val_labels[category], probabilities, labels=[0, 1])

    losses = [_category_loss(category) for category in tqdm(all_categories)]
    return float(sum(losses)) / len(losses)

In [None]:
# Mean per-target log loss of the fitted models on the validation split.
val_score = calculate_score(model_dict, val_features, val_labels, all_categories)
print("Validation score on validation set is {}".format(val_score))

In [None]:
# Apply the same three label encodings to the test table.
test_encodings = (
    ("cp_type", cp_type_dict),
    ("cp_dose", cp_dose_dict),
    ("cp_time", cp_time_dict),
)
for descriptor, encoding in test_encodings:
    test_features[descriptor] = test_features[descriptor].map(encoding)

In [None]:
# Scale the continuous test columns with the already-fitted scaler and write the
# result back over the original columns.
test_continuous_columns_df = standard_scaler_object.transform(
    test_features[continuous_columns].to_numpy()
)
test_features[continuous_columns] = test_continuous_columns_df

In [None]:
# Re-read the raw test table, this time without index_col, so sig_id stays an ordinary
# column. The values in this frame are neither label-encoded nor scaled.
test_features_df = pd.read_csv("/kaggle/input/lish-moa/test_features.csv")
print(test_features_df.shape)
test_features_df.head()

In [None]:
# Predict the positive-class probability of every target for every test sample.
# Fixed: predictions were made on the raw re-read test table (string categories,
# unscaled values) instead of the encoded and scaled `test_features` frame.
target_columns = [column for column in all_categories if column != "sig_id"]
test_probabilities = {
    # Column 1 of predict_proba is the probability of class 1; plain `[:, 1]` gives the
    # 1-D vector a DataFrame column expects.
    category: model_dict[category].predict_proba(test_features)[:, 1]
    for category in tqdm(target_columns)
}
# Build the frame in one go; adding ~200 columns one at a time fragments it.
predictions_df = pd.DataFrame(test_probabilities)
# test_features is indexed by sig_id, so ids and predictions come from the same rows.
predictions_df.insert(0, "sig_id", test_features.index.to_numpy())

In [None]:
# Keep five decimals. Rounding to a single decimal turned every probability below 0.05
# into exactly 0.0, discarding almost all of the model's signal.
predictions_df = predictions_df.round(5)
predictions_df.head()

In [None]:
# (rows, columns) of the prediction table.
predictions_df.shape


In [None]:
# Force every target prediction to 0 for control samples (cp_type encoded as 1).
# Fixed: test_features is indexed by sig_id, so it has no `sig_id` column to read; the
# ids are taken from the index instead.
all_ctl_test_ids = list(test_features.index[test_features.cp_type == 1])
print(len(all_ctl_test_ids))
# Only target columns are zeroed, never the identifier, and all control rows are
# updated in one vectorised assignment instead of one .loc write per id.
target_columns = [column for column in all_categories if column != "sig_id"]
predictions_df.loc[predictions_df.sig_id.isin(all_ctl_test_ids), target_columns] = 0.0

In [None]:
# Preview the prediction table after the control rows were zeroed.
predictions_df.head()


In [None]:
# Fixed: this cell copied the sample submission (read from a path without the
# lish-moa folder, and written with an extra index column) instead of saving the
# model's predictions.
# NOTE(review): the output name is kept as-is; confirm whether the competition expects
# the file to be called submission.csv.
predictions_df.to_csv("the_solution.csv", index=False)
