Based on discussions [here](https://www.kaggle.com/c/lish-moa/discussion/190772) we know that each experiment has 6 sig_ids (2 cp_dose * 3 cp_time).

If we check the target columns, we can look for targets that are only active within a single experiment. Such a target would be active exactly 6 times, and its 6 rows would contain each cp_dose value and each cp_time value equally often.

# File Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.express as px # plotting
from sklearn.decomposition import PCA # Principal Component Analysis

In [None]:
# Load the training features and the scored training targets from the
# "lish-moa" input directory.
train = pd.read_csv('../input/lish-moa/train_features.csv')
targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

# Extracting Targets with Unique Experiments

In [None]:
# Sum every target column (the first column is skipped) and show the 20
# targets with the smallest totals.
target_counts = targets.iloc[:, 1:].sum()
target_counts.sort_values().head(20)

In [None]:
# Targets selected by hand as candidates for "one target == one experiment".
known_experiments = [
    'diuretic',
    'autotaxin_inhibitor',
    'protein_phosphatase_inhibitor',
    'antiarrhythmic',
    'retinoid_receptor_antagonist',
    'nicotinic_receptor_agonist',
    'atm_kinase_inhibitor',
    'calcineurin_inhibitor',
    'lxr_agonist',
    'elastase_inhibitor',
    'steroid',
    'leukotriene_inhibitor',
    'coagulation_factor_inhibitor',
    'ubiquitin_specific_protease_inhibitor',
    'tropomyosin_receptor_kinase_inhibitor',
    'laxative',
]

In [None]:
# Number of targets listed in known_experiments.
len(known_experiments)

# Preprocessing

In [None]:
"""
Code by @namanj27 from CatBoost MoA [EDA | Starter] 

https://www.kaggle.com/namanj27/catboost-moa-eda-starter
"""

# Join features and targets on sig_id, then build a long-format frame with one
# row per (sample, active target): the sample's feature columns followed by the
# name of the active target in a new 'pred' column. Samples with no active
# target produce no rows; samples with several active targets produce several.
train_n = pd.merge(train, targets, on='sig_id')

X_train_columns = train_n.columns

# The merged frame lists every column of `train` first, so the boundary between
# feature columns and target columns is the width of `train` (previously a
# hard-coded 876).
n_feature_cols = train.shape[1]

X_train = []
for v in train_n.values:
    info = list(v[:n_feature_cols])
    binary = v[n_feature_cols:]
    # One output row per target flagged with 1, in column order.
    for k, flag in enumerate(binary):
        if flag == 1:
            X_train.append(info + [X_train_columns[n_feature_cols + k]])

X_train = pd.DataFrame(X_train, columns=train.columns.tolist() + ['pred'])

In [None]:
# Display the long-format frame (feature columns plus 'pred').
X_train

In [None]:
# Default label for every row; rows whose target is in known_experiments are
# relabelled in the next cell.
X_train['Known_Experiment'] = 'No'

In [None]:
# Copy the target name into 'Known_Experiment' for rows whose 'pred' is one of
# the known_experiments targets; every other row keeps its existing value.
# A single boolean-mask assignment replaces the row-by-row iterrows()/.loc
# loop, which is slow on large frames.
is_known = X_train['pred'].isin(known_experiments)
X_train.loc[is_known, 'Known_Experiment'] = X_train.loc[is_known, 'pred']

In [None]:
# Keep only the rows whose target belongs to known_experiments.
known_mask = X_train['pred'].isin(known_experiments)
only_known_experiments = X_train.loc[known_mask]
only_known_experiments

# Visualizations

In [None]:
# Per target, count the distinct sig_id / cp_type / cp_time / cp_dose values.
# Each target is expected to show 6 distinct sig_ids, one per
# cp_time x cp_dose combination.
# The columns are selected before nunique() so distinct counts are not
# computed for every feature column only to be thrown away.
only_known_experiments.groupby('pred')[['sig_id', 'cp_type', 'cp_time', 'cp_dose']].nunique()

In [None]:
# Display X_train again, now including the 'Known_Experiment' column.
X_train

In [None]:
# Project the feature columns onto the first 50 principal components.
# The positional slice skips the first four columns and the last two
# ('pred' and 'Known_Experiment' when the cells above ran in order).
# random_state is pinned so that a randomized SVD solver, if one is selected,
# yields the same components on every run.
pca = PCA(n_components=50, random_state=0)
results = pca.fit_transform(X_train[X_train.columns[4:-2]])

In [None]:
# 3-D scatter of the first three principal components for every row,
# coloured by the 'Known_Experiment' label.
pc1, pc2, pc3 = results[:, 0], results[:, 1], results[:, 2]
fig = px.scatter_3d(
    x=pc1,
    y=pc2,
    z=pc3,
    color=X_train['Known_Experiment'],
    title="PCA Plot of Known Experiment Targets and All",
    opacity=0.4,
)
fig.show()

In [None]:
# Index labels of the rows that were tagged with a known experiment.
only_exp_idx = X_train.index[X_train['Known_Experiment'].ne("No")]

In [None]:
# Same 3-D scatter, restricted to the rows tagged with a known experiment.
# only_exp_idx holds index labels; they are used as row positions into
# `results`, which lines up while X_train keeps its default 0..n-1 index.
known_pcs = results[only_exp_idx, :3]
known_labels = X_train.loc[only_exp_idx, 'Known_Experiment']
fig = px.scatter_3d(
    x=known_pcs[:, 0],
    y=known_pcs[:, 1],
    z=known_pcs[:, 2],
    color=known_labels,
    title="PCA Plot of Known Experiment Targets Only",
    opacity=0.8,
)
fig.show()

Clustering the sig_ids 6 at a time with K-means or similar algorithms is therefore unlikely to be very useful.

# Examining sig_id

In [None]:
# Metadata-only view of the known-experiment rows, renumbered from 0.
# drop=True discards the old index directly instead of materialising it as an
# 'index' column and deleting that column afterwards.
sig_id_testing = only_known_experiments[['sig_id', 'cp_type', 'cp_time', 'cp_dose', 'pred']].reset_index(drop=True)
sig_id_testing

In [None]:
# Remove the leading "id_" from every sig_id. The anchored pattern only strips
# that literal prefix, so re-running the cell leaves already-stripped ids
# untouched (a blind x[3:] slice would eat three more characters each run).
sig_id_testing['sig_id'] = sig_id_testing['sig_id'].str.replace(r'^id_', '', regex=True)

In [None]:
# Column labels "Letter 1" .. "Letter 9", one per character position.
letter_cols = ["Letter " + str(position) for position in range(1, 10)]
letter_cols

In [None]:
# One column per character of the (prefix-stripped) sig_id.
# The characters are split explicitly rather than via str.split(""), which
# depends on how an empty regex pattern is split and needed the resulting
# empty first/last columns (0 and 10) deleted by hand.
individual_letters = pd.DataFrame(
    [list(sig) for sig in sig_id_testing.sig_id],
    index=sig_id_testing.index,
)
# Raises if the ids do not all have len(letter_cols) characters.
individual_letters.columns = letter_cols
individual_letters

In [None]:
# Bigrams: join every letter with its right-hand neighbour
# (positions 1-2 through 8-9).
for first in range(1, 9):
    names = ['Letter ' + str(pos) for pos in range(first, first + 2)]
    joined = individual_letters[names[0]]
    for name in names[1:]:
        joined = joined + individual_letters[name]
    individual_letters['+'.join(names)] = joined

In [None]:
# Trigrams: join every run of three consecutive letters
# (positions 1-3 through 7-9).
for first in range(1, 8):
    names = ['Letter ' + str(pos) for pos in range(first, first + 3)]
    joined = individual_letters[names[0]]
    for name in names[1:]:
        joined = joined + individual_letters[name]
    individual_letters['+'.join(names)] = joined

In [None]:
# 4-grams: join every run of four consecutive letters
# (positions 1-4 through 6-9).
for first in range(1, 7):
    names = ['Letter ' + str(pos) for pos in range(first, first + 4)]
    joined = individual_letters[names[0]]
    for name in names[1:]:
        joined = joined + individual_letters[name]
    individual_letters['+'.join(names)] = joined

In [None]:
# Display the single-letter columns together with the 2-, 3- and 4-letter
# combinations added above.
individual_letters

In [None]:
# Append the letter / n-gram columns to the metadata columns (column-wise).
# reset_index() without drop=True adds an 'index' column, which the next cell
# deletes.
sig_id_testing = pd.concat([sig_id_testing, individual_letters], axis=1).reset_index()

In [None]:
# Drop the 'index' column introduced by reset_index() in the previous cell,
# then display the combined frame.
del sig_id_testing['index']
sig_id_testing

In [None]:
# Integer-encode every column except the first: each distinct value in a
# column is replaced by its category code. Works on a copy so sig_id_testing
# keeps its original values.
categorical_ = sig_id_testing.copy()

for name in categorical_.columns[1:]:
    categorical_[name] = pd.Categorical(categorical_[name]).codes

In [None]:
# Display the integer-encoded frame.
categorical_

In [None]:
# Pairwise correlation of the encoded columns (the first two columns are
# skipped), shown only against cp_time, cp_dose and pred.
corr = categorical_[categorical_.columns[2:]].corr()
# Styler.set_precision() is deprecated and was removed in pandas 2.0; an
# explicit two-decimal format string gives the same display on old and new
# pandas versions.
corr[['cp_time', 'cp_dose', 'pred']].style.background_gradient(cmap='coolwarm').format('{:.2f}')

Letter 4 and prediction show the most promising correlation

In [None]:
# Count the distinct 'Letter 4' values per target.
# Observed: each target has 4, 5 or 6 distinct letters, so this letter does
# not isolate an experiment, and clustering sig_ids 6 at a time is unlikely to
# help either.
letter4_by_pred = sig_id_testing[['Letter 4', 'pred']].groupby('pred')
letter4_by_pred.agg(['nunique'])

I was unable to find a link between targets/sig_id to get experiment_id - maybe you will have better luck