# Essential Libraries

In [None]:
# Importing essential libraries for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

# Remember when the notebook started so the last cell can report the total runtime
t0start = time.time()

# Walk the Kaggle input directory and collect the full path of every file found
data_files = []
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    data_files.extend(os.path.join(dirname, filename) for filename in filenames)
data_files


# Training Data

In [None]:
# Read the training table from the competition's parquet file
file_path = '/kaggle/input/open-problems-single-cell-perturbations/de_train.parquet'
df_de_train = pd.read_parquet(file_path)

# Cell output: the table dimensions together with its first rows
(df_de_train.shape, df_de_train.head())


In [None]:
# First row of the columns from position 5 onward (the slice the rest of the
# notebook uses as the target matrix)
df_de_train.head(1).iloc[:, 5:]


# Dimensionality reduction (PCA, UMAP, ...), visualization, and clustering of the training targets

In [None]:
# Target matrix: every column from position 5 to the end of the training table
first_target_col = 5
X = df_de_train.iloc[:, first_target_col:]
print(X.shape)

In [None]:
%%time
from sklearn.decomposition import PCA

# ---- Colour vectors for the scatter plots ------------------------------------
sm_names = df_de_train['sm_name']

v1_color = df_de_train['cell_type']
v4_color = df_de_train['control']

# Compounds whose name ends in "nib":
#   v2 keeps those names and collapses every other compound into one label,
#   v3 keeps the other names and collapses the "nib" compounds into one label.
# (Neither is plotted by default; see list_cfg below.)
l = [name for name in sm_names if name.endswith('nib')]
m = sm_names.isin(l)
v2_color = sm_names.copy()
v2_color[~m] = 'non -nib'
v3_color = sm_names.copy()
v3_color[m] = '*nib'

# Only the hand-picked compounds keep their name; all others become NaN
list_top_drugs = ['MLN 2238', 'Resminostat', 'CEP-18770 (Delanzomib)', 'Oprozomib (ONX 0912)', 'Belinostat', 'Vorinostat', 'Ganetespib (STA-9090)', 'Scriptaid', 'Proscillaridin A;Proscillaridin-A', 'Alvocidib', 'IN1451']
m = sm_names.isin(list_top_drugs)
v5_color = sm_names.copy()
v5_color[~m] = np.nan

# One subplot per [title, colour vector] pair
list_cfg = [['cell type', v1_color], ['control', v4_color], ['top compounds', v5_color]]

# ---- Project the targets onto the first 100 principal components ------------
str_inf = 'PCA'
reducer = PCA(n_components=100)
Xr = reducer.fit_transform(X)

# ---- One figure per pair of components ---------------------------------------
component_pairs = [[0, 1], [0, 2], [1, 2], [3, 4], [5, 6], [7, 8]]
for i, j in component_pairs:
    plt.figure(figsize=(20, 10))
    for ic, (str_inf1, v_for_color) in enumerate(list_cfg, start=1):
        plt.subplot(1, len(list_cfg), ic)
        sns.scatterplot(x=Xr[:, i], y=Xr[:, j], hue=v_for_color, s=100)
        plt.xlabel(str_inf + str(i + 1), fontsize=20)
        plt.ylabel(str_inf + str(j + 1), fontsize=20)
        plt.title(str_inf1 + ' ', fontsize=20)

    plt.show()

In [None]:
# Build a NEW frame from the five leading columns plus the first three principal
# components. `assign` returns a copy, so the new columns are never written into a
# slice of df_de_train (which triggers SettingWithCopyWarning and, depending on the
# pandas version, may or may not modify the slice).
# NOTE: `Xr` must still be the sample-level PCA from the previous cell; later cells
# rebind `Xr`, so this cell is only valid when the notebook is run top to bottom.
d = df_de_train.iloc[:, :5].assign(
    PCA1=Xr[:, 0],
    PCA2=Xr[:, 1],
    PCA3=Xr[:, 2],
)

# Collect the compounds of the 8 highest-scoring rows along each component.
list_top_drugs = []
for pc_name in ['PCA1', 'PCA2', 'PCA3']:
    top_rows = d.sort_values(pc_name, ascending=False).head(8)  # sort once, reuse
    display(top_rows)
    list_top_drugs += top_rows['sm_name'].to_list()
    if pc_name == 'PCA1':
        # The original cell prints the intermediate list after the first component only
        print(list_top_drugs)

# De-duplicate while keeping first-seen order so the printed list is reproducible
# (a plain set() has no stable order across runs).
print(list(dict.fromkeys(list_top_drugs)))

## Clustering  Cell Types

In [None]:
%%time
N = df_de_train.shape[1]
print(N)

# Median of every target column within each cell type
target_columns = list(df_de_train.columns[5:N])
X = df_de_train[['cell_type'] + target_columns].groupby('cell_type').median()
print(X.shape)

# Correlation between the per-cell-type median profiles; only the magnitude is kept
cm = np.corrcoef(X)
print(cm[:3, :2])
l = list(X.index)
cm = pd.DataFrame(np.abs(cm), index=l, columns=l)
print(cm.shape)

# Hierarchically clustered heat map with the values written into the cells
sns.clustermap(cm, annot=True, fmt=".2f", cmap="coolwarm")
plt.show()

## Clustering compounds

In [None]:
%%time
N = df_de_train.shape[1]
print(N)

# Median of every target column within each compound
target_columns = list(df_de_train.columns[5:N])
X = df_de_train[['sm_name'] + target_columns].groupby('sm_name').median()
print(X.shape)

# Correlation between the per-compound median profiles; only the magnitude is kept
cm = np.corrcoef(X)
print(cm[:3, :2])
l = [name[:20] for name in X.index]  # cut long names
cm = pd.DataFrame(np.abs(cm), index=l, columns=l)
print(cm.shape)

# Clustered heat map; keep the grid so the dendrogram ordering can be read back
clustergrid = sns.clustermap(cm, cmap="coolwarm")
plt.show()
reordered_columns = clustergrid.dendrogram_col.reordered_ind
reordered_rows = clustergrid.dendrogram_row.reordered_ind
print(len(reordered_rows), len(reordered_columns))
print(list(cm.index[reordered_rows]))

# Same matrix once more, this time with the values annotated in each cell
sns.clustermap(cm, annot=True, fmt=".2f", cmap="coolwarm")
plt.show()

## Clustering samples (i.e. pairs cell + compound)

In [None]:
%%time
N = df_de_train.shape[1]
X = df_de_train.iloc[:, 5:N]
print(X.shape)

# Correlation between individual rows (one row per compound / cell-type pair)
cm = np.corrcoef(X)
print(cm[:3, :2])

# Label every row as "<compound> <cell type>"
l = [sm + ' ' + ct for sm, ct in zip(df_de_train['sm_name'], df_de_train['cell_type'])]
cm = pd.DataFrame(np.abs(cm), index=l, columns=l)
print(cm.shape)

# Clustered heat map; keep the grid so the dendrogram ordering can be read back
clustergrid = sns.clustermap(cm)
plt.show()
reordered_columns = clustergrid.dendrogram_col.reordered_ind
reordered_rows = clustergrid.dendrogram_row.reordered_ind
print(len(reordered_rows), len(reordered_columns))
print(list(cm.index[reordered_rows]))

## A look at gene space

In [None]:
%%time
from sklearn.decomposition import PCA

# Transpose the target matrix so that each row corresponds to one target column
X = df_de_train.iloc[:, 5:].T
print(X.shape)

# Colour every point by its row position in X
v1_color = pd.Series(range(len(X)), name='index')

# One subplot per [title, colour vector] pair
list_cfg = [['Genes', v1_color]]

str_inf = 'PCA'
reducer = PCA(n_components=10)
Xr = reducer.fit_transform(X)

# One figure per pair of components
component_pairs = [[0, 1], [0, 2], [1, 2], [3, 4], [5, 6], [7, 8]]
for i, j in component_pairs:
    plt.figure(figsize=(20, 10))
    for ic, (str_inf1, v_for_color) in enumerate(list_cfg, start=1):
        plt.subplot(1, len(list_cfg), ic)
        sns.scatterplot(x=Xr[:, i], y=Xr[:, j], hue=v_for_color, s=100)
        plt.xlabel(str_inf + str(i + 1), fontsize=20)
        plt.ylabel(str_inf + str(j + 1), fontsize=20)
        plt.title(str_inf1 + ' ', fontsize=20)

    plt.show()

In [None]:
%%time
from sklearn.decomposition import PCA
import umap

# Transpose the target matrix so that each row corresponds to one target column
X = df_de_train.iloc[:, 5:].T
print(X.shape)

# Colour every point by its row position in X
v1_color = pd.Series(range(len(X)), name='index')

# One subplot per [title, colour vector] pair
list_cfg = [['Genes', v1_color]]

# UMAP with its default settings (a PCA reducer could be swapped in here instead)
str_inf = 'UMAP'
reducer = umap.UMAP()

Xr = reducer.fit_transform(X)

# Only the first two embedding coordinates are plotted
component_pairs = [[0, 1]]
for i, j in component_pairs:
    plt.figure(figsize=(20, 10))
    for ic, (str_inf1, v_for_color) in enumerate(list_cfg, start=1):
        plt.subplot(1, len(list_cfg), ic)
        sns.scatterplot(x=Xr[:, i], y=Xr[:, j], hue=v_for_color, s=100)
        plt.xlabel(str_inf + str(i + 1), fontsize=20)
        plt.ylabel(str_inf + str(j + 1), fontsize=20)
        plt.title(str_inf1 + ' ', fontsize=20)

    plt.show()

## Genes clustering

In [None]:
%%time
# Upper column bound of the slice to cluster (other values tried: 18211, 15_000, 10000)
N = 1000

# Columns 5..N-1 of the training table, transposed so that rows are target columns
selected_columns = df_de_train.columns[5:N]
X = df_de_train.iloc[:, 5:N].T
print(X.shape)

# Correlation between the selected columns; only the magnitude is kept
cm = np.corrcoef(X)
print(cm[:3, :2])
cm = pd.DataFrame(np.abs(cm), index=selected_columns, columns=selected_columns)
print(cm.shape)

sns.clustermap(cm)
plt.show()

In [None]:
# Gene-symbol lists used below to select columns of df_de_train for the gene-set clustermaps.
# NOTE(review): the variable names suggest G1/S and G2/M cell-cycle marker sets attributed
# to "Tirosh"; the exact source is not stated in this notebook — TODO confirm and cite.
G1S_genes_Tirosh = ['MCM5', 'PCNA', 'TYMS', 'FEN1', 'MCM2', 'MCM4', 'RRM1', 'UNG', 'GINS2', 'MCM6', 'CDCA7', 'DTL', 'PRIM1', 'UHRF1', 'MLF1IP', 'HELLS', 'RFC2', 'RPA2', 'NASP', 'RAD51AP1', 'GMNN', 'WDR76', 'SLBP', 'CCNE2', 'UBR7', 'POLD3', 'MSH2', 'ATAD2', 'RAD51', 'RRM2', 'CDC45', 'CDC6', 'EXO1', 'TIPIN', 'DSCC1', 'BLM', 'CASP8AP2', 'USP1', 'CLSPN', 'POLA1', 'CHAF1B', 'BRIP1', 'E2F8']
G2M_genes_Tirosh = ['HMGB2', 'CDK1', 'NUSAP1', 'UBE2C', 'BIRC5', 'TPX2', 'TOP2A', 'NDC80', 'CKS2', 'NUF2', 'CKS1B', 'MKI67', 'TMPO', 'CENPF', 'TACC3', 'FAM64A', 'SMC4', 'CCNB2', 'CKAP2L', 'CKAP2', 'AURKB', 'BUB1', 'KIF11', 'ANP32E', 'TUBB4B', 'GTSE1', 'KIF20B', 'HJURP', 'CDCA3', 'HN1', 'CDC20', 'TTK', 'CDC25C', 'KIF2C', 'RANGAP1', 'NCAPD2', 'DLGAP5', 'CDCA2', 'CDCA8', 'ECT2', 'KIF23', 'HMMR', 'AURKA', 'PSRC1', 'ANLN', 'LBR', 'CKAP5', 'CENPE', 'CTCF', 'NEK2', 'G2E3', 'GAS2L3', 'CBX5', 'CENPA']
# Concatenation of the two lists above. Later cells rebuild this sum inline rather than
# reading this variable.
genes_Tirosh = G1S_genes_Tirosh + G2M_genes_Tirosh

# Genes intended to capture the "fast" cell cycle pattern - see https://arxiv.org/abs/2208.05229
# NOTE(review): every entry except 'HIST2H2AC' appears in one of the two Tirosh lists above,
# so this is not strictly a subset of them.
list_genes_fastCCsign = ['CDK1', 'UBE2C', 'TOP2A', 'TMPO', 'HJURP', 'RRM1', 'RAD51AP1', 'RRM2', 'CDC45', 'BLM', 'BRIP1', 'E2F8', 'HIST2H2AC']

# Two further gene-symbol lists, used by the gene-set clustermap cell both separately and
# concatenated.
# NOTE(review): the variable names suggest G1/S and G2/M cell-cycle marker sets attributed
# to "Freeman"; the source is not stated in this notebook — TODO confirm and cite.
G1S_genes_Freeman = ['ADAMTS1', 'ASF1B', 'ATAD2', 'BARD1', 'BLM', 'BRCA1', 'BRIP1', 'C17orf75', 'C9orf40', 'CACYBP', 'CASP8AP2', 'CCDC15', 'CCNE1', 'CCNE2', 'CCP110', 'CDC25A', 'CDC45', 'CDC6', 'CDC7', 'CDK2', 'CDT1', 'CENPJ', 'CENPQ', 'CENPU', 'CEP57', 'CHAF1A', 'CHAF1B', 'CHEK1', 'CLSPN', 'CREBZF', 'CRYL1', 'CSE1L', 'DCLRE1B', 'DCTPP1', 'DEK', 'DERA', 'DHFR', 'DNA2', 'DNAJC9', 'DNMT1', 'DONSON', 'DSCC1', 'DSN1', 'DTL', 'E2F8', 'EED', 'EFCAB11', 'ENDOD1', 'ETAA1', 'EXO1', 'EYA2', 'EZH2', 'FAM111A', 'FANCE', 'FANCG', 'FANCI', 'FANCL', 'FBXO5', 'FEN1', 'GGH', 'GINS1', 'GINS2', 'GINS3', 'GLMN', 'GMNN', 'GMPS', 'GPD2', 'HADH', 'HELLS', 'HSF2', 'ITGB3BP', 'KIAA0101', 'KNTC1', 'LIG1', 'MCM10', 'MCM2', 'MCM3', 'MCM4', 'MCM5', 'MCM6', 'MCM7', 'MCMBP', 'METTL9', 'MMD', 'MNS1', 'MPP1', 'MRE11A', 'MSH2', 'MSH6', 'MYO19', 'NASP', 'NPAT', 'NSMCE4A', 'ORC1', 'OSGEPL1', 'PAK1', 'PAQR4', 'PARP2', 'PASK', 'PAXIP1', 'PBX3', 'PCNA', 'PKMYT1', 'PMS1', 'POLA1', 'POLA2', 'POLD3', 'POLE2', 'PRIM1', 'PRPS2', 'PSMC3IP', 'RAB23', 'RAD51', 'RAD51AP1', 'RAD54L', 'RBBP8', 'RBL1', 'RDX', 'RFC2', 'RFC3', 'RFC4', 'RMI1', 'RNASEH2A', 'RPA1', 'RRM1', 'RRM2', 'SLBP', 'SLC25A40', 'SMC2', 'SMC3', 'SSX2IP', 'SUPT16H', 'TEX30', 'TFDP1', 'THAP10', 'THEM6', 'TIMELESS', 'TIPIN', 'TMEM106C', 'TMEM38B', 'TRIM45', 'TRIP13', 'TSPYL4', 'TTI1', 'TUBGCP5', 'TYMS', 'UBR7', 'UNG', 'USP1', 'WDHD1', 'WDR76', 'WRB', 'YEATS4', 'ZBTB14', 'ZWINT']
G2M_genes_Freeman = ['ADGRE5', 'ARHGAP11A', 'ARHGDIB', 'ARL6IP1', 'ASPM', 'AURKA', 'AURKB', 'BIRC5', 'BORA', 'BRD8', 'BUB1', 'BUB1B', 'BUB3', 'CCNA2', 'CCNB1', 'CCNB2', 'CCNF', 'CDC20', 'CDC25B', 'CDC25C', 'CDC27', 'CDCA3', 'CDCA8', 'CDK1', 'CDKN1B', 'CDKN3', 'CENPE', 'CENPF', 'CENPI', 'CENPN', 'CEP55', 'CEP70', 'CEP85', 'CKAP2', 'CKAP5', 'CKS1B', 'CKS2', 'CTCF', 'DBF4', 'DBF4B', 'DCAF7', 'DEPDC1', 'DLGAP5', 'ECT2', 'ERCC6L', 'ESPL1', 'FAM64A', 'FOXM1', 'FZD2', 'FZD7', 'FZR1', 'GPSM2', 'GTF2E1', 'GTSE1', 'H2AFX', 'HJURP', 'HMGB2', 'HMGB3', 'HMMR', 'HN1', 'INCENP', 'JADE2', 'KIF11', 'KIF14', 'KIF15', 'KIF18A', 'KIF18B', 'KIF20A', 'KIF20B', 'KIF22', 'KIF23', 'KIF2C', 'KIF4A', 'KIF5B', 'KIFC1', 'KPNA2', 'LBR', 'LMNB2', 'MAD2L1', 'MELK', 'MET', 'METTL4', 'MIS18BP1', 'MKI67', 'MPHOSPH9', 'MTMR6', 'NCAPD2', 'NCAPG', 'NCAPG2', 'NCAPH', 'NDC1', 'NDC80', 'NDE1', 'NEIL3', 'NEK2', 'NRF1', 'NUSAP1', 'OIP5', 'PAFAH2', 'PARPBP', 'PBK', 'PLEKHG3', 'PLK1', 'PLK4', 'PRC1', 'PRR11', 'PSRC1', 'PTTG1', 'PTTG3P', 'RACGAP1', 'RAD21', 'RASSF1', 'REEP4', 'SAP30', 'SHCBP1', 'SKA1', 'SLCO1B3', 'SOGA1', 'SPA17', 'SPAG5', 'SPC25', 'SPDL1', 'STIL', 'STK17B', 'TACC3', 'TAF5', 'TBC1D2', 'TBC1D31', 'TMPO', 'TOP2A', 'TPX2', 'TROAP', 'TTF2', 'TTK', 'TUBB4B', 'TUBD1', 'UBE2C', 'UBE2S', 'VANGL1', 'WEE1', 'WHSC1', 'XPO1', 'ZMYM1']

In [None]:
%%time
# Left over from the gene-clustering cell; not used below (other values tried: 18211, 15_000, 10000)
N = 1000

# [gene list, label] pairs; one clustered correlation heat map is drawn per pair
gene_sets = [
    [G1S_genes_Tirosh, 'G1S Tirosh'],
    [G2M_genes_Tirosh, 'G2M Tirosh'],
    [G1S_genes_Tirosh + G2M_genes_Tirosh, 'All Tirosh'],
    [list_genes_fastCCsign, 'FastCC Signature'],
    [G1S_genes_Freeman, 'G1S Freeman'],
    [G2M_genes_Freeman, 'G2M Freeman'],
    [G1S_genes_Freeman + G2M_genes_Freeman, 'All Freeman'],
]

for l, str_inf in gene_sets:
    # Keep the genes that are columns of the training table, de-duplicated and in
    # list order. A set intersection selects the same genes, but its iteration order
    # changes between kernel sessions, so the row order fed to the clustering (and
    # the printed lists) would not be reproducible.
    ll = [gene for gene in dict.fromkeys(l) if gene in df_de_train.columns]
    print(len(ll), str_inf)

    # Rows = selected genes, columns = training rows
    X = df_de_train[ll].T
    print(X.shape)

    # Gene-gene correlation; only the magnitude is kept
    cm = np.corrcoef(X)
    print(cm[:3, :2])
    cm = pd.DataFrame(np.abs(cm), index=ll, columns=ll)
    print(cm.shape)

    clustergrid = sns.clustermap(cm)
    # clustermap builds a multi-axes figure, so plt.title() lands on whichever axes
    # happens to be current (in practice the small colour-bar axes) instead of
    # heading the plot. Title the top (column-dendrogram) axes explicitly.
    clustergrid.ax_col_dendrogram.set_title(str_inf, fontsize=20)
    plt.show()

    # Row / column order chosen by the hierarchical clustering
    reordered_columns = clustergrid.dendrogram_col.reordered_ind
    reordered_rows = clustergrid.dendrogram_row.reordered_ind
    print(len(reordered_rows), len(reordered_columns))
    print(list(cm.index[reordered_rows]))

In [None]:
# Show the training table as the cell output
df_de_train

In [None]:
%%time
# [gene list, label] pairs to visualise. The other sets used in the previous cell
# (G1S/G2M Tirosh, FastCC signature, Freeman) can be appended here in the same form.
gene_sets = [[G1S_genes_Tirosh + G2M_genes_Tirosh, 'All Tirosh']]

# One clustered heat map of per-group medians for each grouping column:
# first by cell type, then by compound. (`col` ends as 'sm_name', as before.)
for col in ['cell_type', 'sm_name']:
    for l, str_inf in gene_sets:
        # Genes present in the training table, de-duplicated and in list order
        # (a set intersection would give a different order on every kernel start).
        ll = [gene for gene in dict.fromkeys(l) if gene in df_de_train.columns]
        print(len(ll), str_inf)

        # Select the genes BEFORE aggregating: same result as taking the median of
        # every target column and subsetting afterwards, but far less work.
        X = df_de_train[[col] + ll].groupby(col).median()
        print(X.shape)

        if col == 'sm_name':
            X.index = [t[:20] for t in X.index]  # cut too long names

        clustergrid = sns.clustermap(X)
        # plt.title() after clustermap lands on whichever axes is current (in practice
        # the colour-bar axes); title the top (column-dendrogram) axes explicitly.
        clustergrid.ax_col_dendrogram.set_title(str_inf, fontsize=20)
        plt.show()

        # Row / column order chosen by the hierarchical clustering
        reordered_columns = clustergrid.dendrogram_col.reordered_ind
        reordered_rows = clustergrid.dendrogram_row.reordered_ind
        print(len(reordered_rows), len(reordered_columns))
        print(list(X.index[reordered_rows]))
        print(list(X.columns[reordered_columns]))

# A look at the compounds (count = 146)

15 compounds - 6 times data - only in train

In [None]:
# One row per distinct (name, LINCS id, SMILES) combination
compound_columns = ['sm_name', 'sm_lincs_id', 'SMILES']
d = df_de_train.loc[:, compound_columns].drop_duplicates()
print(d.shape)

# Keep a copy on disk and show the first rows
d.to_csv('compounds.csv')
display(d.head(10))

# All distinct compound names, in order of first appearance
print(df_de_train['sm_name'].unique().tolist())

In [None]:
# Length of every SMILES string, then the sorted distinct lengths
l = list(map(len, df_de_train['SMILES']))
np.unique(l)

In [None]:
# Number of training rows per compound, most frequent first
compound_counts = df_de_train['sm_name'].value_counts()
display(compound_counts.head(20))
display(compound_counts.tail(10))

# How many compounds share each row count
compound_counts.value_counts()

# Aggregations by compounds, cell_types 

It is used for prediction in early versions of the notebook

In [None]:
%%time
# Column-wise 0.7 quantile of the target columns over all training rows
# (the median is an alternative aggregate)
targets_all = df_de_train.iloc[:, 5:]
train_aggregate_mean_or_median_or_whatever = targets_all.quantile(q=0.7)
train_aggregate_mean_or_median_or_whatever

In [None]:
%%time
d = train_aggregate_mean_or_median_or_whatever
aggregate_values = d.values

# Aggregate value of each target column, in column order
plt.figure(figsize=(20, 4))
plt.plot(aggregate_values)
plt.show()

# Distribution of the aggregate values
plt.figure(figsize=(10, 4))
plt.hist(aggregate_values, bins=100)
plt.show()

display(d.describe())

# Submission data

In [None]:
%%time
# Mapping from submission id to the (cell type, compound) pair to predict
fn = '/kaggle/input/open-problems-single-cell-perturbations/id_map.csv'
df_id_map = pd.read_csv(fn)
print(df_id_map.shape)
display(df_id_map)

# Sample submission, indexed by its first column
fn = '/kaggle/input/open-problems-single-cell-perturbations/sample_submission.csv'
df = pd.read_csv(fn, index_col=0)
print(df.shape)
df

# Prepare submission by aggregated-target

## Key params

In [None]:
# Prediction strategy. Later cells look for the substrings '_pca', '_ICA' and '_TSVD'
# in this value to pick the reducer used for denoising, and compare it with
# 'train_aggregation_by_compounds' to select the plain per-compound aggregate.
# Uncomment exactly one line:
# predict_method = 'train_aggregation_by_compounds'
# predict_method = 'train_aggregation_by_compounds_with_denoising_pca'
# predict_method = 'train_aggregation_by_compounds_with_denoising_ICA'
predict_method = 'train_aggregation_by_compounds_with_denoising_TSVD'
# Quantile level passed to the groupby(...).quantile(...) aggregations in the cells below
quantile = 0.54
# Number of components kept by the PCA / FastICA / TruncatedSVD reducer
n_components = 35

## Prepare predictions by direct aggregation of targets by compounds

In [None]:
%%time
# Per-compound aggregate of the raw targets at the configured quantile level
# (the median is an alternative aggregate)
target_columns = list(df_de_train.columns[5:])
targets_with_compound = df_de_train[['sm_name'] + target_columns]
train_aggr_direct = targets_with_compound.groupby('sm_name').quantile(quantile)
train_aggr_direct

## Prepare predictions by aggregation of PCA-reduced targets by compounds

In [None]:
%%time 
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.decomposition import TruncatedSVD

# Target matrix to denoise: every column from position 5 onward.
Y = df_de_train.iloc[:, 5:]
# Report the matrix that is actually reduced below. (The cell used to print `X`,
# a stale variable left over from the exploration cells.)
print(Y.shape)

# Pick the reducer from the method name configured in "Key params".
if '_pca' in predict_method:
    str_inf_target_dimred = 'PCA'
    # Seeded like the other reducers so that reruns give the same result when PCA
    # selects its randomized solver.
    reducer = PCA(n_components=n_components, random_state=0)
elif '_ICA' in predict_method:
    str_inf_target_dimred = 'ICA'
    reducer = FastICA(n_components=n_components, random_state=0, whiten='unit-variance')
elif '_TSVD' in predict_method:
    str_inf_target_dimred = 'TSVD'
    reducer = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)
else:
    # No denoising method named in predict_method. The modeling cells further down
    # still need `Yr` and a fitted `reducer`, so build an explicit default here
    # instead of silently reusing whatever `reducer` an earlier exploration cell
    # left in the kernel (a UMAP object fitted on different data).
    str_inf_target_dimred = ''
    reducer = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)

print(str_inf_target_dimred, reducer)

# Project onto the reduced space and map back: the reconstruction keeps only what
# the first n_components components capture.
Yr = reducer.fit_transform(Y)
Yr_inv_trans = reducer.inverse_transform(Yr)

# Carry the training index over so the compound labels line up row by row even if
# df_de_train does not have a default 0..n-1 index.
df_red_inv_trans = pd.DataFrame(
    Yr_inv_trans,
    columns=df_de_train.columns[5:],
    index=df_de_train.index,
)
df_red_inv_trans['sm_name'] = df_de_train['sm_name']

# Per-compound aggregate of the reconstructed targets at the configured quantile
train_aggr_denoised = df_red_inv_trans.groupby('sm_name').quantile(quantile)
train_aggr_denoised

In [None]:
%%time

# Choose the per-compound table that matches the configured prediction method.
if predict_method == 'train_aggregation_by_compounds':
    # Was `train_aggr`, a name that is never defined (NameError); the direct
    # per-compound aggregate is stored in `train_aggr_direct`.
    train_aggr_selected = train_aggr_direct
elif predict_method.startswith('train_aggregation_by_compounds_with_denoising_'):
    train_aggr_selected = train_aggr_denoised
else:
    train_aggr_selected = None

if train_aggr_selected is not None:
    # Give every submission row the aggregated targets of its compound. The aggregate
    # tables are indexed by 'sm_name', which merge accepts as the join key.
    df = df_id_map.merge(train_aggr_selected, how='left', on='sm_name')
    # After moving 'id' into the index, the two leading columns are the remaining
    # id_map columns; everything after them is a target column.
    df = df.set_index('id').iloc[:, 2:]
else:
    # Constant-per-target submission: fill each column of the sample submission
    # with the global aggregate of that target.
    for i, col in enumerate(df.columns):
        df[col] = train_aggregate_mean_or_median_or_whatever[col]
        if (i % 1000) == 0:
            print(i, col)

df

In [None]:
%%time
# Show the submission frame built in the previous cell
df

In [None]:
%%time
# Write the aggregation-based submission.
# NOTE(review): the "Save submission CSV" cell near the end of the notebook writes to the
# same path, so on a full run this file is overwritten by the Ridge-model predictions.
df.to_csv('submission.csv')

# Towards modeling

## Key params

In [None]:
# Number of leading reduced-target components (columns of Yr) used to encode a cell type
n_components_for_cell_type_encoding = 1
# Number of leading reduced-target components (columns of Yr) used to encode a compound
n_components_for_compound_encoding = 25
# Passed as `alpha` to Ridge in the modeling cell
alpha_regularization_for_linear_models = 10

# predict_method, quantile and n_components keep the values set in the earlier "Key params" cell

# Only used to build the model-id label; the modeling cell instantiates Ridge directly
model_type = 'Ridge'

In [None]:
# Human-readable identifier of the current configuration:
# model type, encoding sizes, regularisation strength, and target reducer with its size
str_model_id = (
    f"{model_type}"
    f" nCT{n_components_for_cell_type_encoding}"
    f" nCD{n_components_for_compound_encoding}"
    f" Al{alpha_regularization_for_linear_models}"
    f" {str_inf_target_dimred}{n_components}"
)

print(str_model_id)

# Target encoded features

In [None]:
%%time
# Target encoding: describe each cell type / compound by the per-group quantile of
# the leading components of the reduced targets `Yr` (fitted in the denoising cell).
group_key = 'column for aggregation'

# --- cell types ---
df_tmp = pd.DataFrame(Yr[:, :n_components_for_cell_type_encoding], index=df_de_train.index)
df_tmp[group_key] = df_de_train['cell_type']
df_cell_type_encoded = df_tmp.groupby(group_key).quantile(quantile)
print('df_cell_type_encoded.shape', df_cell_type_encoded.shape)
display(df_cell_type_encoded)

# --- compounds ---
df_tmp = pd.DataFrame(Yr[:, :n_components_for_compound_encoding], index=df_de_train.index)
df_tmp[group_key] = df_de_train['sm_name']
df_compound_encoded = df_tmp.groupby(group_key).quantile(quantile)
print('df_compound_encoded.shape', df_compound_encoded.shape)
display(df_compound_encoded)

# Prepare X_train, X_submit - target encoded cell type and compound features

In [None]:
%%time
def _target_encoded_features(cell_types, compounds):
    """Build the feature matrix for a sequence of (cell type, compound) pairs.

    Each output row is the cell-type encoding followed by the compound encoding of
    the corresponding pair. The lookups are done for all rows at once instead of
    one `.loc` call per row in a Python loop.

    Parameters
    ----------
    cell_types : iterable of labels present in df_cell_type_encoded.index
    compounds : iterable of labels present in df_compound_encoded.index

    Returns
    -------
    numpy.ndarray with one row per pair and
    n_components_for_cell_type_encoding + n_components_for_compound_encoding columns.

    Raises
    ------
    KeyError if a label has no encoding (same as the row-wise lookup).
    """
    cell_part = df_cell_type_encoded.loc[list(cell_types)].to_numpy()
    compound_part = df_compound_encoded.loc[list(compounds)].to_numpy()
    return np.hstack([cell_part, compound_part])


n_features = n_components_for_cell_type_encoding + n_components_for_compound_encoding

# Features for the training rows
X_train = _target_encoded_features(df_de_train['cell_type'], df_de_train['sm_name'])
assert X_train.shape == (len(df_de_train), n_features)
print(X_train.shape)
print(X_train)

# Features for the rows to predict
X_submit = _target_encoded_features(df_id_map['cell_type'], df_id_map['sm_name'])
assert X_submit.shape == (len(df_id_map), n_features)
print(X_submit.shape)
print(X_submit)

# Modeling

In [None]:
%%time
from sklearn.linear_model import Ridge

# Ridge regression from the target-encoded features to the reduced targets
model = Ridge(alpha=alpha_regularization_for_linear_models)
print(model)
model.fit(X_train, Yr)

# Predict in the reduced space, then map the predictions back to the full target space
Yr_submit = model.predict(X_submit)
Y_submit = reducer.inverse_transform(Yr_submit)
print(Y_submit.shape)
Y_submit

# Save submission CSV

In [None]:
%%time
# Rows of Y_submit follow the row order of df_id_map (X_submit was built from it), so
# label them with the ids from df_id_map rather than assuming the ids are 0..n-1.
df_submit = pd.DataFrame(
    Y_submit,
    columns=df_de_train.columns[5:],
    index=pd.Index(df_id_map['id'], name='id'),
)
print(df_submit.shape)
display(df_submit)

# NOTE: this writes to the same path as the aggregation-based submission saved
# earlier in the notebook and therefore replaces that file.
df_submit.to_csv('submission.csv')

In [None]:
# Total wall-clock time since the first cell, reported in three units
elapsed_seconds = time.time() - t0start
print('%.1f seconds passed total ' % elapsed_seconds)
print('%.1f minutes passed total ' % (elapsed_seconds / 60))
print('%.2f hours passed total ' % (elapsed_seconds / 3600))