In [None]:
# Uninstall the preinstalled fastai package. Presumably this is so that the copy under
# ../input/fastaiv1 (appended to sys.path in a later cell) is the one imported — TODO confirm.
!pip uninstall fastai -y
#!pip install -q fastai==1.0.61

In [None]:
import sys,os,gc

In [None]:
# Add the two Kaggle input directories to the end of the import search path.
sys.path.extend(['../input/deepinsight', '../input/fastaiv1'])

In [None]:
from fastai.vision import *
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fastai.callbacks import *
from tqdm.notebook import tqdm
from ml_stratifiers import MultilabelStratifiedKFold;
from pyDeepInsight import ImageTransformer, LogScaler
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

In [None]:
def make_folds(folds = 5, random_state = 0, stratify = True, scored = None):
    """Assign every training sample to a cross-validation fold, taking drugs into account.

    Drugs that occur at most 18 times are assigned to folds as whole drugs
    (all samples of such a drug share one fold); the split is made on the
    per-drug mean of the target columns. Samples of drugs that occur more
    than 18 times are split row by row on their own targets.

    Parameters
    ----------
    folds : int
        Number of folds.
    random_state : int
        Seed handed to the splitter.
    stratify : bool
        True uses ``MultilabelStratifiedKFold``; False uses a shuffled ``KFold``.
    scored : pandas.DataFrame or None
        Table containing ``sig_id``; every column after the first is treated
        as a target. Read from ``train_targets_scored.csv`` when None.

    Returns
    -------
    pandas.DataFrame
        A copy of the ``sig_id`` and ``fold`` (int8) columns of the merged table.
    """
    if stratify:
        splitter_cls = MultilabelStratifiedKFold
    else:
        # KFold is not among the notebook's top-level imports; without this
        # local import stratify=False fails with a NameError.
        from sklearn.model_selection import KFold
        splitter_cls = KFold

    def new_splitter():
        # A fresh splitter per pass, exactly as the two passes were seeded before.
        return splitter_cls(n_splits=folds, shuffle=True, random_state=random_state)

    drug = pd.read_csv('../input/lish-moa/train_drug.csv')
    if scored is None:
        scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    targets = scored.columns[1:]
    # merge() returns a new frame, so the caller's table is not modified.
    scored = scored.merge(drug, on='sig_id', how='left')

    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[vc<=18].index.sort_values()
    vc2 = vc.loc[vc>18].index.sort_values()

    # STRATIFY DRUGS 18 OR LESS (one row per drug: mean of its targets)
    dct1 = {}
    tmp = scored.groupby('drug_id')[targets].mean().loc[vc1]
    for fold,(_,idxV) in enumerate(new_splitter().split(tmp,tmp[targets])):
        dct1.update({k:fold for k in tmp.index[idxV].values})

    # STRATIFY DRUGS MORE THAN 18 (one row per sample)
    dct2 = {}
    tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
    for fold,(_,idxV) in enumerate(new_splitter().split(tmp,tmp[targets])):
        dct2.update({k:fold for k in tmp.sig_id[idxV].values})

    # ASSIGN FOLDS: by drug first, then by sample for rows still unassigned
    scored['fold'] = scored.drug_id.map(dct1)
    unassigned = scored.fold.isna()
    scored.loc[unassigned,'fold'] = scored.loc[unassigned,'sig_id'].map(dct2)
    scored['fold'] = scored['fold'].astype('int8')

    return scored[['sig_id','fold']].copy()

In [None]:
# Scored-target table; passed to make_folds() as `scored` when the folds are built.
targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

In [None]:
# Load the competition CSV files from the Kaggle input directory.
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('/kaggle/input/lish-moa/train_targets_nonscored.csv')
sample_submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [None]:
# One-hot encode cp_time, cp_dose and cp_type, append the dummy columns in that
# order, then drop the three original columns.
train_dummies = [pd.get_dummies(train_features[c], prefix=c) for c in ('cp_time', 'cp_dose', 'cp_type')]
train_features = pd.concat([train_features, *train_dummies], axis=1).drop(['cp_type', 'cp_time', 'cp_dose'], axis=1)

In [None]:
# One-hot encode cp_time, cp_dose and cp_type, append the dummy columns in that
# order, then drop the three original columns.
test_dummies = [pd.get_dummies(test_features[c], prefix=c) for c in ('cp_time', 'cp_dose', 'cp_type')]
test_features = pd.concat([test_features, *test_dummies], axis=1).drop(['cp_type', 'cp_time', 'cp_dose'], axis=1)
# Align with the already-encoded training table. If a category level is absent
# from the test file its dummy column would be missing and the later
# `test_features[cols]` lookup would raise a KeyError; such columns are filled with 0.
test_features = test_features.reindex(columns=train_features.columns, fill_value=0)

In [None]:
def sig(x):
    """Custom sigmoid used to rescale values (including negative ones): 1000 / (1 + exp(-x / 10))."""
    return 1000 / (1 + np.exp(-x / 10))

In [None]:
# Feature columns: every column of train_features after the first (presumably sig_id — verify).
cols = train_features.columns.tolist()[1:]
# Names of the original categorical columns. NOTE(review): not referenced by any later cell visible here.
cat_names = ['cp_type', 'cp_time', 'cp_dose']

In [None]:
# Apply the custom sigmoid to copies of the selected feature columns.
X_train = sig(train_features[cols].copy())
X_test = sig(test_features[cols].copy())

In [None]:
# Fit the LogScaler on the training features and transform them in one step.
ln = LogScaler()
X_train_norm = ln.fit_transform(X_train)

In [None]:
# Transform the test features with the scaler that was fitted on the training features.
X_test_norm = ln.transform(X_test)

In [None]:
# DeepInsight image transformer configured with the 'tsne' feature extractor,
# pixels=50 and a fixed random_state.
it = ImageTransformer(feature_extractor='tsne', 
                      pixels=50, random_state=2020, 
                      n_jobs=-1)

In [None]:
# Fit the transformer on the normalised training data; plot=True presumably
# draws the fitted feature layout into the figure — confirm against pyDeepInsight.
plt.figure(figsize=(5, 5))
it.fit(X_train_norm, plot=True);

In [None]:
# Feature density matrix from the fitted transformer; zero cells are replaced by NaN
# before plotting.
fdm = it.feature_density_matrix()
fdm[fdm == 0] = np.nan

plt.figure(figsize=(10, 7))

ax = sns.heatmap(fdm, cmap="viridis", linewidths=0.01, linecolor="lightgrey", square=True)
# A major tick every 5 cells on both axes.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_locator(ticker.MultipleLocator(5))
# Show the frame around the heatmap.
for spine in ax.spines.values():
    spine.set_visible(True)

In [None]:
# Compare the feature density matrix for several image sizes, one heatmap per size.
px_sizes = [25, (25, 50), 50, 100]

fig, ax = plt.subplots(1, len(px_sizes), figsize=(25, 7))
for panel, px in zip(ax, px_sizes):
    it.pixels = px
    fdm = it.feature_density_matrix()
    fdm[fdm == 0] = np.nan
    cax = sns.heatmap(fdm, cmap="viridis", linewidth=0.01, linecolor="lightgrey",
                      square=True, ax=panel, cbar=False)
    cax.set_title('Dim {} x {}'.format(*it.pixels))
    for spine in cax.spines.values():
        spine.set_visible(True)
    for axis in (cax.xaxis, cax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(5))
plt.tight_layout()

# Set the image size back to 50 for the cells that follow.
it.pixels = 50

In [None]:
# Turn the normalised feature tables into image matrices.
mat_train = it.fit_transform(X_train_norm)
# Use transform (not fit_transform) for the test set: re-fitting on test data
# would compute a new feature-to-pixel layout, so test images would no longer
# share the layout of the images the CNN is trained on.
mat_test = it.transform(X_test_norm)

In [None]:
# Preview the first four training images side by side.
fig, ax = plt.subplots(1, 4, figsize=(25, 7))
for i, panel in enumerate(ax):
    cax = sns.heatmap(mat_train[i], cmap='viridis', linewidth=0.01, linecolor='dimgrey',
                      square=True, ax=panel, cbar=False)
    cax.axis('off')
plt.tight_layout()

In [None]:
from PIL import Image

In [None]:
# Display how many training image matrices were produced.
len(mat_train)

In [None]:
# Create the output folders for the generated images. exist_ok=True keeps this
# cell re-runnable; os.mkdir raises FileExistsError when the folder already exists.
os.makedirs('/kaggle/working/image_data_train', exist_ok=True)
os.makedirs('/kaggle/working/image_data_test', exist_ok=True)

In [None]:
# sig_id of every training row; used as the PNG file names when the training images are saved.
fnames = train_features['sig_id'].values.tolist()

In [None]:
# Save every training matrix as a PNG named after its sig_id.
for i, matrix in enumerate(tqdm(mat_train)):
    # Scale to 8-bit integers (assumes matrix values lie in [0, 1] — TODO confirm).
    zimg = (matrix * 255.999).astype(np.uint8)
    # Repeat the single channel three times to get an RGB-shaped array.
    zimg = np.dstack((zimg, zimg, zimg))
    img = Image.fromarray(zimg, mode='RGB')
    img.save('/kaggle/working/image_data_train/' + str(fnames[i]) + '.png')

In [None]:
# sig_id of every test row; used as the PNG file names when the test images are saved.
y_fnames = test_features['sig_id'].values.tolist()

In [None]:
# Save every test matrix as a PNG named after its sig_id.
for j, matrix in enumerate(tqdm(mat_test)):
    # Scale to 8-bit integers (assumes matrix values lie in [0, 1] — TODO confirm).
    zimg = (matrix * 255.999).astype(np.uint8)
    # Repeat the single channel three times to get an RGB-shaped array.
    zimg = np.dstack((zimg, zimg, zimg))
    img = Image.fromarray(zimg, mode='RGB')
    img.save('/kaggle/working/image_data_test/' + str(y_fnames[j]) + '.png')

In [None]:
# Target column names: every column of train_targets_scored after the first.
tar = train_targets_scored.columns.tolist()[1:]

In [None]:
# Cross-validation settings.
FOLDS = 10
SEED = 42

# Compute the fold of every sample and attach it, by position, to a copy of the
# scored targets (relies on `targets` and `train_targets_scored` sharing row order).
ff = make_folds(folds=FOLDS, random_state=SEED, stratify=True, scored=targets)
df = train_targets_scored.assign(kfold=ff.fold.values)

In [None]:
def get_data(fold):
    """Build the fastai data bunch for one cross-validation fold.

    Rows of the global `df` whose `kfold` equals `fold` form the validation
    split; all other rows form the training split. Images are looked up by
    `sig_id` under /kaggle/working/image_data_train, labels come from the `tar`
    columns, and the images listed in `test_features` are attached as the test set.
    """
    holdout = df.kfold == fold
    valid_rows = df[holdout].index
    train_rows = df[~holdout].index

    test_items = ImageList.from_df(test_features, path='/kaggle/working', cols='sig_id',
                                   folder='image_data_test', suffix='.png')
    train_items = ImageList.from_df(df, path='/kaggle/working', cols='sig_id',
                                    folder='image_data_train', suffix='.png')
    labelled = (train_items
                .split_by_idxs(train_idx=train_rows, valid_idx=valid_rows)
                .label_from_df(cols=tar, label_cls=MultiCategoryList, one_hot=True)
                .add_test(test_items))
    return labelled.databunch(bs=64).normalize(imagenet_stats)

In [None]:
# Train one ResNet-34 learner per fold and collect each fold's test-set predictions.
test_sc = []
for fold in tqdm(range(FOLDS)):
    data = get_data(fold)
    
    learn = cnn_learner(data, models.resnet34 ,loss_func=BCEWithLogitsFlat(),pretrained=True)
    
    # Checkpoint name for this fold; reloaded after training.
    name = 'best_model_'+str(fold)
    
    # cb1 saves the model under `name` on every improvement of valid_loss.
    # cb2 presumably scales the learning rate by `factor` after `patience` epochs
    # without improvement — NOTE(review): confirm how this interacts with fit_one_cycle's schedule.
    cb1 = SaveModelCallback(learn,monitor='valid_loss',name=name,mode='min',every='improvement')
    cb2 = ReduceLROnPlateauCallback(learn,monitor='valid_loss',mode='min',patience=2,factor=0.2)
    
    lr = 1e-2
    # 20 epochs of one-cycle training with learning rates sliced from lr/2.6**4 to lr.
    learn.fit_one_cycle(20, slice(lr/(2.6**4),lr),callbacks=[cb1,cb2]) 
    
    # Load the checkpoint saved under `name`, then predict on the test set.
    learn.load(name);
    sub = learn.get_preds(DatasetType.Test)
    test_sc.append(sub[0].numpy())
    
    # Drop the references and run the garbage collector before the next fold.
    learn,data=None,None
    gc.collect()
    
# Stack the per-fold prediction arrays; axis 0 indexes the fold.
test_sc = np.array(test_sc)

In [None]:
# Average the predictions over the fold axis (axis 0 of test_sc).
avg_prds = test_sc.mean(axis=0)

In [None]:
# Target column names. NOTE: this rebinds `cols`, which an earlier cell used for the feature columns.
cols = train_targets_scored.columns.tolist()[1:]

In [None]:
# Re-read the raw test features: the `cp_type` column was dropped from `test_features`
# during one-hot encoding and is needed to find the control rows.
test_fea = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')

In [None]:
# Start from the sample submission and fill in the fold-averaged predictions.
submission = sample_submission.copy()
submission[cols] = avg_prds

# Zero every target for rows whose cp_type is 'ctl_vehicle' in the raw test file.
ctl_ids = test_fea.loc[test_fea['cp_type'] =='ctl_vehicle', 'sig_id']
submission.loc[submission['sig_id'].isin(ctl_ids), train_targets_scored.columns[1:]] = 0

# Force these two targets to zero for all rows.
for zeroed in ('atp-sensitive_potassium_channel_antagonist', 'erbb2_inhibitor'):
    submission[zeroed] = 0

In [None]:
# Clip all predictions into [0.0002, 0.999] and write the submission file.
# The clipped block is assigned back rather than calling clip(inplace=True) on
# results[cl]: that call acts on a temporary column object and, under pandas
# Copy-on-Write, leaves `results` unchanged.
results = submission.copy()
results[cols] = results[cols].clip(0.0002, 0.999)
results.to_csv('submission.csv',index=False)

In [None]:
# List the generated image files in both output folders so they can be deleted.
tst_rm = get_image_files('/kaggle/working/image_data_test/')
trn_rm = get_image_files('/kaggle/working/image_data_train/')

In [None]:
# Delete the generated PNG files: training images first, then test images.
for fl in [*trn_rm, *tst_rm]:
    os.remove(fl)