# Introduction

This notebook explores how the features cp_dose and cp_time can be used to perform feature selection.

The idea is that, for a given target, dose and time should be correlated in some way with the features that are active for that target.

In [None]:
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import random
import matplotlib.pyplot as plt
from ipywidgets import interact
import seaborn as sns
import pandas as pd
import numpy as np

from tqdm import tqdm

import plotly.graph_objects as go

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
from lightgbm import LGBMClassifier

In [None]:
# Read the training features and the scored targets; the `sig_id` column is dropped from both.
df_train = pd.read_csv('/kaggle/input/lish-moa/train_features.csv').drop(columns='sig_id')
df_target = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv').drop(columns='sig_id')

In [None]:
# Names of every column containing 'g-' or 'c-'; used as the model inputs below.
GC = [name for name in df_train.columns if 'g-' in name or 'c-' in name]

## Predict cp_dosage

### Trying to predict each target separately

In this part, I take only samples related to one target, and I try to predict the dose based on the other training features

In [None]:
# Per-target experiment: keep only the samples positive for one target and
# measure how well the GC features predict cp_dose (5-fold out-of-fold AUC).
rocs = {}
feature_importance = {}

for target in tqdm(df_target.columns):

    # Rows where the current target is active
    is_active = df_target[target] == 1
    X = df_train.loc[is_active, GC]
    lenc = LabelEncoder()
    y = lenc.fit_transform(df_train.loc[is_active, 'cp_dose'])

    # Skip targets with fewer than 10 samples or without exactly two dose levels
    if len(X) < 10 or len(np.unique(y)) != 2:
        continue

    kf = StratifiedKFold(n_splits=5)
    fold_preds, fold_truth, feat_imp = [], [], []
    for train_ind, test_ind in kf.split(X, y):
        Xtrain, Xtest = X.iloc[train_ind], X.iloc[test_ind]
        ytrain, ytest = y[train_ind], y[test_ind]

        model = LGBMClassifier(n_estimators=100)
        model.fit(Xtrain, ytrain)
        p = model.predict_proba(Xtest)[:, 1]

        fold_preds.append(p)
        fold_truth.append(ytest)
        feat_imp.append(model.feature_importances_)

    # Pool the out-of-fold predictions and score them once per target
    yy = np.concatenate(fold_truth)
    preds = np.concatenate(fold_preds)
    rocs[target] = roc_auc_score(yy, preds)
    # Feature importance averaged over the five fold models
    feature_importance[target] = np.mean(feat_imp, axis=0)

#### AUC Scores

In [None]:
# Rank the targets by out-of-fold AUC and draw them as a bar chart.
rocs_serie = pd.Series(rocs).sort_values(ascending=False)

auc_bars = go.Bar(x=rocs_serie.index, y=rocs_serie.values)
fig = go.Figure(auc_bars)
fig.update_layout(template='presentation', title='AUC Score')
fig.show()

#### Feature importance for targets with AUC > 0.8

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA, KernelPCA

# Restrict the importance table to the targets whose classifier reached AUC > 0.8
acc_cols = rocs_serie[rocs_serie > 0.8].index
fi_dose = pd.DataFrame(feature_importance, index=GC)[acc_cols]

# One bar chart of feature importances per selected target
for col in fi_dose.columns:
    importance_bars = go.Bar(x=fi_dose.index, y=fi_dose[col])
    fig = go.Figure(importance_bars)
    fig.update_layout(template='presentation', title=col, height=300)
    fig.show()

#### Top feature importance

In [None]:
# Share of the selected targets for which each feature has importance above 10;
# only features that are important at least once are kept, capped at the top 40.
is_top = ~(fi_dose <= 10)
top_fi = is_top.sum(axis=1) / len(fi_dose.columns)
top_fi = top_fi[top_fi > 0].sort_values(ascending=False).iloc[:40]

fig = go.Figure(go.Bar(x=top_fi.index, y=top_fi.values))
fig.update_layout(template='presentation', title='Top Features to predict cp_dose', height=300)
fig.show()

### Removing "obvious" features

From the figure above, we see that some features are present for all targets.
To make sure that we keep only the features relevant to each target, we run the same approach on all samples tagged as **cp_type = ctr_vehicle** and then remove the features that are important there from our training set.

#### Model Training

In [None]:
# Control experiment: on the samples whose cp_type is not 'trt_cp', check
# whether the GC features alone can predict cp_dose.
is_control = df_train.cp_type != 'trt_cp'
X = df_train.loc[is_control, GC]
y = df_train.loc[is_control, 'cp_dose']
lenc = LabelEncoder()
y = lenc.fit_transform(y)

target = 'no_target'

# Fail loudly when the data cannot support the experiment. Previously the
# training was silently skipped and `print(roc)` then raised NameError on a
# fresh kernel, or printed a value left over from an earlier run.
if len(X) <= 10 or len(np.unique(y)) != 2:
    raise ValueError(
        f'Cannot train the control cp_dose model: {len(X)} samples, '
        f'{len(np.unique(y))} distinct dose levels (need > 10 samples and exactly 2 levels)'
    )

# Fixed seed: the features removed later depend on this model, so the hold-out
# split must be the same on every run.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
model = LGBMClassifier(n_estimators=100)
model.fit(Xtrain, ytrain)
p = model.predict_proba(Xtest)[:, 1]
roc = roc_auc_score(ytest, p)

print(roc)

#### Feature importance

In [None]:
# Hold-out metrics of the control model (probabilities are rounded to get class labels)
print(roc_auc_score(ytest, p))
print(accuracy_score(ytest, np.round(p)))

# Features the control model relies on: keep importance > 10, highest first
fi = pd.Series(model.feature_importances_, index=GC)
fi = fi[fi > 10].sort_values(ascending=False)

fig = go.Figure(go.Bar(x=fi.index, y=fi.values))
fig.update_layout(template='presentation', title='feature importance for cp_type != trt_cp')
fig.show()

So it appears here that the features g-307 and g-370 are actually very important, even when there are no actual drugs in the samples.
We will rerun the analysis on the targets and remove those "obvious" features.

#### Removing features

In [None]:
# Remove high feature importances
bfi = fi[fi>10].index
GC2 = [elmt for elmt in GC if elmt not in bfi]
print(len(GC))
print(len(GC2))

### Active targets

Now that we removed features good at predicting **ctr_vehicle** samples, we try to predict cp_dose for the samples of each target separately

#### Models training

In [None]:
# Per-target cp_dose experiment on the reduced feature list GC2:
# 5-fold out-of-fold AUC and mean feature importance for each target.
rocs = {}
feature_importance = {}

for target in tqdm(df_target.columns):

    positive_rows = df_target[target] == 1
    X = df_train.loc[positive_rows, GC2]
    lenc = LabelEncoder()
    y = lenc.fit_transform(df_train.loc[positive_rows, 'cp_dose'])

    # Need at least 10 samples and exactly two dose levels
    if len(X) < 10 or len(np.unique(y)) != 2:
        continue

    kf = StratifiedKFold(n_splits=5)
    fold_preds, fold_truth, feat_imp = [], [], []
    for train_ind, test_ind in kf.split(X, y):
        Xtrain, Xtest = X.iloc[train_ind], X.iloc[test_ind]
        ytrain, ytest = y[train_ind], y[test_ind]

        model = LGBMClassifier(n_estimators=100)
        model.fit(Xtrain, ytrain)
        p = model.predict_proba(Xtest)[:, 1]

        fold_preds.append(p)
        fold_truth.append(ytest)
        feat_imp.append(model.feature_importances_)

    # Score the pooled out-of-fold predictions
    yy = np.concatenate(fold_truth)
    preds = np.concatenate(fold_preds)
    rocs[target] = roc_auc_score(yy, preds)
    feature_importance[target] = np.mean(feat_imp, axis=0)

#### ROCs

About 30 sets of samples reach an AUC > 0.8. For those samples, it is possible to correctly predict cp_dose based only on the other training features.

In [None]:
# Targets ordered by out-of-fold AUC (GC2 features), shown as a bar chart.
rocs_serie = pd.Series(rocs).sort_values(ascending=False)

auc_bars = go.Bar(x=rocs_serie.index, y=rocs_serie.values)
fig = go.Figure(auc_bars)
fig.update_layout(template='presentation', title='AUC Score')
fig.show()

#### How many samples are included in that batch of high AUC? 

The figure below shows that this analysis might be worthwhile for more than 60% of the total dataset.

In [None]:
# Share of treated samples (cp_type == 'trt_cp') that are positive for at least
# one target whose cp_dose classifier reached AUC > 0.8.

# Only the high-AUC targets: indexing with `rocs_serie.index` selected every
# target that had an AUC at all, which made the "with AUC > 0.8" slice wrong.
high_auc_targets = rocs_serie[rocs_serie > 0.8].index

# 1 when the sample is positive for at least one high-AUC target, else 0
rep = (df_target[high_auc_targets].sum(axis=1) > 0).astype(int)
rep = rep[df_train.cp_type == 'trt_cp']

# Attach labels by group key rather than by position, and keep both groups
# (count 0 if absent) so the cell still runs when one of them is empty.
labels = {0: 'with AUC < 0.8', 1: 'with AUC > 0.8'}
counts = rep.value_counts().reindex([0, 1], fill_value=0)
rep = pd.DataFrame(
    {'count': counts.values, 'names': [labels[key] for key in counts.index]},
    index=counts.index,
)

fig = px.pie(rep, values='count', names='names')
fig.update_layout(title='Repartion of samples by AUC for target cp_dose', template='presentation')
fig.show()

#### Feature importance

We select the sets of samples with ROC > 0.8 and plot the feature importance.
We notice that the important features are not the same for each target, giving a potential clue that these are the features reacting to the target.

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA, KernelPCA

# Importance table (GC2 features) limited to the targets with AUC > 0.8
acc_cols = rocs_serie[rocs_serie > 0.8].index
fi_dose = pd.DataFrame(feature_importance, index=GC2)[acc_cols]

# One bar chart of feature importances per selected target
for col in fi_dose.columns:
    importance_bars = go.Bar(x=fi_dose.index, y=fi_dose[col])
    fig = go.Figure(importance_bars)
    fig.update_layout(template='presentation', title=col, height=300)
    fig.show()

#### Which features come back the most?

We see here that after removing the "obvious" features, the top features appear much less frequently, which means that each target has its own feature importance profile.
Those features might well be related to the targets themselves.

In [None]:
# For each feature, the fraction of selected targets where its importance is above 10;
# features never above 10 are dropped and at most 40 are plotted.
is_top = ~(fi_dose <= 10)
top_fi = is_top.sum(axis=1) / len(fi_dose.columns)
top_fi = top_fi[top_fi > 0].sort_values(ascending=False).iloc[:40]

fig = go.Figure(go.Bar(x=top_fi.index, y=top_fi.values))
fig.update_layout(template='presentation', title='% of time a feature is important for the remaining targets', height=300)
fig.show()

## Predict cp_time

We restart as above, but this time we try to predict the cp_time feature. I start directly by removing the "obvious" features for cp_time.

### No target (cp_type == ctr_vehicle)

In [None]:
# Control experiment for cp_time: on the samples whose cp_type is not 'trt_cp',
# check whether the GC features alone can predict the three cp_time levels.
is_control = df_train.cp_type != 'trt_cp'
X = df_train.loc[is_control, GC]
y = df_train.loc[is_control, 'cp_time']
lenc = LabelEncoder()
y = lenc.fit_transform(y)

target = 'no_target'

# Fail loudly when the data cannot support the experiment. Previously the
# training was silently skipped and `print(roc)` then raised NameError on a
# fresh kernel, or printed the value left over from the cp_dose section.
if len(X) <= 10 or len(np.unique(y)) != 3:
    raise ValueError(
        f'Cannot train the control cp_time model: {len(X)} samples, '
        f'{len(np.unique(y))} distinct time levels (need > 10 samples and exactly 3 levels)'
    )

# Fixed seed: the features removed later depend on this model, so the hold-out
# split must be the same on every run.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
model = LGBMClassifier()
model.fit(Xtrain, ytrain)
p = model.predict_proba(Xtest)
# Multi-class AUC computed one-vs-rest over the predicted class probabilities
roc = roc_auc_score(ytest, p, multi_class='ovr')
feature_importance = model.feature_importances_

print(roc)

#### Feature importance (ctr_vehicle)

In [None]:
# Importance of every GC feature in the control cp_time model, highest first
fi = pd.Series(model.feature_importances_, index=GC).sort_values(ascending=False)

fig = go.Figure(go.Bar(x=fi.index, y=fi.values))
fig.show()

#### Removing features

In [None]:
# Remove high feature importances
bfi = fi[fi>10].index
GC2 = [elmt for elmt in GC if elmt not in bfi]
print(len(GC))
print(len(GC2))

### Predict cp_time for real targets

In [None]:
# Per-target cp_time experiment on the GC2 features: 5-fold out-of-fold
# one-vs-rest AUC and mean feature importance for each target.
rocs = {}
feature_importance = {}

for target in tqdm(df_target.columns):

    positive_rows = df_target[target] == 1
    X = df_train.loc[positive_rows, GC2]
    lenc = LabelEncoder()
    y = lenc.fit_transform(df_train.loc[positive_rows, 'cp_time'])

    # Need at least 15 samples and exactly three time levels
    if len(X) < 15 or len(np.unique(y)) != 3:
        continue

    kf = StratifiedKFold(n_splits=5)
    fold_probas, fold_truth, feat_imp = [], [], []
    for train_ind, test_ind in kf.split(X, y):
        Xtrain, Xtest = X.iloc[train_ind], X.iloc[test_ind]
        ytrain, ytest = y[train_ind], y[test_ind]

        model = LGBMClassifier(n_estimators=100)
        model.fit(Xtrain, ytrain)
        # Full probability matrix (one column per class) for the multi-class AUC
        p = model.predict_proba(Xtest)

        fold_probas.append(p)
        fold_truth.append(ytest)
        feat_imp.append(model.feature_importances_)

    # Labels are 1-D (concatenate); probabilities are stacked row-wise
    yy = np.hstack(fold_truth)
    preds = np.vstack(fold_probas)
    rocs[target] = roc_auc_score(yy, preds, multi_class='ovr')
    feature_importance[target] = np.mean(feat_imp, axis=0)

#### ROC

In [None]:
# Targets ordered by out-of-fold cp_time AUC, shown as a bar chart.
rocs_serie = pd.Series(rocs).sort_values(ascending=False)

auc_bars = go.Bar(x=rocs_serie.index, y=rocs_serie.values)
fig = go.Figure(auc_bars)
fig.update_layout(template='presentation', title='AUC Score')
fig.show()

#### Feature Importance

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA, KernelPCA

# cp_time importance table (GC2 features) limited to the targets with AUC > 0.8
acc_cols = rocs_serie[rocs_serie > 0.8].index
fi_time = pd.DataFrame(feature_importance, index=GC2)[acc_cols]

# One bar chart of feature importances per selected target
for col in fi_time.columns:
    importance_bars = go.Bar(x=fi_time.index, y=fi_time[col])
    fig = go.Figure(importance_bars)
    fig.update_layout(template='presentation', title=col, height=300)
    fig.show()

#### Which features come back the most?

We see here that after removing the "obvious" features, the top features appear much less frequently, which means that each target has its own feature importance profile.
Those features might well be related to the targets themselves.

In [None]:
# For each feature, the fraction of selected cp_time targets where its importance is above 10;
# features never above 10 are dropped and at most 40 are plotted.
is_top = ~(fi_time <= 10)
top_fi = is_top.sum(axis=1) / len(fi_time.columns)
top_fi = top_fi[top_fi > 0].sort_values(ascending=False).iloc[:40]

fig = go.Figure(go.Bar(x=top_fi.index, y=top_fi.values))
fig.update_layout(template='presentation', title='% of time a feature is important for the remaining targets', height=300)
fig.show()

## Are the same targets detected ?

Variations in **cp_dose** seem easier to predict than variations in **cp_time**.
On the other hand, we see that the targets with a high AUC for **cp_time** are all included in the targets with a high AUC for **cp_dose**.

In [None]:
# Targets that passed the AUC > 0.8 filter in both the cp_dose and the cp_time experiments
dose_targets = fi_dose.columns
time_targets = fi_time.columns
common = list(set(dose_targets).intersection(set(time_targets)))

# Report each experiment's selected targets, then the overlap
print(f'total number of targets for target = dose : {len(fi_dose.columns)}')
print(dose_targets)
print('\n')
print(f'total number of targets for target = time : {len(fi_time.columns)}')
print(time_targets)
print('\n')
print('\n')
print('targets in common in both sets:')
print(common)

## Are the features extracted the same ?

In [None]:
# For every target selected in both experiments, overlay the cp_dose and
# cp_time feature importances in a single bar chart.
for col in common:
    fig = go.Figure()
    for trace_name, fi_frame in (('dose', fi_dose), ('time', fi_time)):
        fig.add_trace(go.Bar(x=fi_frame.index, y=fi_frame[col], name=trace_name))
    fig.update_layout(template='presentation', height=300, title=col)
    fig.show()

# Using this method to remove outliers ?

Let's visualise the PCA of the features selected by our cp_dose classifier for each target

In [None]:
# For every selected target, project its positive samples onto the first two
# principal components of the features the cp_dose classifier found important,
# one scatter per dose level. Two importance cut-offs (10 and 5) are drawn side by side.
for target in fi_dose.columns:
    importances = fi_dose[target]
    is_active = df_target[target] == 1

    fig = plt.figure(figsize=(25, 5))
    try:
        for panel, threshold in enumerate((10, 5), start=1):
            plt.subplot(1, 2, panel)
            plt.title(target + f'importance > {threshold}')
            features = importances[importances > threshold].index

            # Fit the projection on all positive samples of this target
            sub = df_train.loc[is_active, features]
            pca = PCA(2)
            pca.fit(sub)

            for d in df_train.cp_dose.unique():
                sub_df = df_train.loc[is_active & (df_train.cp_dose == d), features]
                xpca = pca.transform(sub_df)
                plt.scatter(xpca[:, 0], xpca[:, 1], label=d)
            plt.legend()
        plt.show()
    except Exception as err:
        # Best effort: a target that cannot be plotted (e.g. PCA rejecting the
        # input) is skipped, but the reason is reported instead of being
        # silently swallowed. Unlike a bare `except`, this does not trap
        # KeyboardInterrupt, so the loop can still be interrupted.
        print(f'Skipping {target}: {type(err).__name__}: {err}')
    finally:
        # Release the figure so failed targets do not leave open, half-drawn figures
        plt.close(fig)