In [None]:
#import Librairies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import  KFold , GridSearchCV, train_test_split
from sklearn.ensemble import  RandomForestClassifier
import random

> # Read data

In [None]:

# Training features: the 'sig_id' identifier column is removed right after loading.
train_features = pd.read_csv('../input/lish-moa/train_features.csv').drop(columns=['sig_id'])

# Scored training targets: the identifiers are moved out into `sig_id`,
# leaving only the target columns in the frame.
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
sig_id = train_targets.pop('sig_id')


# Test features, again without the identifier column.
test_features = pd.read_csv('../input/lish-moa/test_features.csv').drop(columns=['sig_id'])
#sample_submission  = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [None]:
# (rows, columns) of the three frames, in the order train features / test features / train targets.
tuple(frame.shape for frame in (train_features, test_features, train_targets))

# Feature engineering - EDA

In [None]:
# Histogram of the column-wise means of the training targets (one value per target column).
plt.hist(x=train_targets.mean())
# (Original note, translated: helper for plotting the distribution of the gene-expression and cell-viability data.)

In [None]:
# Categorical columns that are visualised with count plots.
catList = ['cp_type', 'cp_dose']
# Every other column of train_features, kept in the frame's own column order.
# A comprehension is used instead of a set difference because the iteration
# order of a set of strings is not stable between interpreter runs, which made
# the column order (and everything displayed from it) change from run to run.
countList = [col for col in train_features.columns if col not in catList]

In [None]:
# One count plot per categorical column in catList, stacked vertically.
fig, axes = plt.subplots(2, 1, figsize=(8, 8))

for ax, col in zip(fig.axes, catList):
    sns.countplot(x=col, alpha=0.7, data=train_features, ax=ax)
    # Rotate the tick labels only after the bars exist; setting tick labels on
    # the still-empty axes operated on placeholder ticks rather than on the
    # category labels produced by countplot.
    ax.tick_params(axis='x', labelrotation=90)

fig.tight_layout()

In [None]:

# Count plot of the cp_time values found in the test features.
fig, axes = plt.subplots(1, 1, figsize=(8, 8))

# A 1x1 grid yields a single Axes object, so no loop is needed.
sns.countplot(x='cp_time', alpha=0.7, data=test_features, ax=axes)
# Rotate the tick labels after plotting so the rotation applies to the
# category labels created by countplot, not to the ticks of an empty axes.
axes.tick_params(axis='x', labelrotation=90)

In [None]:
# Target column(s) whose mean over the training rows is the largest.
# (Original heading, translated: analysis of feature importance with respect to the scored target.)
train_targets.mean().loc[lambda means: means == means.max()]

In [None]:
# Ranking helper for the numeric columns (used below on the gene-expression "g-"
# and cell-viability "c-" columns): returns their statistics sorted by value.
def getContinuousVariablesRanking(df, target):
    """Rank the non-object columns of ``df`` by their F-score against ``target``.

    Every column whose dtype is not ``object`` (except ``target`` itself) is
    scored with ``f_classif`` through ``SelectKBest``; missing values are
    replaced by -1 before scoring. The scores are min-max scaled.

    Args:
        df: frame holding the candidate columns and the target column.
        target: name of the column in ``df`` used as the class label.

    Returns:
        DataFrame with the columns ``scaled_importance`` and ``columns``
        (the scored column names), sorted by ``scaled_importance`` descending.
    """
    non_object_cols = df.dtypes[df.dtypes != 'object'].index
    cont_vars = [name for name in non_object_cols if name != target]

    # k equals the number of candidates, so no column is discarded; the
    # selector is only used to obtain one score per column.
    selector = SelectKBest(f_classif, k=len(cont_vars))
    selector.fit_transform(df[cont_vars].fillna(-1), df[target])

    # Min-max scale the raw scores.
    scores = pd.Series(selector.scores_)
    shifted = scores - scores.min()
    ranking = pd.DataFrame({'scaled_importance': shifted / shifted.max()})
    ranking['columns'] = cont_vars

    return ranking.sort_values(by='scaled_importance', ascending=False)

In [None]:
# Columns listed in countList placed side by side with the 'nfkb_inhibitor'
# target, the latter converted to strings.
features_toptarget_count = pd.concat(
    [
        train_features[countList],
        train_targets['nfkb_inhibitor'].astype('str'),
    ],
    axis='columns',
)

# Score every non-object column of that frame against the target.
df_value = getContinuousVariablesRanking(df=features_toptarget_count, target='nfkb_inhibitor')

df_value

In [None]:
# Density of each of the ten best-ranked columns, one curve per value of the target label.
colist = df_value.head(10)['columns']
label = 'nfkb_inhibitor'
for col in colist:
    g = sns.FacetGrid(features_toptarget_count[[col, label]], hue=label, height=4, aspect=1.5)
    # sns.distplot (and the `shade` option of its KDE) is deprecated in seaborn;
    # kdeplot with fill=True draws the same filled density curve that
    # distplot(hist=False, kde_kws={'shade': True, 'linewidth': 3}) produced.
    g.map_dataframe(sns.kdeplot, x=col, fill=True, linewidth=3)
    g.set_axis_labels(col, "density").add_legend()

In [None]:
# Split the feature names into the "g-" columns and the "c-" columns.
gene_cols = train_features.columns[train_features.columns.str.startswith('g-')].tolist()
cell_cols = train_features.columns[train_features.columns.str.startswith('c-')].tolist()

In [None]:
# Correlation matrix for 10 randomly chosen "g-" columns and 10 randomly chosen "c-" columns.
# A dedicated, seeded generator keeps the sampled columns (and therefore the
# figure) identical on every Restart & Run All, without touching the state of
# the global `random` module.
column_sampler = random.Random(42)
selected_cols = column_sampler.sample(gene_cols, 10) + column_sampler.sample(cell_cols, 10)
corr_selected_cols = train_features[selected_cols].corr()
plt.figure(figsize = (13,13))
sns.heatmap(corr_selected_cols, cmap="YlGnBu", annot = True, square = True)
plt.title('Corrélation entre un échantillon aléatoire de g- et de c-');

In [None]:
# First five rows of the target frame.
train_targets.iloc[:5]

In [None]:
# Column-wise sum of all targets (number of positive responses per target),
# sorted ascending and reshaped into a two-column frame.
x = (
    train_targets.sum()
    .sort_values()
    .rename_axis('target')
    .reset_index(name='responses')
)
x

In [None]:
# Bar chart of the 20 targets with the most positive responses
# (the last 20 rows of x, which is sorted ascending).
fig = plt.figure(figsize=(10, 10))
ax = sns.barplot(data=x.iloc[-20:], x='target', y='responses')
ax.set_title('Target classes with highest number of positive responses')
ax.set_xticklabels(x.iloc[-20:]['target'], rotation=90);

In [None]:
# Bar chart of the 20 targets with the fewest positive responses
# (the first 20 rows of x, which is sorted ascending).
fig = plt.figure(figsize=(10, 10))
ax = sns.barplot(data=x.iloc[:20], x='target', y='responses')
ax.set_title('Target classes with lowest number of positive responses')
ax.set_xticklabels(x.iloc[:20]['target'], rotation=90);

In [None]:
# Bar chart of the number of positive responses for every target.
fig = plt.figure(figsize=(17, 17))
ax = sns.barplot(data=x, x='target', y='responses')
ax.set_title('Number of positive responses for each target')
ax.set_xticklabels(x['target'], rotation=90);

In [None]:
# Row-wise sum over all target columns (number of activations per sig_id row),
# followed by how often each total occurs.
y = train_targets.sum(axis='columns')
y.value_counts()

In [None]:
# Count plot of the number of activations per row, each bar annotated with its
# share of all rows in percent.
# The series is passed as `x=`: recent seaborn versions take only `data` as a
# positional argument, so the bare positional form no longer counts the values.
ax = sns.countplot(x=y, palette = 'pastel')
plt.title('Number of activations');
total = len(y)
for p in ax.patches:
    ht = p.get_height()
    # Label at the top-left corner of the bar.
    ax.text(p.get_x(), ht, '{:1.2f}%'.format(ht*100/total))

# training avec le modèle de réseau neuronal simple

Préparation des données

In [None]:
# Encode the categorical columns of train_features as numbers.
# Run this once on the freshly loaded frame: it overwrites the columns in place.

# cp_dose: 'D1' -> 0, 'D2' -> 1. Integers are used instead of the strings
# '0'/'1' so the column stays numeric for later aggregations such as mean().
train_features['cp_dose'] = train_features['cp_dose'].map({'D1': 0, 'D2': 1})

# cp_type: 0 for "trt_cp", 1 for any other value. The vectorised comparison
# covers however many rows the frame has, instead of a hard-coded row count.
train_features['cp_type'] = (train_features['cp_type'] != 'trt_cp').astype(int)

In [None]:
# Encode the categorical columns of test_features as numbers.
# Run this once on the freshly loaded frame: it overwrites the columns in place.

# cp_dose: 'D1' -> 0, 'D2' -> 1. Integers are used instead of the strings
# '0'/'1' so the column stays numeric.
test_features['cp_dose'] = test_features['cp_dose'].map({'D1': 0, 'D2': 1})

# cp_type: 0 for "trt_cp", 1 for any other value. The vectorised comparison
# works for a test set of any length; the previous loop assumed exactly 3982
# rows and fails when the test file has a different size.
test_features['cp_type'] = (test_features['cp_type'] != 'trt_cp').astype(int)

In [None]:
# First five rows of the encoded test features.
test_features.iloc[:5]

In [None]:
# First five rows of the encoded training features.
train_features.iloc[:5]

In [None]:
# Column names of the feature frame and of the target frame, as plain lists.
train = list(train_features.columns)
target = list(train_targets.columns)

In [None]:
# Replace missing values by the mean of their own column.
# fillna with a Series fills column by column, matching the Series index against
# the column labels, so it must be applied to the untransposed frame. The
# previous transpose -> fillna -> transpose matched feature names against row
# labels, filled nothing, and turned every column into object dtype.
# numeric_only=True skips any column that still holds strings.
train_feature = train_features.fillna(train_features.mean(numeric_only=True))

# float32 matrices passed to the network.
X_train = np.asarray(train_feature[train].values,dtype ='float32')
y_train = np.asarray(train_targets[target].values,dtype='float32')

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Activation,Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam,RMSprop

**le training**

In [None]:
def model_network(n_features=875, n_targets=206):
    """Build the first (uncompiled) feed-forward network.

    Layer stack: BatchNormalization -> Dense(2048) + LeakyReLU(0.2) ->
    BatchNormalization -> Dense(1024) + LeakyReLU(0.2) -> Dense(512) + tanh ->
    Dropout(0.2) -> Dense(n_targets, sigmoid).

    Args:
        n_features: length of one input vector. Defaults to 875, the value
            that was previously hard-coded.
        n_targets: number of sigmoid output units. Defaults to 206, the value
            that was previously hard-coded.

    Returns:
        A Keras ``Model`` mapping the input tensor to the sigmoid outputs.
        The caller is responsible for compiling it.
    """
    X_input = Input(shape = (n_features,))
    X = BatchNormalization()(X_input)
    X = Dense(units=2048, kernel_initializer = 'he_uniform')(X)
    X = tf.keras.layers.LeakyReLU(alpha=0.2)(X)
    X = BatchNormalization()(X)
    X = Dense(units =1024,kernel_initializer = 'he_uniform')(X)
    X = tf.keras.layers.LeakyReLU(alpha=0.2)(X)
    X = Dense(units =512,kernel_initializer = 'glorot_uniform')(X)
    X = Activation('tanh')(X)
    X =Dropout(0.2)(X)
    # One independent sigmoid per output unit.
    X_output = Dense(n_targets,activation = 'sigmoid')(X)
    model = Model(inputs = X_input,outputs = X_output)
    return model
from functools import partial
# Dense-layer factory with preset defaults: ReLU activation, 'he_normal'
# kernel initialisation and L2 kernel regularisation with factor 0.0001.
RegularizedDense = partial(
    tf.keras.layers.Dense,
    activation='relu',
    kernel_initializer='he_normal',
    kernel_regularizer=tf.keras.regularizers.l2(0.0001),
)
def model2_network(n_features=875, n_targets=206):
    """Build the second (uncompiled) network from two RegularizedDense layers.

    Layer stack: BatchNormalization -> RegularizedDense(1024) ->
    RegularizedDense(1024) -> Dropout(0.2) -> Dense(n_targets, sigmoid).

    Args:
        n_features: length of one input vector. Defaults to 875, the value
            that was previously hard-coded.
        n_targets: number of sigmoid output units. Defaults to 206, the value
            that was previously hard-coded.

    Returns:
        A Keras ``Model`` mapping the input tensor to the sigmoid outputs.
        The caller is responsible for compiling it.
    """
    # The original line assigned X_input twice in a row; once is enough.
    X_input = Input(shape = (n_features,))
    X = BatchNormalization()(X_input)
    X = RegularizedDense(1024)(X)
    X = RegularizedDense(1024)(X)
    X =Dropout(0.2)(X)
    X_output = Dense(n_targets,activation = 'sigmoid')(X)
    model = Model(inputs = X_input,outputs = X_output)
    return model

In [None]:
# Build the first network and compile it with RMSprop (learning rate 0.0001)
# and a binary cross-entropy loss.
model = model_network()
model.compile(
    loss=BinaryCrossentropy(),
    optimizer=RMSprop(learning_rate=0.0001),
)

In [None]:
# Build and compile the second network with Adam's default settings.
# NOTE(review): model_final is neither fitted nor used for prediction in this
# cell; only `model` is trained below. Confirm which network is meant to
# produce the submission.
model_final = model2_network()
model_final.compile(loss=BinaryCrossentropy(), optimizer=Adam())

# Train the first network for 15 epochs with batches of 256 rows.
model.fit(x=X_train, y=y_train, batch_size=256, epochs=15)

# Test matrix with the same column order as the training matrix, then predict.
X_test = np.asarray(test_features[train].to_numpy(), dtype='float32')
predictions = model.predict(X_test)

In [None]:
# Display the array returned by model.predict for the test features.
predictions

In [None]:
# Assemble the submission frame: one row per test sample, one column per target.
# The identifiers must be those of the TEST rows. The `sig_id` series kept from
# the training targets holds training identifiers, and inserting it labelled
# the test predictions with the wrong ids. The test ids are therefore read
# from the test file, whose row order matches the rows of `predictions`.
test_sig_id = pd.read_csv('../input/lish-moa/test_features.csv', usecols=['sig_id'])['sig_id']

data = pd.DataFrame(predictions, columns=train_targets.columns)
# to_numpy() bypasses index alignment; a length mismatch raises instead of
# silently producing shifted or missing ids.
data.insert(0, 'sig_id', test_sig_id.to_numpy())
data.head()

In [None]:
# Write the submission frame to disk without the row index.
data.to_csv(path_or_buf='submission.csv', index=False)