In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _dirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Загрузим и посмотрим датасеты

In [None]:
# Single place that names the competition input directory, so the notebook can
# be pointed at another location by editing one line.
DATA_DIR = '/kaggle/input/lish-moa'

# Training features and the two target tables (scored / non-scored).
train_df = pd.read_csv(f'{DATA_DIR}/train_features.csv')
train_df_scored = pd.read_csv(f'{DATA_DIR}/train_targets_scored.csv')
train_df_noscored = pd.read_csv(f'{DATA_DIR}/train_targets_nonscored.csv')

# Test features and the submission template.
test_df = pd.read_csv(f'{DATA_DIR}/test_features.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first rows of the training feature table.
train_df.head()

In [None]:
# Print the column/dtype summary of the training feature table.
train_df.info()

In [None]:
# Descriptive statistics of the training feature table.
train_df.describe()

In [None]:
# Preview the first rows of the scored-target table.
train_df_scored.head()

In [None]:
# Print the column/dtype summary of the scored-target table.
train_df_scored.info()

# Сделаем небольшой EDA

Сначала посмотрим на признаки, которые отличаются от остальных (категориальные):

In [None]:
# Frequency of each `cp_type` value.
train_df['cp_type'].value_counts()

In [None]:
# Frequency of each `cp_dose` value.
train_df['cp_dose'].value_counts()

In [None]:
# Frequency of each `cp_time` value.
train_df['cp_time'].value_counts()

Теперь посмотрим на таргет. Сначала - сумму по столбцам, потом - по строкам

In [None]:
# Column-wise sum of every target column (all columns but the first), sorted ascending.
train_df_scored.iloc[:, 1:].sum(axis = 0).sort_values()

In [None]:
# Row-wise sum over the target columns, then how often each row total occurs.
train_df_scored.iloc[:, 1:].sum(axis = 1).value_counts()

Видим, что у большинства объектов только одна единица, но немало и тех, у которых единиц нет вообще. Посмотрим на них.

In [None]:
# Rows whose target sum is zero, counted per `cp_type` and divided by the size
# of the whole training set (shares of all rows, not of the zero-target subset).
train_df[train_df_scored.iloc[:, 1:].sum(axis = 1) == 0][['cp_type']].value_counts()/len(train_df)

In [None]:
# Share of each `cp_type` value in the whole training set, for comparison.
train_df[['cp_type']].value_counts()/len(train_df)

Видим, что если cp_type = 'ctl_vehicle', то нули во всех столбцах. От этой фичи можно избавиться.

In [None]:
# Rows whose target sum is zero, counted per `cp_dose` and divided by the size
# of the whole training set.
train_df[train_df_scored.iloc[:, 1:].sum(axis = 1) == 0][['cp_dose']].value_counts()/len(train_df)

In [None]:
# Share of each `cp_dose` value in the whole training set, for comparison.
train_df[['cp_dose']].value_counts()/len(train_df)

In [None]:
# Rows whose target sum is zero, counted per `cp_time` and divided by the size
# of the whole training set.
train_df[train_df_scored.iloc[:, 1:].sum(axis = 1) == 0][['cp_time']].value_counts()/len(train_df)

In [None]:
# Share of each `cp_time` value in the whole training set, for comparison.
train_df[['cp_time']].value_counts()/len(train_df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Per-target column sums, sorted ascending; computed once and reused for both axes.
target_totals = train_df_scored.drop('sig_id', axis = 1).sum(axis = 0).sort_values()

fig = plt.figure(figsize=(12, 60))
sns.barplot(x=target_totals.values, y=target_totals.index)

# Same tick font size on both axes; axis labels are intentionally blank.
for axis_name in ('x', 'y'):
    plt.tick_params(axis=axis_name, labelsize=15)
plt.xlabel('')
plt.ylabel('')
plt.title('Число 1 в различных таргетах', size=18, pad=18)

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise correlation between the target columns (first column skipped).
corr = train_df_scored.iloc[:, 1:].corr()
# Keep only coefficients >= 0.5; every other cell is shown as NaN.
corr[corr>=.5]

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlation between the target columns (first column skipped).
corr = train_df_scored.iloc[:, 1:].corr()

# Large canvas for the full target-by-target matrix.
f = plt.figure(figsize=(50, 50))

# Diverging palette centred on 0 with the colour scale fixed to [-1, 1].
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Rotate the x tick labels; the trailing `;` suppresses the list repr in the output.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Full correlation table of the target columns (same computation as for the heatmap).
train_df_scored.iloc[:, 1:].corr()

In [None]:
# Reset seaborn to its default theme, then switch style and context.
sns.set()

sns.set_style('whitegrid')
sns.set_context('talk')
# Matplotlib overrides applied globally to every following figure:
# wide default figure size and extra-large fonts.
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (30, 10),
          'axes.labelsize': 'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'x-large',
          'ytick.labelsize':'x-large'}

plt.rcParams.update(params)

In [None]:
# Bar chart of how many rows have 0, 1, 2, ... as their row-wise target sum.
# `sig_id` is dropped explicitly so that only the target columns are summed;
# summing the raw frame relies on pandas silently skipping the string id column,
# which newer pandas versions no longer do.
train_df_scored.drop('sig_id', axis = 1).sum(axis = 1).value_counts().plot(kind='bar', title = "Число ненулевых значений таргета", xlabel = "количество 1 в таргете");

In [None]:
# Flag (1/0) rows whose target sum is zero.
# `sig_id` is dropped explicitly so that only the target columns are summed;
# summing the raw frame relies on pandas silently skipping the string id column,
# which newer pandas versions no longer do.
train_df['is_all_targets_null'] = (train_df_scored.drop('sig_id', axis = 1).sum(axis = 1) == 0).astype(int)

In [None]:
# Preview the training table again, now with the extra `is_all_targets_null` column.
train_df.head()

In [None]:
# Number of rows with `is_all_targets_null == 1` per `cp_type` value.
train_df[['cp_type', 'is_all_targets_null']].groupby('cp_type').sum()

In [None]:
# Fraction of training rows with cp_type == 'ctl_vehicle'.
len(train_df[train_df['cp_type'] == 'ctl_vehicle'])/len(train_df)

In [None]:
# Back to the default matplotlib style before drawing the pair plot.
plt.style.use('default')
# Pair plot of `cp_time` and `g-1` on a random 1000-row sample, coloured by the
# all-targets-zero flag.
# NOTE(review): the sample is not seeded, so the figure differs between runs.
sns.pairplot(train_df[['is_all_targets_null', 'cp_time', 'g-1']].sample(n = 1000), height=2.5, hue="is_all_targets_null") 

In [None]:
# Single-column frame holding the target column names (all columns but the first).
# NOTE(review): `targets` is not used anywhere in the visible part of the notebook.
targets = pd.DataFrame(train_df_scored.iloc[:, 1:].columns)

In [None]:
# One row per target column: the column name (moved out of the index by
# reset_index) and its column-wise sum.
df_targ = pd.DataFrame(train_df_scored.iloc[:, 1:].sum(axis = 0)).reset_index()

def make_groups(x):
    """Map a target column name to a mechanism-group label.

    The name is searched for the keywords below, in order, and the label of
    the first keyword found is returned.

    Parameters
    ----------
    x : str
        Target column name, e.g. 'acetylcholine_receptor_antagonist'.

    Returns
    -------
    str
        One of 'is_inhibitor', 'is_antagonist', 'is_agonist', 'is_activator',
        'is_blocker', or 'else' when no keyword occurs in the name.
    """
    # Order matters: 'antagonist' contains the substring 'agonist', so it has to
    # be tested first, otherwise every antagonist is reported as an agonist and
    # the 'is_antagonist' label can never be returned.
    keyword_labels = (
        ('inhibitor', 'is_inhibitor'),
        ('antagonist', 'is_antagonist'),
        ('agonist', 'is_agonist'),
        ('activator', 'is_activator'),
        ('blocker', 'is_blocker'),
    )
    for keyword, label in keyword_labels:
        if keyword in x:
            return label
    return 'else'
            
            
# Name the two columns, then label every target name with its group.
df_targ.columns = ['name', 'number']
df_targ['group'] = df_targ['name'].map(make_groups)

In [None]:
# Spot-check the grouping function on one target name.
make_groups('acetylcholine_receptor_agonist')

In [None]:
# Number of target columns that fall into each group.
df_targ['group'].value_counts()

In [None]:
# Switch back to the default matplotlib style for the following plots.
plt.style.use('default')

In [None]:
# Total of the per-target sums within each group.
# Only the numeric `number` column is aggregated; summing the whole frame would
# also try to "sum" the string `name` column.
df_targ.groupby('group')[['number']].sum().plot(kind = 'bar', title = 'число 1 по типам')

In [None]:
# NOTE(review): df_targ holds one row per target column (it is built from the
# column sums of train_df_scored), whereas train_df holds one row per sample.
# The assignment aligns the two on the row index, so it looks like samples get
# the group of an unrelated target and the remaining rows get NaN — confirm
# whether this column is wanted at all.
train_df['group'] = df_targ['group']

In [None]:
# (Stray expression `i` removed: no variable `i` is defined at this point when
# the notebook is executed top to bottom, so the cell raised NameError.)

In [None]:
# Per-sample group counters, stored as extra columns of train_df_scored.
group_cols = ['is_inhibitor', 'is_agonist', 'is_antagonist', 'is_activator', 'is_blocker', 'else_col']

# Keyword searched in a target name -> counter column it contributes to.
keyword_to_group = {
    'inhibitor': 'is_inhibitor',
    'agonist': 'is_agonist',
    'antagonist': 'is_antagonist',
    'activator': 'is_activator',
    'blocker': 'is_blocker',
}

# Only genuine target columns feed the counters. The id column and the counters
# themselves are excluded: names such as 'is_inhibitor' match their own keyword
# and would be added to themselves, and a re-run of this cell would fold the
# previous results in.
target_cols = [col for col in train_df_scored.columns
               if col != 'sig_id' and col not in group_cols]

for group_col in group_cols:
    train_df_scored[group_col] = 0

for col in target_cols:
    matched = []
    for keyword, group_col in keyword_to_group.items():
        # 'antagonist' contains the substring 'agonist'; hide it before testing
        # for 'agonist' so that antagonist targets are not counted as agonists.
        haystack = col.replace('antagonist', '') if keyword == 'agonist' else col
        if keyword in haystack:
            matched.append(group_col)
    # 'else_col' is used only when no keyword matched at all.
    for group_col in matched or ['else_col']:
        train_df_scored[group_col] += train_df_scored[col]

In [None]:
# Collapse each group counter to a 0/1 indicator (1 when the count is at least 1).
for flag in ('is_inhibitor', 'is_agonist', 'is_antagonist', 'is_activator', 'is_blocker', 'else_col'):
    train_df_scored[flag] = (train_df_scored[flag] >= 1).astype(int)

In [None]:
# Copy the group indicator columns from the target table into the feature table
# (used below for colouring plots).
for i in ['is_inhibitor', 'is_agonist', 'is_antagonist', 'is_activator', 'is_blocker', 'else_col']:
    train_df[i] = train_df_scored[i]

In [None]:
# Pair plot of `g-0` and `g-1` on a random 4000-row sample, coloured by `is_inhibitor`.
# NOTE(review): the sample is not seeded, so the figure differs between runs.
sns.pairplot(train_df[['is_inhibitor', 'g-0', 'g-1']].sample(n = 4000), height=2.5, hue="is_inhibitor") 

In [None]:
# Display the `is_inhibitor` indicator column.
train_df['is_inhibitor']

In [None]:
import umap

In [None]:
# 2-D UMAP embedding (n_neighbors=5) of columns 4..103 for a random 5000-row sample.
# Presumably these columns are the first 100 g-XX features (see the plot title) —
# confirm against the column order of train_df.
# NOTE(review): neither the sample nor UMAP is seeded, so the picture changes on every run.
train_df_sample = train_df.sample(n = 5000)
embedding = umap.UMAP(n_neighbors=5).fit_transform(train_df_sample.iloc[:, 4:104])

# One scatter plot of the same embedding per group indicator, coloured by that indicator.
for i in ['is_inhibitor', 'is_agonist', 'is_antagonist', 'is_activator', 'is_blocker', 'else_col']:
    plt.figure()
    plt.scatter(embedding[:, 0],embedding[:, 1], s= 5, c=train_df_sample[i], cmap='Spectral',label=i)
    plt.title('Embedding of the g-XX features on training set by UMAP', fontsize=13);
    plt.legend();

In [None]:
# Same computation as the preceding cell, repeated on a fresh random sample:
# 2-D UMAP embedding (n_neighbors=5) of columns 4..103 for 5000 random rows.
# NOTE(review): this cell duplicates the previous one line for line; a shared
# helper function would avoid the copy.
train_df_sample = train_df.sample(n = 5000)
embedding = umap.UMAP(n_neighbors=5).fit_transform(train_df_sample.iloc[:, 4:104])

# One scatter plot of the same embedding per group indicator, coloured by that indicator.
for i in ['is_inhibitor', 'is_agonist', 'is_antagonist', 'is_activator', 'is_blocker', 'else_col']:
    plt.figure()
    plt.scatter(embedding[:, 0],embedding[:, 1], s= 5, c=train_df_sample[i], cmap='Spectral',label=i)
    plt.title('Embedding of the g-XX features on training set by UMAP', fontsize=13);
    plt.legend();

In [None]:
# 2-D UMAP embedding (n_neighbors=5) of columns 776..875 for a random 5000-row sample.
# Presumably these columns are the c-XX features (see the plot title) — confirm
# against the column order of train_df.
# NOTE(review): neither the sample nor UMAP is seeded, so the picture changes on every run.
train_df_sample = train_df.sample(n = 5000)
embedding = umap.UMAP(n_neighbors=5).fit_transform(train_df_sample.iloc[:, 776:876])

# One scatter plot of the same embedding per group indicator, coloured by that indicator.
for i in ['is_inhibitor', 'is_agonist', 'is_antagonist', 'is_activator', 'is_blocker', 'else_col']:
    plt.figure()
    plt.scatter(embedding[:, 0],embedding[:, 1], s= 5, c=train_df_sample[i], cmap='Spectral',label=i)
    plt.title('Embedding of the c-XX features on training set by UMAP', fontsize=13);
    plt.legend();

# Сделаем простенькую модель

In [None]:
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier

from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from category_encoders import CountEncoder

In [None]:
# Feature matrix: keep exactly the columns that the test set also provides.
# During the EDA train_df received helper columns computed from the targets
# ('is_all_targets_null', 'group', 'is_inhibitor', ...); feeding them to a model
# leaks the answer and they do not exist at prediction time.
# Assumes every test feature column is present in train_df — TODO confirm.
feature_cols = [col for col in test_df.columns if col != 'sig_id']
X = train_df[feature_cols]
X_test_big = test_df.drop(['sig_id'], axis = 1)

# Targets: drop the id and the engineered group indicators that the EDA added to
# train_df_scored, so that only the original target columns are predicted.
# errors='ignore' keeps the cell working when those columns were never created.
engineered_cols = ['is_inhibitor', 'is_agonist', 'is_antagonist', 'is_activator', 'is_blocker', 'else_col']
y = train_df_scored.drop('sig_id', axis = 1).drop(columns = engineered_cols, errors = 'ignore')

In [None]:
from sklearn.model_selection import train_test_split
# Count-encode the `cp_type` and `cp_dose` columns.
# NOTE(review): the encoder is fitted on all rows before the split, and `X` is
# overwritten in place, so re-running this cell encodes already-encoded values.
X = CountEncoder(cols=['cp_type', 'cp_dose']).fit_transform(X)
# Hold out 20% of the rows; the seed is fixed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Descriptive statistics of the training-split targets.
y_train.describe()

In [None]:
# Value frequencies of one target column in the training split.
y_train['5-alpha_reductase_inhibitor'].value_counts()

In [None]:
# Baseline: a 50-tree CatBoost classifier fitted on a single target column.
model = CatBoostClassifier(n_estimators = 50).fit(X_train, y_train['5-alpha_reductase_inhibitor'])

In [None]:
# Frequency of each predicted class on the hold-out split.
pd.Series(model.predict(X_test)).value_counts()

In [None]:
# Multi-output wrapper around a small CatBoost classifier (10 trees, depth 10).
classifier = MultiOutputClassifier(CatBoostClassifier(n_estimators = 10, max_depth = 10))

# Two-step pipeline: count-encode columns 0 and 2 (addressed by position), then classify.
# NOTE(review): presumably positions 0 and 2 are cp_type and cp_dose — confirm
# against the column order of X.
clf = Pipeline([('encode', CountEncoder(cols=[0, 2])),
                ('classify', classifier)
               ])

In [None]:
%%time 
# Fit the pipeline on plain NumPy arrays of the features and targets.
# NOTE(review): `X` is the full feature table, so it also contains the rows
# that were split off into X_test.
clf.fit(X.values, y.values)

In [None]:
# Class probabilities for the hold-out rows.
# The pipeline was fitted on a NumPy array and its encoder addresses columns by
# position (cols=[0, 2]), so the hold-out features are passed as an array too;
# a DataFrame with named columns does not match what the encoder was fitted on.
clf.predict_proba(X_test.values)

In [None]:
def build_train(resume_models = None, repeat_number = 0, folds = 5, skip_folds = 0, random_state = None):
    """Train one model per K-fold split and collect out-of-fold predictions.

    The function reads these names from the enclosing (module) scope:
    `x_train`, `y_train`, `tf`, `create_model`, `EPOCHS`, `BATCH_SIZE`.
    NOTE(review): of these, only `y_train` is defined in the visible part of the
    notebook (the features are named `X_train` there) — confirm that the others
    are provided before this function is called.

    Parameters
    ----------
    resume_models : any, optional
        Currently unused.
    repeat_number : int
        Only used to build the checkpoint file name.
    folds : int
        Number of K-fold splits.
    skip_folds : int
        Currently unused.
    random_state : int or None
        Seed for the shuffled K-fold split; None keeps the split unseeded.

    Returns
    -------
    (list, DataFrame)
        The fitted models (one per fold, with their best checkpoint loaded) and
        a copy of `y_train` whose rows were overwritten with the predictions
        made while the row was in the validation fold.
    """
    # Imported here because the notebook never imports KFold at the top level.
    from sklearn.model_selection import KFold

    models = []
    oof_preds = y_train.copy()

    kfold = KFold(folds, shuffle = True, random_state = random_state)

    for fold, (train_ind, val_ind) in enumerate(kfold.split(x_train)):
        print('\n')
        print('---------------------------------------------------------')
        print(f'Fold number {fold + 1}')

        # NOTE(review): the LR schedule watches 'binary_crossentropy' while the
        # checkpoint watches 'val_loss'; this presumably requires the model to
        # report a metric with that name — confirm in create_model.
        cb_lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'binary_crossentropy', factor = 0.4, patience = 2, verbose = 1, min_delta = 0.0001, mode = 'auto')
        checkpoint_path = f'repeat:{repeat_number}_Fold:{fold}.hdf5'
        cb_checkpt = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor = 'val_loss', verbose = 0, save_best_only = True, save_weights_only = True, mode = 'min')

        model = create_model()
        model.fit(x_train.values[train_ind],
              y_train.values[train_ind],
              validation_data=(x_train.values[val_ind], y_train.values[val_ind]),
              callbacks = [cb_lr_schedule, cb_checkpt],
              epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=2
             )
        # Restore the best weights saved by the checkpoint callback.
        model.load_weights(checkpoint_path)
        # KFold yields positional row numbers, so the out-of-fold rows are
        # written by position (.iloc). Label-based .loc addresses the wrong rows
        # (or fails) whenever y_train does not carry a default 0..n-1 index,
        # e.g. after a shuffled train/test split.
        oof_preds.iloc[val_ind, :] = model.predict(x_train.values[val_ind])
        models.append(model)

    return models, oof_preds