In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import skew
import pylab 
import itertools

from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras.models import Sequential
from keras.layers import BatchNormalization
from keras.layers.core import Dense, Flatten, Dropout, Lambda
from keras.callbacks import ReduceLROnPlateau

from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Data loading

In [None]:
# Read the four competition tables from the Kaggle input directory:
# the training inputs with their scored targets, then the test inputs and
# the submission frame that is filled in at the end of the notebook.
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [None]:
# Preview the first rows of the test inputs.
test_features.head()

In [None]:
# Preview the first rows of the training inputs.
train_features.head()

In [None]:
# Compare the two splits: row counts first, then column counts
# (the training value is printed before the test value in both cases).
for axis in (0, 1):
    print(train_features.shape[axis])
    print(test_features.shape[axis])

# Ratio of training rows to test rows, shown as the cell output.
len(train_features) / len(test_features)

# Missing values: share of nulls per column

In [None]:
# Share of null entries in every training column ...
missing_vals_train = train_features.isnull().sum().div(len(train_features))
# ... restricted to the columns that have any nulls, worst first.
missing_vals_train.loc[missing_vals_train.gt(0)].sort_values(ascending=False)

In [None]:
# Share of null entries in every test column ...
missing_vals_test = test_features.isnull().sum().div(len(test_features))
# ... restricted to the columns that have any nulls, worst first.
missing_vals_test.loc[missing_vals_test.gt(0)].sort_values(ascending=False)

In [None]:
# Column dtypes and memory footprint of the training inputs.
train_features.info()

In [None]:
# Column dtypes and memory footprint of the test inputs.
test_features.info()

# Categorical variables

In [None]:
# Store cp_time as an object column in both splits so that it is picked up
# together with the other categorical (object-typed) inputs below.
train_features['cp_time'] = train_features['cp_time'].astype('object')
test_features['cp_time'] = test_features['cp_time'].astype('object')

In [None]:
# Keep only the object-typed (categorical) columns of each split.
train_features_object = train_features.select_dtypes(include='object')
test_features_object = test_features.select_dtypes(include='object')

In [None]:
# Size of the categorical part of the training inputs, followed by a preview.
print(train_features_object.shape)
train_features_object.head()

## cp_type

In [None]:
# Count the signatures per cp_type level in each split and stack the two
# summaries so they can be drawn in one chart.
train_features_object_group_cp_type = (
    train_features_object
    .groupby('cp_type')
    .aggregate({'sig_id': 'count'})
    .reset_index()
)
# A scalar is broadcast to every row, so the label does not have to be spelled
# out once per category (a fixed two-element list raises as soon as the
# number of categories is not exactly two).
train_features_object_group_cp_type['train/test'] = 'train'

test_features_object_group_cp_type = (
    test_features_object
    .groupby('cp_type')
    .aggregate({'sig_id': 'count'})
    .reset_index()
)
test_features_object_group_cp_type['train/test'] = 'test'

group_cp_type = pd.concat([train_features_object_group_cp_type, test_features_object_group_cp_type])
group_cp_type.head()

In [None]:
# Signature counts per cp_type level, one colour per split.
fig = px.bar(
    group_cp_type,
    x="cp_type",
    y="sig_id",
    color="train/test",
    title="cp_type",
)
fig.show()

## cp_dose

In [None]:
# Count the signatures per cp_dose level in each split and stack the two
# summaries so they can be drawn in one chart.
train_features_object_group_cp_dose = (
    train_features_object
    .groupby('cp_dose')
    .aggregate({'sig_id': 'count'})
    .reset_index()
)
# The training counts must carry the 'train' label (they were previously tagged
# 'test', and the test counts 'train', which swapped the legend of the chart).
# A scalar is broadcast to every row, whatever the number of dose levels.
train_features_object_group_cp_dose['train/test'] = 'train'

test_features_object_group_cp_dose = (
    test_features_object
    .groupby('cp_dose')
    .aggregate({'sig_id': 'count'})
    .reset_index()
)
test_features_object_group_cp_dose['train/test'] = 'test'

group_cp_dose = pd.concat([train_features_object_group_cp_dose, test_features_object_group_cp_dose])
group_cp_dose.head()

In [None]:
# Signature counts per cp_dose level, one colour per split.
fig = px.bar(
    group_cp_dose,
    x="cp_dose",
    y="sig_id",
    color="train/test",
    title="cp_dose",
)
fig.show()

## cp_time

In [None]:
# Count the signatures per cp_time level in each split and stack the two
# summaries so they can be drawn in one chart.
train_features_object_group_cp_time = (
    train_features_object
    .groupby('cp_time')
    .aggregate({'sig_id': 'count'})
    .reset_index()
)
# A scalar is broadcast to every row, so the label does not have to be spelled
# out once per category (a fixed three-element list raises as soon as the
# number of categories is not exactly three).
train_features_object_group_cp_time['train/test'] = 'train'

test_features_object_group_cp_time = (
    test_features_object
    .groupby('cp_time')
    .aggregate({'sig_id': 'count'})
    .reset_index()
)
test_features_object_group_cp_time['train/test'] = 'test'

group_cp_time = pd.concat([train_features_object_group_cp_time, test_features_object_group_cp_time])
group_cp_time.head()

In [None]:
# Signature counts per cp_time level, one colour per split.
fig = px.bar(
    group_cp_time,
    x="cp_time",
    y="sig_id",
    color="train/test",
    title="cp_time",
)
fig.show()

# Distribution of numerical data

In [None]:
# Keep only the float64 / int64 columns of each split.
numeric_dtypes = ['float64', 'int64']
train_features_number = train_features.select_dtypes(include=numeric_dtypes)
test_features_number = test_features.select_dtypes(include=numeric_dtypes)

In [None]:
# Draw 10 column positions at random (repeats are possible) and look at the
# distribution of the corresponding numerical features.
random_positions = np.random.randint(0, train_features_number.shape[1], 10)
train_features_random = train_features_number.iloc[:, random_positions]

train_features_random.hist(bins=40, figsize=(20, 15))
plt.show()

# Skewed data

Compute the skewness of each numerical feature. For normally distributed data, the skewness should be about 0. A skewness value > 0 means that there is more weight in the right tail of the distribution.

In [None]:
# Skewness of every numerical training column ...
skewed_feats = train_features_number.apply(skew)
# ... keeping only the columns whose skewness exceeds 0.75.
skewed_feats = skewed_feats.loc[skewed_feats.gt(0.75)]
skewed_feats_index = skewed_feats.index

In [None]:
# Draw 10 of the skewed columns at random (repeats are possible).
skewed_frame = train_features_number[skewed_feats_index]
skewed_positions = np.random.randint(0, skewed_frame.shape[1], 10)
random_g_skewed_feats = skewed_frame.iloc[:, skewed_positions]

In [None]:
# Preview the randomly drawn skewed features.
random_g_skewed_feats.head()

In [None]:
sns.set(rc={'figure.figsize': (20, 10)})

# Normal Q-Q plots in the top row of a 2x5 grid. Panel k shows the column at
# position k of the sample, i.e. positions 1..5; position 0 is not plotted.
for panel in range(1, 6):
    plt.subplot(2, 5, panel)
    stats.probplot(random_g_skewed_feats.iloc[:, panel], dist="norm", plot=pylab)
pylab.show()

In [None]:
# Reshape the sampled skewed features into long format: one row per
# (feature, observation) pair, walking the columns from left to right.
rows_per_feature = len(random_g_skewed_feats)

df = pd.DataFrame({
    # all values of the first column, then all values of the second, ...
    'g-': random_g_skewed_feats.to_numpy().ravel(order='F'),
    # the matching column label, repeated once per observation
    'indax_g': np.repeat(random_g_skewed_feats.columns.to_numpy(), rows_per_feature),
})

In [None]:
# Preview the long-format frame of sampled skewed features.
df.head()

# PCA

In [None]:
# Fit a full PCA on the skewed features and find how many leading components
# are needed for the cumulative explained-variance ratio to reach 0.95.
pca = PCA().fit(train_features_number[skewed_feats_index])
cumsum = pca.explained_variance_ratio_.cumsum()
d = 1 + np.argmax(cumsum >= 0.95)

In [None]:
# Cumulative explained variance, with dotted guides marking the point where
# the 0.95 threshold is reached.
pca_fig, pca_ax = plt.subplots(figsize=(10, 7))
pca_ax.plot(cumsum, linewidth=3)
pca_ax.axis([0, 400, 0, 1])
pca_ax.set_xlabel("Dimensions")
pca_ax.set_ylabel("Explained Variance")
pca_ax.plot([d, d], [0, 0.95], "k:")
pca_ax.plot([0, d], [0.95, 0.95], "k:")
pca_ax.plot(d, 0.95, "ko")
plt.show()

# Pipeline

In [None]:
# Remove the sig_id identifier column, in place, from every frame that is
# used for modelling.
for frame in (train_features, test_features, train_features_object, train_targets_scored):
    del frame['sig_id']

In [None]:
# Categorical inputs: every column of the object-typed training frame.
cat_attr = train_features_object.columns

# Plain numerical inputs: all numerical columns except the skewed ones, which
# go through their own pipeline branch.
num_attr = np.array(train_features_number.columns)
num_attr = num_attr[~np.isin(num_attr, np.array(skewed_feats_index))]

# Keep 200 of the skewed columns for the PCA branch (the count matches
# PCA(n_components=200) in the pipeline definition).
# The draw is seeded so a re-run selects the same columns, and it is made
# without replacement so that no column is fed to the PCA twice; replacement is
# only used as a fallback when fewer than 200 skewed columns exist, which keeps
# the branch at 200 input columns.
skewed_rng = np.random.RandomState(42)
n_available_skewed = len(skewed_feats_index)
n_skewed_PCA = skewed_rng.choice(
    n_available_skewed,
    size=200,
    replace=n_available_skewed < 200,
)
skewed_feats_index = skewed_feats_index[n_skewed_PCA]

In [None]:
# NOTE: train_test_split returns its pieces in the order
# (features-train, features-holdout, targets-train, targets-holdout), so the
# names below do not describe their contents:
#   X_train -> training features      y_train -> hold-out features
#   X_test  -> training targets       y_test  -> hold-out targets
# The later cells rely on exactly this assignment (e.g. fit(X_train, X_test)),
# so the names have to be changed everywhere at once or not at all.
X_train, y_train, X_test, y_test = train_test_split(train_features, train_targets_scored, test_size=0.3, random_state=42)

In [None]:
# Re-wrap both feature splits as DataFrames with the feature column labels.
# Despite its name, y_train holds the hold-out *features* here (see the
# unpacking of train_test_split), which is why it gets the feature columns.
X_train = pd.DataFrame(X_train, columns=train_features.columns)
y_train = pd.DataFrame(y_train, columns=train_features.columns)

In [None]:
# Transformer that picks a fixed set of columns (numerical or categorical) out of a DataFrame
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the configured columns of a DataFrame and return them as an array."""

    def __init__(self, attribute_names):
        # Labels of the columns that transform() keeps.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the columns named in ``attribute_names`` via ``.values``."""
        selected = X[self.attribute_names]
        return selected.values
    
    
# Transformer applied at the end of the skewed-feature branch: raises 2 to the
# power of every input value (element-wise np.exp2).
class Skewness_numericalSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps every element x of its input to 2**x."""
        
    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self
    
    def transform(self, X):
        """Return ``np.exp2(X)``, i.e. 2**x for every element of ``X``."""
        return np.exp2(X)

In [None]:
# Categorical branch: pick the categorical columns and one-hot encode them.
old_cat_pipeline = Pipeline(steps=[
    ('selector', OldDataFrameSelector(cat_attr)),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])

# Plain numerical branch: pick the non-skewed numerical columns and standardise them.
old_num_pipeline = Pipeline(steps=[
    ('selector', OldDataFrameSelector(num_attr)),
    ('scaler', StandardScaler()),
])

# Skewed branch: pick the sampled skewed columns, rescale them with
# MinMaxScaler, project onto 200 principal components and apply 2**x.
skew_num_pipeline = Pipeline(steps=[
    ('selector', OldDataFrameSelector(skewed_feats_index)),
    ('scalermm', MinMaxScaler()),
    ('PCA', PCA(n_components=200)),
    ('skew_scaler', Skewness_numericalSelector()),
])

# Concatenate the three branches column-wise: categorical, numerical, skewed.
old_full_pipeline = FeatureUnion(transformer_list=[
    ('cat_pipeline', old_cat_pipeline),
    ('num_pipeline', old_num_pipeline),
    ('skew_pipeline', skew_num_pipeline),
])


In [None]:
# Learn every preprocessing statistic (scaler parameters, one-hot categories,
# PCA components) from the training features only ...
X_train = old_full_pipeline.fit_transform(X_train)
# ... and re-use that fitted pipeline for the hold-out features (stored in
# y_train, see the train_test_split unpacking) and for the test features.
# Calling fit_transform here would refit the pipeline on each data set, so the
# hold-out and test columns would no longer mean the same thing as the columns
# the models are trained on.
y_train = old_full_pipeline.transform(y_train)
test_features = old_full_pipeline.transform(test_features)

# XGBClassifier

In [None]:
# Show the GPU status of the machine; the XGBoost parameters below request
# tree_method 'gpu_hist'.
!nvidia-smi

In [None]:
# Hyper-parameters shared by the per-target XGBoost models.
param = dict(
    n_estimators=166,
    learning_rate=0.0503,
    subsample=0.8639,
    max_depth=10,
    colsample_bytree=0.6522,
    min_child_weight=31,
    tree_method='gpu_hist',
)

# One XGBoost classifier per target column. Because of how the split was
# unpacked, X_test holds the *training targets* that belong to X_train.
xgb_base = XGBClassifier(**param)
classifier = MultiOutputClassifier(estimator=xgb_base)
classifier.fit(X_train, X_test)

In [None]:
# predict_proba yields one probability array per target; take the second
# probability column (index 1) of each and lay the targets out as columns.
per_target_proba = classifier.predict_proba(test_features)
predictions_XGBC = np.column_stack([proba[:, 1] for proba in per_target_proba])

# Model DNN

In [None]:
def ret(a):
    """Identity function: return the argument unchanged."""
    return  a 

In [None]:
#del model

In [None]:
# (units, activation) of the hidden Dense layers, in order; each one is
# followed by a BatchNormalization layer.
hidden_layers = [
    (879, 'relu'),
    (800, 'relu'),
    (700, 'relu'),
    (600, 'relu'),
    (500, 'elu'),
    (400, 'elu'),
    (300, 'elu'),
]

model = Sequential()

# Pass-through input layer; 795 is the number of input columns the network is
# built for and has to match the width of the arrays given to fit/predict.
model.add(Lambda(ret, input_shape=[795]))

for units, activation in hidden_layers:
    model.add(Dense(units, activation=activation))
    model.add(BatchNormalization())

# Output layer: 206 sigmoid units.
model.add(Dense(206, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

In [None]:
# Learning-rate schedule callback driven by the validation loss
# (factor 0.2, patience 2).
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)

In [None]:
# Train for 8 epochs. Because of how the split was unpacked, X_train / X_test
# are the training features / targets, and y_train / y_test are the hold-out
# features / targets used as validation data.
model_fit = model.fit(X_train, X_test, validation_data=(y_train, y_test), epochs = 8, callbacks=[reduce_lr]) 

In [None]:
sns.set(rc={'figure.figsize': (15, 10)})

# One figure per metric: training curve against the validation curve.
for train_key, val_key in (('loss', 'val_loss'), ('accuracy', 'val_accuracy')):
    plt.plot(model_fit.history[train_key], label='train')
    plt.plot(model_fit.history[val_key], label='test')
    plt.legend()
    plt.show()

# predictions

In [None]:
# Network predictions for the (already pipeline-transformed) test features.
predictions_DNN = model.predict(test_features)

In [None]:
# Blend the two models: element-wise average of the DNN and XGBoost predictions.
predictions = np.stack((predictions_DNN, predictions_XGBC)).mean(axis=0)

In [None]:
# Overwrite every column after the first with the blended predictions
# (the first column of the submission frame is left untouched).
sample_submission.iloc[:,1:] = predictions

In [None]:
# Preview the filled-in submission frame.
sample_submission.head()

In [None]:
# Write the submission file with a header row and without the index column.
sample_submission.to_csv("submission.csv", index=False, header=True) 