In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## EDA

In [None]:
train = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
target = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
test = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')

In [None]:
train.shape

In [None]:
train.head()

In [None]:
# Drop the `sig_id` column from both frames, in place.
# Only a missing column (KeyError) is tolerated, so the cell can be re-run;
# any other error is a real problem and is allowed to propagate instead of
# being hidden behind the "already deleted" message.
for frame in (train, target):
    try:
        frame.drop(columns=['sig_id'], inplace=True)
    except KeyError:
        print('Column already deleted!')

In [None]:
# Remove the rows whose `cp_type` is 'ctl_vehicle', then drop the column.
# Each frame is filtered with a mask built from its OWN `cp_type` column:
# a mask computed on `train` must not be reused for `test`, because pandas
# aligns a boolean Series by index label and would drop unrelated test rows.
mask = train['cp_type'] == 'ctl_vehicle'
test_mask = test['cp_type'] == 'ctl_vehicle'

# `target` has no `cp_type` column; it shares the row order of `train`,
# so the train mask is the right one for it.
train = train.loc[~mask].drop(columns='cp_type')
target = target.loc[~mask]
test = test.loc[~test_mask].drop(columns='cp_type')

print(train.shape, target.shape, test.shape)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical columns.  The encoder is fitted on train
# only and then reused for test; drop='first' removes one level per column.
cat_columns = ['cp_time', 'cp_dose']
enc = OneHotEncoder(categories='auto', drop='first')
cat_train = pd.DataFrame(enc.fit_transform(train[cat_columns]).toarray())
cat_test = pd.DataFrame(enc.transform(test[cat_columns]).toarray())

# All columns of `train` that are not one-hot encoded.
not_cat_cols = train.columns[~train.columns.isin(cat_columns)]

# The encoded frames have a fresh 0..n-1 index, so the remaining columns are
# re-indexed the same way; otherwise concat would align on the old row labels.
not_cat_train = train[not_cat_cols].reset_index(drop=True)
not_cat_test = test[not_cat_cols].reset_index(drop=True)

train_ohe = pd.concat([cat_train, not_cat_train], axis=1)
# No ignore_index here: with axis=1 it would replace the column labels by
# 0..n-1, leaving test_ohe with different column names than train_ohe.
test_ohe = pd.concat([cat_test, not_cat_test], axis=1)


# Select the feature groups by name prefix instead of by position, so the
# selection does not silently shift when the column layout changes.
g_columns = not_cat_cols[not_cat_cols.str.startswith('g-')]
c_columns = not_cat_cols[not_cat_cols.str.startswith('c-')]

g_not_cat_train = not_cat_train.loc[:, g_columns]
c_not_cat_train = not_cat_train.loc[:, c_columns]
g_not_cat_test = not_cat_test.loc[:, g_columns]
c_not_cat_test = not_cat_test.loc[:, c_columns]

In [None]:
import matplotlib.pyplot as plt

# Per-feature means of train and test, separately for the g- and c-features.
g_not_cat_train_mean, g_not_cat_test_mean = (
    frame.mean(axis=0) for frame in (g_not_cat_train, g_not_cat_test))
c_not_cat_train_mean, c_not_cat_test_mean = (
    frame.mean(axis=0) for frame in (c_not_cat_train, c_not_cat_test))

fig, ax = plt.subplots(1, 2, figsize=(16, 5))

# One panel per feature group: train and test means drawn over the same x positions.
panels = (
    (ax[0], 'g-features mean values', g_columns, g_not_cat_train_mean, g_not_cat_test_mean),
    (ax[1], 'c-features mean values', c_columns, c_not_cat_train_mean, c_not_cat_test_mean),
)
for axis, title, columns, train_mean, test_mean in panels:
    positions = np.arange(len(columns))
    axis.set_title(title)
    axis.scatter(positions, train_mean, label="train")
    axis.scatter(positions, test_mean, label="test")
    axis.legend()

plt.savefig('mean.png')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-feature means of train and test, separately for the g- and c-features.
g_not_cat_train_mean, g_not_cat_test_mean = (
    frame.mean(axis=0) for frame in (g_not_cat_train, g_not_cat_test))
c_not_cat_train_mean, c_not_cat_test_mean = (
    frame.mean(axis=0) for frame in (c_not_cat_train, c_not_cat_test))

# Test mean minus train mean; a value of zero means both sets agree on that feature.
g_diff = g_not_cat_test_mean - g_not_cat_train_mean
c_diff = c_not_cat_test_mean - c_not_cat_train_mean

fig, ax = plt.subplots(1, 2, figsize=(16, 5))
panels = (
    (ax[0], 'g-features mean values difference on test and train', g_columns, g_diff, {'s': 15}),
    (ax[1], 'c-features mean values difference on test and train', c_columns, c_diff, {}),
)
for axis, title, columns, diff, marker_options in panels:
    axis.set_title(title)
    axis.scatter(np.arange(len(columns)), diff, **marker_options)
    axis.axhline(y=0, c='red')  # reference line: no difference

plt.savefig('mean_diff.png')
plt.show()

In [None]:
# Boolean flag per test row: True when at least one c-feature of the row is
# larger than that feature's mean over the train rows.
c_test_more_arg = (c_not_cat_test > c_not_cat_train_mean).any(axis=1)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Two-component PCA (fitted on train, applied to test), drawn once for the
# c-features only and once for all non-categorical features.  Test points are
# split by the `c_test_more_arg` flag.
fig, ax = plt.subplots(1, 2, figsize=(16,7))

flagged = c_test_more_arg
not_flagged = ~c_test_more_arg

panels = (
    (ax[0], c_not_cat_train, c_not_cat_test, 'PCA on c-features'),
    (ax[1], not_cat_train, not_cat_test, 'PCA on all features'),
)
for axis, train_part, test_part, title in panels:
    pca = PCA(n_components=2)
    pca_not_cat_train = pca.fit_transform(train_part)
    pca_not_cat_test = pca.transform(test_part)

    axis.scatter(pca_not_cat_train[:, 0], pca_not_cat_train[:, 1], s=1, label='Train')
    axis.scatter(pca_not_cat_test[:, 0][flagged],
                 pca_not_cat_test[:, 1][flagged], s=3, label='Test with more mean')
    axis.scatter(pca_not_cat_test[:, 0][not_flagged],
                 pca_not_cat_test[:, 1][not_flagged], s=20, label='Test with less mean')
    axis.set_title(title)
    axis.legend()

plt.savefig('pca.png')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Two-component PCA of all non-categorical features: fitted on train, applied to test.
pca = PCA(n_components=2)
pca_not_cat_train = pca.fit_transform(not_cat_train)
pca_not_cat_test = pca.transform(not_cat_test)

# Coordinates of the red reference lines drawn on both panels.
x, y = 12, 3.5

fig, ax = plt.subplots(1, 2, figsize=(16, 5))
# Left panel: train projection; right panel: test projection.
for axis, projected in zip(ax, (pca_not_cat_train, pca_not_cat_test)):
    axis.scatter(projected[:, 0], projected[:, 1], s=1)
    axis.axvline(x=x, c='red')
    axis.axhline(y=y, c='red')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Two-component PCA of the c-features only: fitted on train, applied to test.
pca = PCA(n_components=2)
pca_not_cat_train = pca.fit_transform(c_not_cat_train)
pca_not_cat_test = pca.transform(c_not_cat_test)

# Coordinates of the red reference lines; the following cells reuse x and y.
x, y = 12, 3.5

fig, ax = plt.subplots(1, 2, figsize=(16, 5))
# Left panel: train projection; right panel: test projection.
for axis, projected in zip(ax, (pca_not_cat_train, pca_not_cat_test)):
    axis.scatter(projected[:, 0], projected[:, 1], s=1)
    axis.axvline(x=x, c='red')
    axis.axhline(y=y, c='red')
plt.show()

In [None]:
((pca_not_cat_train[:,0]>x)*(pca_not_cat_train[:,1]>y)).sum()/len(pca_not_cat_train)

In [None]:
((pca_not_cat_test[:,0]>x)*(pca_not_cat_test[:,1]>y)).sum()/len(pca_not_cat_test)

In [None]:
X, y = train_ohe, target

In [None]:
import matplotlib.pyplot as plt
# Mean absolute value of every non-categorical feature over the train rows.
mean_train = not_cat_train.abs().sum(axis=0) / len(not_cat_train)

feature_positions = np.arange(len(mean_train))
fig, ax = plt.subplots(figsize=(15,5))
ax.scatter(feature_positions, mean_train, s=6)
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Variance of every non-categorical feature over the train rows.
# `var_train` has to hold the variance and be the series that is plotted;
# re-plotting the mean absolute values would just repeat the figure above.
var_train = not_cat_train.var(axis=0)
fig, ax = plt.subplots(figsize=(15,5))
ax.scatter(np.arange(len(var_train)), var_train, s=6)
ax.set_title('Per-feature variance (train)')
plt.show()

In [None]:
X.head()

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10,10))
data = X['g-100']
plt.hist(data, bins=100)
plt.show()

In [None]:
import statsmodels.api as sm
import patsy as pt
import sklearn.linear_model as lm
from sklearn.metrics import log_loss

# Create an empty (unfitted) linear model.
skm = lm.LinearRegression()
# Standardize the features, then estimate the model parameters on them.
scaler = StandardScaler()
X_n = scaler.fit_transform(X)
skm.fit(X_n, y)
pred = skm.predict(X_n)

# Score with the mean binary cross-entropy over all (row, target) cells.
# `log_loss(y, pred) / n_targets` is not used: with a multi-label `y` it is
# not a per-target binary log loss.
# Linear-regression outputs are unbounded, so they are clipped into (0, 1)
# before taking logarithms.
# NOTE: this is an in-sample score (same rows used for fitting and scoring).
eps = 1e-15
y_true = np.asarray(y, dtype=float)
pred_clipped = np.clip(pred, eps, 1 - eps)
cell_loss = -(y_true * np.log(pred_clipped) + (1 - y_true) * np.log(1 - pred_clipped))

print(cell_loss.mean())

In [None]:
import matplotlib.pyplot as plt
# Importance of a feature = absolute coefficient averaged over axis 0 of
# `skm.coef_` (presumably one row per target - confirm against the fitted model).
imp = np.abs(skm.coef_).mean(axis=0)
n_features = len(imp)

# Reversed positions put the first column of X at the top of the chart.
bar_positions = np.arange(n_features)[::-1]

fig, ax = plt.subplots(figsize=(20,160))
ax.barh(bar_positions, imp)
ax.set_yticks(bar_positions)
ax.set_yticklabels(X.columns)
ax.set_ylim(0, n_features)
ax.axvline(x=imp.mean(), c='red')  # average importance as a reference
plt.show()

In [None]:
# Positions of the 20 largest importances, largest first.
descending_order = np.argsort(imp)[::-1]
imp_arg = descending_order[:20]
print(X.columns[imp_arg])

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(16, 10))
corrMatrix = X[X.columns[imp_arg]].corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
X.describe()

## Feature engineering and modelling

In [None]:
train = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
target = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
test = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')

mask = train.cp_type != 'ctl_vehicle'
train.drop(columns=['cp_type'], inplace=True)
train = train.loc[mask]
target = target.loc[mask]

In [None]:
train.head()

In [None]:
target.head()

In [None]:
# Drop the `sig_id` column from both frames, in place.
# Only a missing column (KeyError) is tolerated, so the cell can be re-run;
# any other error is a real problem and is allowed to propagate instead of
# being hidden behind the "already deleted" message.
for frame in (train, target):
    try:
        frame.drop(columns=['sig_id'], inplace=True)
    except KeyError:
        print('Column already deleted!')

In [None]:
from sklearn.preprocessing import OneHotEncoder
cat_columns = ['cp_dose']
cat_train = train[cat_columns]
enc = OneHotEncoder(categories='auto', drop='first')
cat_train = enc.fit_transform(cat_train).toarray()
cat_train = pd.DataFrame(cat_train)
not_cat_cols = train.columns[train.columns.isin(cat_columns) != True]
not_cat_train = train[not_cat_cols]
not_cat_train.index = cat_train.index
train_ohe = pd.concat([cat_train, not_cat_train], axis=1)

mask = test.cp_type != 'ctl_vehicle'
test.drop(columns=['cp_type'], inplace=True)
pred_ind = test[mask].index
cat_test = test.iloc[pred_ind][cat_columns]
cat_test = enc.transform(cat_test).toarray()
cat_test = pd.DataFrame(cat_test)
not_cat_cols = train.columns[train.columns.isin(cat_columns) != True]
not_cat_test = test.iloc[pred_ind][not_cat_cols]
not_cat_test.index = cat_test.index
test_ohe = pd.concat([cat_test, not_cat_test], axis=1)

g_columns = not_cat_train.columns[1:772].to_list()
c_columns = not_cat_train.columns[-100:].to_list()

In [None]:
train.head()

In [None]:
g_pca = PCA(n_components=70).fit_transform(
    pd.concat((train_ohe[g_columns], test_ohe[g_columns])).values)
g_pca_train = pd.DataFrame(g_pca[:train_ohe.shape[0]], index=train_ohe.index)
g_pca_test = pd.DataFrame(g_pca[train_ohe.shape[0]:], index=test_ohe.index)

In [None]:
c_pca = PCA(n_components=10).fit_transform(
    pd.concat((train_ohe[c_columns], test_ohe[c_columns])).values)
c_pca_train = pd.DataFrame(c_pca[:train.shape[0]], index=train_ohe.index)
c_pca_test = pd.DataFrame(c_pca[train.shape[0]:], index=test_ohe.index)

In [None]:
train_ohe.shape, c_pca_train.shape, g_pca_train.shape, 

In [None]:
# Append the g- and c-feature PCA components as extra columns.
train_features = pd.concat((train_ohe, g_pca_train, c_pca_train), axis='columns')
test_features = pd.concat((test_ohe, g_pca_test, c_pca_test), axis='columns')

In [None]:
def add_statistics(df, g_cols=None, c_cols=None):
    """Add row-wise summary statistics of the g- and c-feature groups to `df`.

    For each of the three column groups (g, c, and g+c combined) five new
    columns are appended, in this order: `<prefix>_sum`, `<prefix>_mean`,
    `<prefix>_std`, `<prefix>_kurt`, `<prefix>_skew`, where prefix is
    'g', 'c' or 'gc'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns; modified in place.
    g_cols, c_cols : sequence of column labels, optional
        Columns of each group.  When omitted, the module-level `g_columns`
        and `c_columns` are used, which is the original behaviour.

    Returns
    -------
    None
    """
    if g_cols is None:
        g_cols = g_columns
    if c_cols is None:
        c_cols = c_columns
    # Plain lists: `+` on two pandas Index objects is element-wise, not a
    # concatenation, so the combined group must be built from lists.
    g_cols, c_cols = list(g_cols), list(c_cols)

    groups = (('g', g_cols), ('c', c_cols), ('gc', g_cols + c_cols))
    for prefix, cols in groups:
        # Selected before any statistic of this group is added to `df`.
        block = df[cols]
        df[prefix + '_sum'] = block.sum(axis=1)
        df[prefix + '_mean'] = block.mean(axis=1)
        df[prefix + '_std'] = block.std(axis=1)
        df[prefix + '_kurt'] = block.kurtosis(axis=1)
        df[prefix + '_skew'] = block.skew(axis=1)

In [None]:
add_statistics(train_features)
add_statistics(test_features)

In [None]:
train_features.head()

In [None]:
X_train_all = train_features.values
y_train_all = target.values
X_test = test_features.values

In [None]:
scaler = StandardScaler()
X_train_all = scaler.fit_transform(X_train_all)
X_test = scaler.transform(X_test)

In [None]:
X_train_all.shape, y_train_all.shape

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2, random_state=35)

train_all_dataset = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
train_all_loader = DataLoader(train_all_dataset, batch_size=128)

train_dataset = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
val_dataset = TensorDataset(torch.tensor(X_val).float(), torch.tensor(y_val).float())

train_loader = DataLoader(train_dataset, batch_size=128)
val_loader = DataLoader(val_dataset, batch_size=128)

In [None]:
x, y = next(iter(train_loader))
x.shape, y.shape

### FFNN Model

In [None]:
class FFNN(nn.Module):
    """Feed-forward network built from three BatchNorm -> Dropout -> Linear stages.

    The two hidden stages (2048 and 1024 units) are followed by ELU; the
    output stage is followed by a sigmoid, so every output lies in (0, 1).
    All linear layers are wrapped in weight normalisation.
    """

    def __init__(self, input_size, output_size):
        """Create the layers for `input_size` inputs and `output_size` outputs."""
        super().__init__()
        hidden_wide, hidden_narrow = 2048, 1024

        # Stage 1: input -> 2048 (lighter dropout on the raw inputs).
        self.bn1 = nn.BatchNorm1d(input_size)
        self.dropout1 = nn.Dropout(0.2)
        self.l1 = nn.utils.weight_norm(nn.Linear(input_size, hidden_wide))

        # Stage 2: 2048 -> 1024.
        self.bn2 = nn.BatchNorm1d(hidden_wide)
        self.dropout2 = nn.Dropout(0.5)
        self.l2 = nn.utils.weight_norm(nn.Linear(hidden_wide, hidden_narrow))

        # Stage 3: 1024 -> output.
        self.bn3 = nn.BatchNorm1d(hidden_narrow)
        self.dropout3 = nn.Dropout(0.5)
        self.l3 = nn.utils.weight_norm(nn.Linear(hidden_narrow, output_size))

    def forward(self, x):
        """Return the sigmoid outputs for the batch `x`."""
        hidden_stages = (
            (self.bn1, self.dropout1, self.l1),
            (self.bn2, self.dropout2, self.l2),
        )
        for norm, drop, linear in hidden_stages:
            x = F.elu(linear(drop(norm(x))))

        return torch.sigmoid(self.l3(self.dropout3(self.bn3(x))))

In [None]:
model = FFNN(969, 206)

In [None]:
model(x).shape

In [None]:
def train_model(model, optimizer, loss_function, train_loader,
                val_loader=None, scheduler=None, epochs=1, device=None):
    """Train `model` for `epochs` epochs and print the loss after each epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Model to optimise; switched to train mode for the training batches
        and to eval mode for the validation pass.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model parameters.
    loss_function : callable
        Called as `loss_function(prediction, target)`; must return a scalar tensor.
    train_loader : iterable with `len()`
        Yields `(x, y)` training batches.
    val_loader : iterable with `len()`, optional
        Yields `(x, y)` validation batches; when given, the mean validation
        loss is printed next to the training loss.
    scheduler : optional
        Learning-rate scheduler; `scheduler.step()` is called once per epoch.
    epochs : int
        Number of passes over `train_loader`.
    device : torch.device or str, optional
        Device the batches are moved to.  Defaults to the device of the
        model's first parameter, so the function does not depend on a
        module-level `device` variable.

    Returns
    -------
    None
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(epochs):
        # Re-enter train mode each epoch: the validation pass below leaves the model in eval mode.
        model.train()
        running_loss = 0.0
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            y_pred = model(x)
            loss = loss_function(y_pred, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # Mean loss per batch; max() keeps an empty loader from dividing by zero.
        running_loss /= max(len(train_loader), 1)

        if val_loader is not None:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for x, y in val_loader:
                    x = x.to(device)
                    y = y.to(device)
                    y_pred = model(x)
                    val_loss += loss_function(y_pred, y).item()
            val_loss /= max(len(val_loader), 1)

            print("Epoch: [{}/{}] ".format(epoch + 1, epochs),
                  "Train loss: {:.6f}".format(running_loss),
                  "Val loss: {:.6f} ".format(val_loss))
        else:
            print("Epoch: [{}/{}] ".format(epoch + 1, epochs),
                  "Train loss: {:.6f}".format(running_loss))

        if scheduler is not None:
            scheduler.step()

In [None]:
device = 'cuda'
loss_function = nn.BCELoss()
model = FFNN(969, 206).to(device)
optimizer = optim.Adam(lr=0.001, params=model.parameters(), weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer,10, gamma=0.5, last_epoch=-1)

In [None]:
train_model(model, optimizer, loss_function, train_loader, val_loader, epochs=50, scheduler=scheduler)

In [None]:
# Train a fresh model with the same settings, this time on `train_all_loader`
# and without a validation pass.  Layer sizes are read from the data rather
# than hard-coded, so they follow any change to the feature set.
loss_function = nn.BCELoss()
model = FFNN(X_train_all.shape[1], y_train_all.shape[1]).to(device)
optimizer = optim.Adam(lr=0.001, params=model.parameters(), weight_decay=1e-5)
# Halve the learning rate every 10 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer,10, gamma=0.5, last_epoch=-1)
train_model(model, optimizer, loss_function, train_all_loader, epochs=50, scheduler=scheduler)

In [None]:
def predict(model, X):
    """Run `model` on `X` in evaluation mode and return the outputs as a NumPy array.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate; it is left in eval mode.
    X : torch.Tensor
        Input batch.  It is moved to the device of the model's first
        parameter, so no module-level `device` variable is required.

    Returns
    -------
    numpy.ndarray
        The raw model outputs, computed without gradient tracking.
    """
    model.eval()
    first_param = next(model.parameters(), None)

    with torch.no_grad():
        if first_param is not None:
            X = X.to(first_param.device)
        preds = model(X)
    return preds.cpu().numpy()

In [None]:
y_pred = predict(model, torch.tensor(X_test).float())

In [None]:
submission = pd.DataFrame(np.zeros((test.shape[0], target.shape[1])),
                         index=test.index, columns=target.columns)
sample_submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [None]:
submission.iloc[pred_ind] = predict(model, torch.tensor(X_test).float())
submission = pd.concat([test[['sig_id']], submission], axis=1)
submission

In [None]:
submission.to_csv('/kaggle/working/submission.csv', index=False)