In [1]:
%%writefile inference.py


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from cuml.svm import SVC

#from tqdm.notebook import tqdm
from tqdm import tqdm

Writing inference.py


# Load data

In [2]:
%%writefile -a inference.py


# Read the three CSV inputs (training features, scored training targets and
# test features) in one pass; each path yields one DataFrame.
train_features, train_targets, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/test_features.csv',
    )
)

Appending to inference.py


# Preprocess data

In [3]:
%%writefile -a inference.py


def preprocess_features(df):
    """Return a numerically encoded copy of a feature table.

    The ``cp_dose``, ``cp_type`` and ``cp_time`` columns are replaced by
    integer codes via a lookup table, and the ``sig_id`` column is removed.
    The caller's frame is not modified. Any value absent from a lookup table
    becomes NaN, which is the normal behaviour of ``Series.map``.
    """
    encodings = {
        'cp_dose': {'D1': 0, 'D2': 1},
        'cp_type': {'trt_cp': 0, 'ctl_vehicle': 1},
        'cp_time': {24: 0, 48: 1, 72: 2},
    }
    encoded = df.copy()
    for column, codes in encodings.items():
        encoded[column] = encoded[column].map(codes)
    return encoded.drop(columns=['sig_id'])
    
# Encode the train and test feature tables with the same routine, and drop
# the identifier column from the targets table.
train_features, test_features = (
    preprocess_features(frame) for frame in (train_features, test_features)
)
train_targets = train_targets.drop(columns=['sig_id'])

Appending to inference.py


# Train model

In [4]:
%%writefile -a inference.py

# Fit the standardisation on the training features only, then apply the same
# fitted transform to both the training and the test features.
scaler = StandardScaler().fit(train_features)
X = scaler.transform(train_features)
X_test = scaler.transform(test_features)

Appending to inference.py


In [5]:
%%writefile -a inference.py

# First stage: one SVM per target column, trained with repeated stratified
# K-fold. Out-of-fold (OOF) decision scores are collected for the training
# rows and fold/seed-averaged decision scores for the test rows.
target_columns = train_targets.columns

# Float-typed accumulators. Building the OOF frame from the (integer) targets
# and then adding float scores relies on silent dtype upcasting, which pandas
# has deprecated; start from floats instead.
val_predictions = pd.DataFrame(0.0, index=train_targets.index, columns=target_columns)

test_predictions = pd.read_csv('../input/lish-moa/sample_submission.csv')
test_predictions.loc[:, target_columns] = 0.0

FOLDS = 5
SEEDS = 3

progress_bar = tqdm(range(train_targets.shape[1]))
for target_column_idx in progress_bar:

    target_name = target_columns[target_column_idx]
    train_targets_column = train_targets[target_name].values

    # If target column has less non-zero labels than folds, just skip the
    # column; its predictions stay at zero.
    if train_targets_column.sum() >= FOLDS:

        # Accumulate in plain arrays so the positional fold indices returned
        # by StratifiedKFold are used positionally (not as index labels).
        oof_scores = np.zeros(X.shape[0])
        test_scores = np.zeros(X_test.shape[0])

        for seed in range(SEEDS):

            skf = StratifiedKFold(n_splits = FOLDS, random_state = seed, shuffle = True)
            for train_idx, val_idx in skf.split(train_targets_column, train_targets_column):

                model = SVC(C = 40, cache_size = 2000)
                model.fit(X[train_idx], train_targets_column[train_idx])

                # Each row is in the validation fold once per seed, hence the
                # division by SEEDS; the test set is scored by every model.
                oof_scores[val_idx] += model.decision_function(X[val_idx]) / SEEDS
                test_scores += model.decision_function(X_test) / (FOLDS*SEEDS)

        val_predictions[target_name] = oof_scores
        test_predictions[target_name] = test_scores

    # NOTE(review): the values scored here are SVM decision-function margins,
    # not probabilities, so this figure is only a rough progress indicator.
    # labels=[0, 1] matches log_loss_metric and keeps the call from failing
    # when a target column contains a single class.
    score = log_loss(train_targets[target_name], val_predictions[target_name], labels = [0, 1])
    progress_bar.set_description(f"Processing {target_column_idx+1} score: {score:.4f}")
    

Appending to inference.py


In [6]:
%%writefile -a inference.py


def log_loss_metric(y_true, y_pred):
    """Mean per-target binary log loss.

    For every column name of the module-level ``train_targets`` frame, the
    log loss between ``y_true[col]`` and ``y_pred[col]`` (cast to float) is
    computed with the label set fixed to ``[0, 1]``. The return value is the
    arithmetic mean of those per-column losses.
    """
    per_target = [
        log_loss(y_true.loc[:, name], y_pred.loc[:, name].astype(float), labels = [0,1])
        for name in train_targets.columns
    ]
    return np.mean(per_target)

Appending to inference.py


In [7]:
%%writefile -a inference.py

# Report the OOF metric, force the predictions of every row whose encoded
# cp_type equals 1 (the code assigned to 'ctl_vehicle') to zero in both the
# validation and test frames, then report the metric again.
print(f'SVM OOF before postprocessing: {log_loss_metric(train_targets, val_predictions)}')
train_is_control = train_features['cp_type'] == 1
test_is_control = test_features['cp_type'] == 1
val_predictions.loc[train_is_control, train_targets.columns] = 0
test_predictions.loc[test_is_control, train_targets.columns] = 0
print(f'SVM OOF after postprocessing: {log_loss_metric(train_targets, val_predictions)}')

Appending to inference.py


In [8]:
%%writefile -a inference.py

# Second-stage (stacking) inputs: first-stage scores as plain arrays.
# The columns are taken in train_targets order because the second-stage loop
# addresses them by the positional index of train_targets.columns; selecting
# by that order keeps both arrays aligned with the targets and leaves out the
# 'sig_id' column of the test frame.
cols = list(train_targets.columns)
X_val_p = val_predictions[cols].values
X_test_p = test_predictions[cols].values

# Float-typed output frames. The validation frame is built from scratch
# rather than copied from the integer targets, so that adding probabilities
# to it later does not depend on silent dtype upcasting.
reg_test_pred = pd.read_csv('../input/lish-moa/sample_submission.csv')
reg_test_pred.loc[:, train_targets.columns] = 0.0
reg_val_pred = pd.DataFrame(0.0, index=train_targets.index, columns=train_targets.columns)

Appending to inference.py


In [9]:
%%writefile -a inference.py

# Second stage: for each target, a one-feature logistic regression maps the
# first-stage score of that target to a probability, using the same repeated
# stratified K-fold scheme as the first stage.
FOLDS = 5
SEEDS = 3

progress_bar = tqdm(range(train_targets.shape[1]))
for target_column_id in progress_bar:
    target_name = train_targets.columns[target_column_id]
    target_values = train_targets[target_name].values

    # Targets with fewer positives than folds are skipped; their predictions
    # stay at zero.
    if target_values.sum() >= FOLDS:
        # Single-feature design matrices, built once per target instead of
        # once per fold.
        stage1_train = X_val_p[:, target_column_id].reshape(-1, 1)
        stage1_test = X_test_p[:, target_column_id].reshape(-1, 1)

        # Accumulate in plain arrays so the positional fold indices returned
        # by StratifiedKFold are used positionally (not as index labels).
        oof_proba = np.zeros(stage1_train.shape[0])
        test_proba = np.zeros(stage1_test.shape[0])

        for seed in range(SEEDS):
            skf = StratifiedKFold(n_splits = FOLDS, random_state = seed, shuffle = True)
            for train_idx, val_idx in skf.split(target_values, target_values):

                model = LogisticRegression(C = 35, max_iter = 1000)
                model.fit(stage1_train[train_idx], target_values[train_idx])

                # Each row is validated once per seed, hence / SEEDS; the
                # test set is scored by every fitted model.
                oof_proba[val_idx] += model.predict_proba(stage1_train[val_idx])[:, 1] / SEEDS
                test_proba += model.predict_proba(stage1_test)[:, 1] / (FOLDS*SEEDS)

        reg_val_pred[target_name] = oof_proba
        reg_test_pred[target_name] = test_proba

    # labels=[0, 1] matches log_loss_metric and keeps the call from failing
    # when a target column contains a single class.
    score = log_loss(train_targets[target_name], reg_val_pred[target_name], labels = [0, 1])
    progress_bar.set_description(f"Processing {target_column_id+1} score: {score:.4f}")

Appending to inference.py


In [10]:
%%writefile -a inference.py

# Report the stacked OOF metric, force the predictions of every row whose
# encoded cp_type equals 1 (the code assigned to 'ctl_vehicle') to zero in
# both the validation and test frames, then report the metric again.
print(f'LR OOF before postprocessing: {log_loss_metric(train_targets, reg_val_pred)}')
train_is_control = train_features['cp_type'] == 1
test_is_control = test_features['cp_type'] == 1
reg_val_pred.loc[train_is_control, train_targets.columns] = 0
reg_test_pred.loc[test_is_control, train_targets.columns] = 0
print(f'LR OOF after postprocessing: {log_loss_metric(train_targets, reg_val_pred)}')

Appending to inference.py


# Create submission file

In [11]:
%%writefile -a inference.py

# Persist the second-stage test predictions as the submission CSV (without
# the row index) and the second-stage out-of-fold predictions as a raw array.
submission_path, oof_path = 'submission.csv', 'svm-oof.npy'
reg_test_pred.to_csv(submission_path, index=False)
np.save(oof_path, reg_val_pred.to_numpy())

Appending to inference.py


In [12]:
! python inference.py

Processing 206 score: 0.0435: 100%|███████████| 206/206 [41:10<00:00, 11.99s/it]
SVM OOF before postprocessing: 0.09469813448888163
SVM OOF after postprocessing: 0.0946980483983166
Processing 206 score: 0.0097: 100%|███████████| 206/206 [01:47<00:00,  1.91it/s]
LR OOF before postprocessing: 0.017923020668368515
LR OOF after postprocessing: 0.017054487297791708
[0m