# Experimental Scaler testing model

---

## General housekeeping notes!!!

- There might be duplicate ss_lr variables from when the ss_svc variables were changed.
- Put the Google drive mount lines at the top so I don't have to wait for RAPIDS to install first...

# Setting up the Environment

---

## Environment Sanity Check
NOTE: This isn't necessary if just using sklearn and not cupy or cudf

Go to `Edit` > `Notebook Settings` > `Hardware Accelerator` and select `GPU` if not already selected

Check output of `!nvidia-smi` to make sure you've been allocated a Tesla T4, P4, or P100.

In [3]:
# !nvidia-smi

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Uncomment all this is deciding to implement cuml,cupy,cudf etc.

# # Install RAPIDS
# # Look at RAPIDS notebooks to see examples: https://github.com/rapidsai/notebooks

# !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
# !bash rapidsai-csp-utils/colab/rapids-colab.sh stable

# import sys, os

# dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
# sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
# sys.path
# exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [6]:
#This cell is what should be used to import the rapids library on kaggle instead of colab

# import warnings, sys
# warnings.filterwarnings("ignore")

# !cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
# !cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# !cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [7]:
import io, requests

import datetime
import pandas as pd
#import cudf
#import cuml
import numpy as np
#import cupy as cp
from time import time
from tqdm.notebook import tqdm

from sklearn.preprocessing import power_transform
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Loading / Processing the Data

---

## Loading the data and some initial preprocessing

In [8]:
def dummy_vars(df):
    df.loc[:, 'cp_type'] = df.loc[:, 'cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    # trt_cp is a control and has no MOA, ctl_vehicle indicates a real test
    df.loc[:, 'cp_dose'] = df.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    # create columns ['cp_type_24', 'cp_type_48', 'cp_type_72'] which are one hot encoded based on cp_time
    df['cp_type_24'] = df['cp_time'].astype(str) == '24'
    df['cp_type_48'] = df['cp_time'].astype(str) == '48'
    df['cp_type_72'] = df['cp_time'].astype(str) == '72'
    df.drop(columns=['sig_id','cp_time'], inplace = True)
    return df

In [9]:
#########
# TO DO #
#################################################################
# after the yeo-johnson transformation on the numerical data    #
# one hot encode the categorical features                       #
# recombine the data so and run standarscaler                   #
#################################################################
# It looks like the yeo johnson transform is done column-wise   #
# Put the data in two long lists for the g- and c- features     #
# Perform yeo johnson on each                                   #
# put the data back into the df                                 #
#################################################################

# Train Features
train_features = pd.read_csv('drive/My Drive/Metis/train_features.csv')
train_features = dummy_vars(train_features)
print('train_features loaded')

# Train Targets
train_targets = pd.read_csv('drive/My Drive/Metis/train_targets_scored.csv')
train_targets.drop(columns=['sig_id'], inplace = True)
print('train_targets loaded')

# Test Features
test_features = pd.read_csv('drive/My Drive/Metis/test_features.csv')
test_features = dummy_vars(test_features)
#test_features = pd.DataFrame(power_transform(test_features, method='yeo-johnson'))
print('test_features loaded')

# Sample Submission DF (Targets)
ss_lr = pd.read_csv('drive/My Drive/Metis/sample_submission.csv')
ss_lr.drop(columns=['sig_id'], inplace = True)
print('ss_lr loaded')

print('done!')

train_features loaded
train_targets loaded
test_features loaded
ss_lr loaded
done!


## Defining the Logistic Regression Model

In [10]:
def log_loss_metric(y_true, y_pred):
    metrics = []
    for _target in train_targets.columns:
        metrics.append(log_loss(y_true.loc[:, _target], y_pred.loc[:, _target].astype(float), labels = [0,1]))
    return np.mean(metrics)

Logistic Regression

In [11]:
def k_fold_log_reg(N_SPLITS, model_train_num, column, X_new, x_tt_new, target, res_lr):
  for rand_state in range(model_train_num):
  # train the model for each one of the random states 
    skf = StratifiedKFold(n_splits = N_SPLITS, random_state = rand_state, shuffle = True)

    for (train, val) in skf.split(target, target):
    # for each of the train/val splits...

      x_train = X_new[train, column].reshape(-1, 1).astype(float)
      y_train = target[train].astype(float)

      x_val = X_new[val, column].reshape(-1, 1).astype(float)
      y_val = target[val].astype(float)

      model = LogisticRegression(C = 35, max_iter = 1000)
      # C is the inverse of the regularization parameter.
      # Play with different C values to determine how strongly the model should be fit between test/val iterations
      # look at https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
      # tune the hyper parameters. Maybe class_weight = balanced
      # also maybe change the solver
      model.fit(x_train, y_train)

      ss_lr.loc[:, train_targets.columns[column]] += (
              model.predict_proba(
                x_tt_new[:, column].reshape(-1, 1).astype(float))[:, 1]
              / (N_SPLITS * model_train_num))
                  
      res_lr.loc[val, train_targets.columns[column]] += (
              model.predict_proba(x_val)[:, 1].astype(float) / model_train_num)




def logreg(train_targets, X, x_test, ss_lr):
# train_targets is the training targets
# X is the train features
# x_test is the holdout set of features
# ss_lr ia blank since it's the empty sample submission predictions

  # cols = [c for c in ss_lr.columns.values]
  cols = ss_lr.columns.values

  res_lr = train_targets.copy()
  X_new = res_lr[cols].values
  x_tt_new = ss_lr[cols].values

  ss_lr.loc[:, train_targets.columns] = 0
  res_lr.loc[:, train_targets.columns] = 0

  lrg_target_sum = 0
  sml_target_sum = 0
  one_target_sum = 0

  model_train_num = 5 # number of random states the model is trained on for each target

  for column in tqdm(range(train_targets.shape[1])):
  # create a model for each of the targets
      start_time = time()
      target = train_targets.values[:, column]

      N_SPLITS = 5 # number of k-fold splits

      if target.sum() >= N_SPLITS:
        # split if there's enough positive target values 
        lrg_target_sum += 1
        k_fold_log_reg(N_SPLITS, model_train_num, column, X_new, x_tt_new, target, res_lr)
            
      elif target.sum() == 1:
        # only 1 target, just run the model with only one positive val
        # might want to eliminate the cp_type == 1 values from train and test since this isn't the most useful training set anyways
        one_target_sum += 1
        x_train = X_new[:, column].reshape(-1, 1).astype(float)
        y_train = target.astype(float)
        model = LogisticRegression(C = 35, max_iter = 1000)
        model.fit(x_train, y_train)

        ss_lr.loc[:, train_targets.columns[column]] = (
              model.predict_proba(
                x_tt_new[:, column].reshape(-1, 1).astype(float))[:, 1])
        
        res_lr.loc[:, train_targets.columns[column]] = (
              model.predict_proba(x_train)[:, 1].astype(float))

      else:
        # if target.sum() is less than N_SPLITS and also not 1
        sml_target_sum += 1
        k_fold_log_reg(target.sum(), model_train_num, column, X_new, x_tt_new, target, res_lr)

      score = log_loss(
        train_targets.loc[:, train_targets.columns[column]],
        res_lr.loc[:, train_targets.columns[column]]
        )
      feat_time = str(datetime.timedelta(seconds = time() - start_time))[2:7]
      print(f'[{feat_time}] LR Target {column}:', score)

  print(f'Log Loss: {log_loss_metric(train_targets, res_lr)}')

  # these lines automatically set the target values equal to zero if it cp_type = 1 AKA control group  
  res_lr.loc[train_features['cp_type'] == 1, train_targets.columns] = 0
  ss_lr.loc[train_features['cp_type'] == 1, train_targets.columns] = 0
  print(f'After manually eliminating control groups...: {log_loss_metric(train_targets, res_lr)}')
  print(f'lrg_target_sum = {lrg_target_sum}\nsml_target_sum = {sml_target_sum}\none_target_sum = {one_target_sum}')


## Scaling the data

In [None]:
scaler = StandardScaler()

#X = scaler.fit_transform(train_features.values[:, top_feats])
X = scaler.fit_transform(train_features.values[:, :])

#x_tt = scaler.transform(test_features.values[:, top_feats])
x_test = scaler.fit_transform(test_features.values[:,:])

#This is for the sample_submission.csv df
ss_lr.loc[:, train_targets.columns] = 0

logreg(train_targets, X, x_test, ss_lr)

HBox(children=(FloatProgress(value=0.0, max=206.0), HTML(value='')))

[00:01] LR Target 0: 4.315216166188839e-05
[00:01] LR Target 1: 4.327300235856571e-05
[00:01] LR Target 2: 4.390500840169474e-05
[00:01] LR Target 3: 4.918561533471453e-05
[00:01] LR Target 4: 5.040192469529254e-05
[00:01] LR Target 5: 4.669035611176199e-05
[00:01] LR Target 6: 4.592004887870977e-05
[00:01] LR Target 7: 4.73912919309522e-05
[00:01] LR Target 8: 4.2458308992846134e-05
[00:01] LR Target 9: 5.0109929311874894e-05
[00:01] LR Target 10: 5.086491362538759e-05
[00:01] LR Target 11: 4.643101843490358e-05
[00:01] LR Target 12: 4.177776375731857e-05
[00:01] LR Target 13: 4.5289663159021864e-05
[00:01] LR Target 14: 4.245830899283303e-05
[00:01] LR Target 15: 4.245830899282031e-05
[00:01] LR Target 16: 4.562535173147644e-05
[00:01] LR Target 17: 4.7206210581132686e-05
[00:01] LR Target 18: 4.6925711798647424e-05
[00:01] LR Target 19: 4.4897934700839555e-05
[00:01] LR Target 20: 4.4975790835622455e-05
[00:01] LR Target 21: 4.669035611176026e-05
[00:01] LR Target 22: 4.145246705887