In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk

pd.set_option('display.max_rows', None) # Show max rows/columns
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [None]:
!pip install iterative-stratification

In [None]:
!pip install pytest

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Table of Contents
<a id="top"></a>

1.	[Load Data](#load_data)
2.	[EDA](#eda)
    + 2.1 [Target](#target)
    + 2.2 [Features](#features)
        + 2.2.1 [Categorical Features](#cat_feat)
        + 2.2.2 [Numeric Features](#num_feat)
3.	[Code Test](#test)
4.  [OVR](#ovr)
    + 4.1 [Test a model](#test1)

# 1. Load Data
<a id="load_data"></a>
<a href="#top">Back to top</a>

In [None]:
X = pd.read_csv('../input/train_features.csv')
print(X.shape)
X.head()

In [None]:
y = pd.read_csv('../input/train_targets_scored.csv')
print(y.shape)
y.head()

In [None]:
X_test = pd.read_csv('../input/test_features.csv')
print(X_test.shape)
X_test.head()

In [None]:
y_test = pd.read_csv('../input/sample_submission.csv')
print(y_test.shape)
y_test.head()

# 2. EDA
<a id="eda"></a>
<a href="#top">Back to top</a>

23,814 examples, 875 features (excluding the unique identifier), 206 class labels

In [None]:
X.shape, y.shape

In [None]:
df = pd.concat([X, y], axis=1).head()
df

In [None]:
#...
train_features = pd.read_csv('../input/train_features.csv')
train_labels_ohe = pd.read_csv('../input/train_targets_scored.csv')

In [None]:
# Drop the unique identifier by name from the frame being modified; the original
# looked the name up on the unrelated 5-row preview `df`. errors='ignore' keeps the
# cell idempotent, so re-running it never removes a real feature column.
train_features = train_features.drop(columns='sig_id', errors='ignore')

In [None]:
# Reverse the OHE labels
y = train_labels_ohe.iloc[:,1:].idxmax(axis=1)
y = pd.DataFrame(y)
y.columns = ['target']

In [None]:
df = pd.concat([train_features, y], axis=1) # Recombine into single df

## 2.1 Target
<a id="target"></a>
<a href="#top">Back to top</a>

In [None]:
# Reference: https://stackoverflow.com/questions/38334296/reversing-one-hot-encoding-in-pandas
# Reverse the OHE labels. Read from the raw one-hot table `train_labels_ohe`: by this
# point `y` has been overwritten with a single 'target' column, so `y.iloc[:, 1:]`
# would be an empty frame.
# NOTE: idxmax returns the FIRST column holding the row maximum, so a row with no
# active label is assigned the first target column, and a multi-label row keeps only
# its first active label.
y_target_col = (
    train_labels_ohe.iloc[:, 1:]  # skip the identifier column
    .idxmax(axis=1)
    .to_frame(name='label')
)

In [None]:
y_target_col.head()

In [None]:
y_value_counts = y_target_col.value_counts()
y_value_counts.head()

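The frequency plot of the 206 class labels shows that it's highly imbalanced, with a large skew towards "5-alpha_reductase_inhibitor." Note that part of this skew may be an artifact of the label reversal: `idxmax` assigns the first target column to every row that has no active label, so if "5-alpha_reductase_inhibitor" is the first column, its count also includes all unlabeled rows.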

In [None]:
# Reference: https://stackoverflow.com/questions/46623583/seaborn-countplot-order-categories-by-count
plt.figure(figsize=(100,20))
ax = sns.countplot(x="label",
                   data=y_target_col,
                   order = y_target_col['label'].value_counts().index)
_ = ax.set(xlabel="Mechanism of Action (MoA)", ylabel = "Frequency")
_ = ax.set_title('Frequency Histogram of MoA')
# Reference: https://www.drawingfromdata.com/how-to-rotate-axis-labels-in-seaborn-and-matplotlib
_ = plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large'  
)

The labels have widely varying frequencies, so the dataset is clearly imbalanced. The next step is to plot the frequencies separately for two groups: labels with fewer than 100 examples, and labels with between 100 and 1,000 examples.

In [None]:
# Names of the labels that occur fewer than 100 times ...
rare_counts = y_value_counts[y_value_counts < 100]
rare_labels = rare_counts.index.get_level_values(0).tolist()
# ... and the rows of y_target_col that carry one of those labels
y_value_counts_under100 = y_target_col[y_target_col['label'].isin(rare_labels)]

In [None]:
# Reference: https://stackoverflow.com/questions/46623583/seaborn-countplot-order-categories-by-count
plt.figure(figsize=(150,50))
ax = sns.countplot(x="label",
                   data=y_value_counts_under100,
                   order = y_value_counts_under100['label'].value_counts().index)
_ = ax.set(xlabel="Mechanism of Action (MoA)", ylabel = "Frequency")
_ = ax.set_title('Frequency Histogram of MoA (Count < 100)')
# Reference: https://www.drawingfromdata.com/how-to-rotate-axis-labels-in-seaborn-and-matplotlib
_ = plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large'  
)

In [None]:
# Names of the labels that occur at least 100 but fewer than 1,000 times ...
in_band = (y_value_counts >= 100) & (y_value_counts < 1000)
mid_labels = y_value_counts[in_band].index.get_level_values(0).tolist()
# ... and the rows of y_target_col that carry one of those labels
y_value_counts_100to1000 = y_target_col[y_target_col['label'].isin(mid_labels)]

In [None]:
# Reference: https://stackoverflow.com/questions/46623583/seaborn-countplot-order-categories-by-count
plt.figure(figsize=(10,8))
ax = sns.countplot(x="label",
                   data=y_value_counts_100to1000,
                   order = y_value_counts_100to1000['label'].value_counts().index)
_ = ax.set(xlabel="Mechanism of Action (MoA)", ylabel = "Frequency")
_ = ax.set_title('Frequency Histogram of MoA (100 <= Count < 1,000)')
# Reference: https://www.drawingfromdata.com/how-to-rotate-axis-labels-in-seaborn-and-matplotlib
_ = plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large'  
)

## 2.2 Features
<a id="features"></a>
<a href="#top">Back to top</a>

In [None]:
X.head()

In [None]:
X.dtypes.head(8) # most are all numeric, only the sig_id, cp_type, and cp_dose are categorical
# cp_time can be considered categorical also

### 2.2.1 Categorical Features
<a id="cat_feat"></a>
<a href="#top">Back to top</a>

In [None]:
cp_type = X.iloc[:,1]
cp_type.value_counts()

In [None]:
_ = sns.countplot(x='cp_type', data=X)

In [None]:
cp_time = X.iloc[:,2]
cp_time.value_counts()

In [None]:
_ = sns.countplot(x='cp_time', data=X)

In [None]:
cp_dose = X.iloc[:,3]
cp_dose.value_counts()

In [None]:
_ = sns.countplot(x='cp_dose', data=X)

### 2.2.2 Numerical Features
<a id="num_feat"></a>
<a href="#top">Back to top</a>

In [None]:
X_describe = X.iloc[:,1:].describe() # statistics on the numeric attributes
X_describe

all numeric features have some variance, i.e., none are constant vectors with single values

In [None]:
min(X_describe.loc['std',:])

### Split g- and c- features

# 3. Code Test
<a id="test"></a>
<a href="#top">Back to top</a>

## Problem:
The issue is trying to figure out a working pipeline to go from the given dataset and output the predictions to kaggle. Normally, this sort of step is not required, but since we need to submit it to kaggle that means that there needs to be some small steps taken to ensure that the correct files are going back and forth.

We are given the kaggle data, which basically has the data split by feature/target and train/test. So, for the training data, there are:

23,814 examples, 875 features (excluding the unique identifier), 206 class labels (excluding the unique identifier)

For the testing data, there are:

3,982 examples, 875 features (excluding the unique identifier), 206 class labels (excluding the unique identifier)

In the typical process, the target vector is a single $n\times 1$ vector. It is possible to convert the 206 columns into a single column with the class labels as values. However, they're already in a OHE state.

Possibilities:
1. Convert to single target vector. Use the pipeline to re-OHE the class labels. Likewise, OHE the categorical variables. This process is important for using sk-learn, since the library seems to prefer having the single target vector rather than a provided OHE set of columns.
    - A difficulty with this however is that the data needs to also be split into K folds for k-fold CV.
        - Another issue is that each of the splits will not have all the classes.
        - However, it also must be such that 5-fold CV is used to tune the model, then a final test model is used based on those parameters to output a set of labels.
        
Cross-Validation stage:
- Perform 5-fold CV for a simple logistic regression model in sklearn.
- Find the tuned weights

## try a simple approach first:

In [None]:
train_features = pd.read_csv('../input/train_features.csv')
train_labels_ohe = pd.read_csv('../input/train_targets_scored.csv')

In [None]:
train_features.head()

In [None]:
train_labels_ohe.head()

In [None]:
train_features.drop(train_features.columns[0], axis=1,inplace=True)
train_labels_ohe.drop(train_labels_ohe.columns[0], axis=1,inplace=True)

In [None]:
# from sklearn import model_selection
# from sklearn import preprocessing
# from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [None]:
# Save the column names
train_features_col_names = train_features.columns.tolist()

cat_cols = ['cp_type', 'cp_time', 'cp_dose'] # Identify categorical columns

ohe = OneHotEncoder() # Load OHE

# Get the column names after OHE
# Reference: https://stackoverflow.com/questions/54570947/feature-names-from-onehotencoder
_ = ohe.fit_transform(train_features[cat_cols])
# Newer scikit-learn releases replaced get_feature_names with get_feature_names_out;
# use the new method when it exists and fall back to the old one otherwise.
_get_ohe_names = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
ohe_names = _get_ohe_names(cat_cols)
ohe_names = ohe_names.tolist()

In [None]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), list(range(0,3)))], remainder='passthrough')
train_features = ct.fit_transform(train_features)

In [None]:
# The transformer emits the one-hot columns first, followed by the passthrough
# columns in their original order, so the header must list both groups. `ohe_names`
# alone only covers the one-hot columns and would raise a length mismatch.
passthrough_names = [col for col in train_features_col_names if col not in cat_cols]
train_features = pd.DataFrame(train_features, columns=ohe_names + passthrough_names)

In [None]:
train_features.head()

In [None]:
# Reverse the OHE labels
y = train_labels_ohe.iloc[:,:].idxmax(axis=1)
y = pd.DataFrame(y)
y.columns = ['target']

In [None]:
df = pd.concat([train_features, y], axis=1) # Recombine into single df

df['kfold'] = -1 # Create k-folds column

# Randomize the dataset; the fixed seed makes the shuffle (and therefore the fold
# assignment below) reproducible across kernel restarts
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

y = df.target.values # Subset the target column

# Initialize the stratified k-fold module from sklearn
kf = StratifiedKFold(n_splits=5)

In [None]:
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f

In [None]:
df.head()

In [None]:
df.iloc[:,7:].head()

## end simple approach

## test log reg

### Check for if the kfold column needs to be dropped.

In [None]:
# logres.py
import os
import sys

import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# The notebook only appends '../src' to sys.path in a later cell, so on a fresh
# kernel `import config` could fail here. Presumably config.py lives in ../src
# (TODO confirm); make that directory importable before the import.
if '../src' not in sys.path:
    sys.path.append('../src')
import config

fold=0
# def run(fold):
# Read the data
df = pd.read_csv(config.TRAINING_FILE)

# Separate into train and validation
df_train = df[df.kfold != fold].reset_index(drop=True)
df_valid = df[df.kfold == fold].reset_index(drop=True)

# Drop the target and the fold id, then convert to numpy
x_train = df_train.drop(['target', 'kfold'], axis=1).values
y_train = df_train.target.values

# Repeat for validation data
x_valid = df_valid.drop(['target', 'kfold'], axis=1).values
y_valid = df_valid.target.values

In [None]:
# logres.py
import os
import sys

import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# The notebook only appends '../src' to sys.path in a later cell, so on a fresh
# kernel `import config` could fail here. Presumably config.py lives in ../src
# (TODO confirm); make that directory importable before the import.
if '../src' not in sys.path:
    sys.path.append('../src')
import config

fold=0
# def run(fold):
# Read the data
df = pd.read_csv(config.TRAINING_FILE)

# Separate into train and validation
df_train = df[df.kfold != fold].reset_index(drop=True)
df_valid = df[df.kfold == fold].reset_index(drop=True)

# Drop the target AND the fold id before converting to numpy: 'kfold' is bookkeeping,
# not a feature, and leaving it in feeds the fold number to the model
x_train = df_train.drop(['target', 'kfold'], axis=1).values
y_train = df_train.target.values

# Repeat for validation data
x_valid = df_valid.drop(['target', 'kfold'], axis=1).values
y_valid = df_valid.target.values

# Apply feature scaling to the numeric attributes (columns from index 7 onward);
# the scaler is fitted on the training fold only and re-used for validation
sc = StandardScaler()
x_train[:,7:] = sc.fit_transform(x_train[:,7:])
x_valid[:,7:] = sc.transform(x_valid[:,7:])

# Initialize the classifier. max_iter must be an integer: newer scikit-learn
# releases reject the float 1e10 during parameter validation.
model = linear_model.LogisticRegression(random_state=0, max_iter=int(1e10))

# Fit the model
model.fit(x_train, y_train)

In [None]:
# Create predictions
valid_preds = model.predict_proba(x_valid)

### Issue: the predictions have 204 columns rather than 206. The train set includes 204 classes, while the validation set includes only 203.

It would make sense that the y_preds are based on the classifier that has seen a number of classes equal to the number of classes in the training set. However, in trying to find the log-loss, the number of class predictions per observation is in this case larger than the number of classes in the validation set. Therefore, the log_loss function encounters this error.

A goal now is to find the associated labels with y_pred from y_train.

In [None]:
len(np.unique(y_valid)), len(np.unique(y_train))

In [None]:
valid_preds.shape

In [None]:
valid_preds[:,0]

In [None]:
y_valid

In [None]:
# Calculate and print the accuracy
log_loss_score = metrics.log_loss(y_valid, valid_preds, labels=model.classes_)
print(f"Fold={fold}, Log-Loss={log_loss_score}")

test the labels=classes_ with iris dataset

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()

In [None]:
X = iris.data
y = iris.target

In [None]:
l1 = list(range(0,40))
l2 = list(range(50, 90))
l3 = list(range(100,140))

In [None]:
l1.extend(l2)
l1.extend(l3)

In [None]:
X_train = X[l1,:]
y_train = y[l1]

In [None]:
l4 = list(range(40,50))
l5 = list(range(90,100))
l4.extend(l5)
X_valid = X[l4,:]
y_valid = y[l4]

In [None]:
model = linear_model.LogisticRegression()

# Fit the model
model.fit(X_train, y_train)

In [None]:
valid_preds = model.predict_proba(X_valid)

In [None]:
valid_preds.shape

In [None]:
# Without `labels`, log_loss infers the class set from y_valid, which here holds fewer
# classes than valid_preds has columns, so the call raises. That failure is the point
# of this cell; catch and display it so "Run All" can continue to the next cell.
try:
    log_loss_score = metrics.log_loss(y_valid, valid_preds)
    print(f"Fold={fold}, Log-Loss={log_loss_score}")
except ValueError as err:
    print(f"log_loss without labels failed: {err}")

In [None]:
log_loss_score = metrics.log_loss(y_valid, valid_preds, labels=model.classes_)
print(f"Fold={fold}, Log-Loss={log_loss_score}")

## end log reg test

train.py

In [None]:
import argparse
import os
import sys

sys.path.append('../src')
import config

import joblib
import pandas as pd
from sklearn import metrics
from sklearn import tree

# TRAINING_FILE = "../input/train_folds.csv"

# MODEL_OUTPUT = "../models/"

In [None]:
fold=0

In [None]:
df = pd.read_csv(config.TRAINING_FILE)

# Separate into train and validation
df_train = df[df.kfold != fold].reset_index(drop=True)
df_valid = df[df.kfold == fold].reset_index(drop=True)

# Drop the target and and convert to numpy
x_train = df_train.drop('target', axis=1).values
y_train = df_train.target.values

# Repeat for validation data
x_valid = df_valid.drop('target', axis=1).values
y_valid = df_valid.target.values

In [None]:
df_train.head()

In [None]:
# Intiialize the classifier
clf = tree.DecisionTreeClassifier()

# Fit the model
clf.fit(x_train, y_train)

# Create predictions
preds = clf.predict(x_valid)

In [None]:
accuracy = metrics.accuracy_score(y_valid, preds)

In [None]:
accuracy

In [None]:
def run(fold):
    """Train a decision tree on all folds except `fold`, score it, and save it.

    Reads the fold-annotated training table from ``config.TRAINING_FILE``, holds out
    the rows whose ``kfold`` equals `fold` for validation, prints the validation
    accuracy, and dumps the fitted model to ``config.MODEL_OUTPUT`` as
    ``dt_<fold>.bin``.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to use as the validation fold.
    """
    # Read the data
    df = pd.read_csv(config.TRAINING_FILE)

    # Separate into train and validation
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # 'kfold' is bookkeeping, not a feature: drop it together with the target so the
    # model is not trained on the fold number
    non_feature_cols = ['target', 'kfold']

    x_train = df_train.drop(non_feature_cols, axis=1).values
    y_train = df_train.target.values

    # Repeat for validation data
    x_valid = df_valid.drop(non_feature_cols, axis=1).values
    y_valid = df_valid.target.values

    # Initialize the classifier
    clf = tree.DecisionTreeClassifier()

    # Fit the model
    clf.fit(x_train, y_train)

    # Create predictions
    preds = clf.predict(x_valid)

    # Calculate and print the accuracy
    accuracy = metrics.accuracy_score(y_valid, preds)
    print(f"Fold={fold}, Accuracy={accuracy}")

    # Save the model; create the output directory first so the dump cannot fail
    # just because the directory does not exist yet
    os.makedirs(config.MODEL_OUTPUT, exist_ok=True)
    joblib.dump(
        clf,
        os.path.join(config.MODEL_OUTPUT, f"dt_{fold}.bin")
    )

# if __name__ == "__main__":
#     # Initialize the argparse
#     parser = argparse.ArgumentParser()

#     # Add arguments to parser
#     parser.add_argument(
#         "--fold",
#         type=int
#     )
#     args = parser.parse_args() # Read arguments from command line

#     run(fold=args.fold) # Run the folds specified in the command line

In [None]:
from sklearn import preprocessing
from sklearn.compose import make_column_transformer

In [None]:
train_features = pd.read_csv('../input/train_features.csv')
train_labels_ohe = pd.read_csv('../input/train_targets_scored.csv')

# Drop the unique key column
train_features.drop(train_features.columns[0], axis=1,inplace=True)
train_features_col_names = train_features.columns.tolist()

In [None]:
cat_cols = ['cp_type', 'cp_time', 'cp_dose']

ohe = preprocessing.OneHotEncoder()
_ = ohe.fit_transform(train_features[cat_cols])
ohe_names = ohe.get_feature_names(cat_cols)
ohe_names = ohe_names.tolist()
columns_trans = make_column_transformer(
        (preprocessing.OneHotEncoder(),
        cat_cols),
        remainder='passthrough')
train_features = columns_trans.fit_transform(train_features)

train_features_col_names = [col for col in train_features_col_names if col not in cat_cols]

ohe_names.extend(train_features_col_names)

train_features = pd.DataFrame(train_features)
train_features.columns = ohe_names

In [None]:
train_features.head()

logres.py

In [None]:
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
import config

In [None]:
def run(fold):
    """Fit a logistic regression on all folds except `fold` and print its log-loss.

    Reads the fold-annotated training table from ``config.TRAINING_FILE`` and holds
    out the rows whose ``kfold`` equals `fold` for validation.

    The target is multiclass, so the binary recipe used before (probability of
    column 1 scored with ROC AUC) does not apply. The full probability matrix is
    scored with log-loss instead, passing ``labels=model.classes_`` so that classes
    absent from the validation fold do not cause a shape mismatch.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to use as the validation fold.
    """
    # Read the data
    df = pd.read_csv(config.TRAINING_FILE)

    # Separate into train and validation
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Drop the target and the fold id (bookkeeping, not a feature)
    x_train = df_train.drop(['target', 'kfold'], axis=1).values
    y_train = df_train.target.values

    # Repeat for validation data
    x_valid = df_valid.drop(['target', 'kfold'], axis=1).values
    y_valid = df_valid.target.values

    # Initialize the classifier
    model = linear_model.LogisticRegression()

    # Fit the model
    model.fit(x_train, y_train)

    # One probability column per class seen during training
    valid_preds = model.predict_proba(x_valid)

    # Calculate and print the log-loss
    log_loss_score = metrics.log_loss(y_valid, valid_preds, labels=model.classes_)
    print(f"Fold={fold}, Log-Loss={log_loss_score}")

# if __name__ == "__main__":
for fold_ in range(5):
    run(fold_)

In [None]:
fold=0
# def run(fold):
# Read the data
df = pd.read_csv(config.TRAINING_FILE)

# Separate into train and validation
df_train = df[df.kfold != fold].reset_index(drop=True)
df_valid = df[df.kfold == fold].reset_index(drop=True)

# Drop the target and the fold id (bookkeeping, not a feature), then convert to numpy
x_train = df_train.drop(['target', 'kfold'], axis=1).values
y_train = df_train.target.values

# Repeat for validation data
x_valid = df_valid.drop(['target', 'kfold'], axis=1).values
y_valid = df_valid.target.values

In [None]:
# Initialize the classifier. max_iter must be an integer: newer scikit-learn
# releases reject the float 1e10 during parameter validation.
model = linear_model.LogisticRegression(random_state=0, max_iter=int(1e10))

# Fit the model
model.fit(x_train, y_train)

# Create predictions
# valid_preds = model.predict_proba(x_valid)[:, 1]

In [None]:
y_train.shape, y_valid.shape

In [None]:
len(np.unique(y_train))

In [None]:
valid_preds = model.predict_proba(x_valid)
valid_preds.shape

In [None]:
model.score(x_valid, y_valid)

In [None]:
valid_preds.shape

In [None]:
df_train.target.values

In [None]:
comp_metric = metrics.log_loss(y_valid, valid_preds)

In [None]:

# Calculate and print the accuracy
auc = metrics.roc_auc_score(y_valid, valid_preds)
print(f"Fold={fold}, AUC={auc}")

# if __name__ == "__main__":
# for fold_ in range(5):
# run(fold_)

# test sample submission

model build

In [None]:
# Load the raw training features and the one-hot encoded scored targets
X = pd.read_csv('../input/train_features.csv')
y = pd.read_csv('../input/train_targets_scored.csv')

# The first column of each file is the unique key; remove it
X.drop(columns=X.columns[0], inplace=True)
y.drop(columns=y.columns[0], inplace=True)
X_col_names = X.columns.tolist()

cat_cols = ['cp_type', 'cp_time', 'cp_dose']

# A standalone encoder is fitted only to learn the names of the one-hot columns
ohe = preprocessing.OneHotEncoder()
_ = ohe.fit_transform(X[cat_cols])
ohe_names = ohe.get_feature_names(cat_cols).tolist()

# Encode the categorical columns and pass every other column through unchanged
columns_trans = make_column_transformer(
    (preprocessing.OneHotEncoder(), cat_cols),
    remainder='passthrough',
)
X = columns_trans.fit_transform(X)

# Header of the encoded matrix: one-hot names first, then the untouched features
X_col_names = [name for name in X_col_names if name not in cat_cols]
ohe_names.extend(X_col_names)
X = pd.DataFrame(X, columns=ohe_names)

# Reverse the OHE labels into a single 'target' column
y = y.idxmax(axis=1).to_frame(name='target')

# Recombine into single df
df = pd.concat([X, y], axis=1)

In [None]:
df.head()

In [None]:
X_train = df.drop('target', axis=1).values
y_train = df.target.values

sc = StandardScaler()
X_train[:,7:] = sc.fit_transform(X_train[:,7:])

In [None]:
model = linear_model.LogisticRegression(random_state=0, max_iter=1e10)

# Fit the model
model.fit(X_train, y_train)

In [None]:
# NOTE(review): the cells that drop the id column, one-hot encode and scale X_test
# appear further down in this notebook, so when run top to bottom X_test looks to be
# the raw test frame at this point. Confirm the intended execution order.
test_preds = model.predict_proba(X_test)

In [None]:
test_preds.shape

In [None]:
# NOTE(review): y_valid is assigned in earlier validation/iris cells and is not a set
# of labels for X_test, so this score does not look meaningful. Confirm which labels
# were intended here.
log_loss_score = metrics.log_loss(y_valid, test_preds,
    labels=model.classes_)

X_test

In [None]:
X_test = pd.read_csv('../input/test_features.csv')

In [None]:
X_test.drop(X_test.columns[0], axis=1, inplace=True) # Drop the unique key column

In [None]:
X_test.head()

In [None]:
cat_cols = ['cp_type', 'cp_time', 'cp_dose'] # Identify categorical columns

# Encode the test features with the transformer already fitted on the training
# features (`columns_trans`, created in the model-build cell). Fitting a fresh
# encoder on the test set only reproduces the training column layout when the test
# set happens to contain exactly the same categories.
ct = columns_trans
X_test = ct.transform(X_test)
# X = pd.DataFrame(ct.fit_transform(X))
# X.columns = ohe_names

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scale the test features with the scaler `sc` that was fitted on the training
# features above. Fitting a new scaler on the test set would put train and test on
# different scales.
X_test[:,7:] = sc.transform(X_test[:,7:])

In [None]:
model = linear_model.LogisticRegression(random_state=0, max_iter=1e10)

In [None]:
# Fit on the full, scaled training matrix X_train; the lowercase x_train is a
# single-fold subset from an earlier cell and does not line up with y_train
model.fit(X_train, y_train)

In [None]:
test_preds = model.predict_proba(X_test)

# 4. OVR
<a id="ovr"></a>
<a href="#top">Back to top</a>

The goal here is to produce a simple baseline model that implements the one-vs-rest (OVR) strategy. The problem is that the dataset is multilabel and many of the classes are very sparse. One idea is to choose some arbitrary cutoff point (e.g. 300) that separates the classes to test from those to 'ignore.' The reason is that, with so few examples, it could be difficult or impossible to develop a model that accurately predicts those classes. Therefore, using $c<k$, where $c$ is the number of chosen classes and $k$ is the total number of classes, we can use the OVR approach to build $c$ groups of binary classifiers.

# Start test on ovr_method.py

In [None]:
# ovr_method.py
import os
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
# import config
import copy

In [None]:
def run(fold):
    """Train and score a logistic regression on one fold of the global `df`.

    `df` must hold the feature columns plus a 'target' column and a 'kfold' column.
    Rows whose 'kfold' equals `fold` form the validation set; the rest are used for
    training. Prints log-loss, ROC AUC and accuracy for the validation set.

    Parameters
    ----------
    fold : int
        Value of the 'kfold' column to use as the validation fold.
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Drop the target and the fold id, then convert to numpy
    x_train = df_train.drop(['target', 'kfold'], axis=1).values
    y_train = df_train.target.values

    # Repeat for validation data
    x_valid = df_valid.drop(['target', 'kfold'], axis=1).values
    y_valid = df_valid.target.values

    # Apply feature scaling to the numeric attributes (columns from index 7 onward);
    # the scaler is fitted on the training fold only
    sc = StandardScaler()
    x_train[:,7:] = sc.fit_transform(x_train[:,7:])
    x_valid[:,7:] = sc.transform(x_valid[:,7:])

    # Initialize the classifier. max_iter must be an integer: newer scikit-learn
    # releases reject the float 1e10 during parameter validation.
    model = linear_model.LogisticRegression(random_state=0, max_iter=int(1e10))

    # Fit the model
    model.fit(x_train, y_train)

    # Create predictions
    y_pred_probs = model.predict_proba(x_valid)
    y_preds = model.predict(x_valid)

    # Calculate and print the metrics
    log_loss_score = metrics.log_loss(y_valid, y_pred_probs,
        labels=model.classes_)
    # ROC AUC needs scores, not hard 0/1 predictions (which collapse the curve to a
    # single operating point). For a binary target, use the probability of the
    # second class in model.classes_.
    if len(model.classes_) == 2:
        auc = metrics.roc_auc_score(y_valid, y_pred_probs[:, 1])
    else:
        auc = metrics.roc_auc_score(y_valid, y_pred_probs,
            multi_class='ovr', labels=model.classes_)
    accuracy = metrics.accuracy_score(y_valid, y_preds)

    print(f"Fold={fold}, Log-Loss={log_loss_score}, AUC={auc}, Accuracy={accuracy}")

In [None]:
### Create OVR target vectors
# X = pd.read_csv('../input/train_features.csv')
y = pd.read_csv('../input/train_targets_scored.csv')

# Add hidden class: 1 for rows where no scored label is active, 0 otherwise.
# Assigning the whole column at once avoids chained assignment
# (y['hidden_class'].iloc[...] = 1), which pandas may apply to a temporary copy.
y['hidden_class'] = (y.iloc[:,1:].sum(axis=1) == 0).astype(int)
zero_class_indices = y.index[y['hidden_class'] == 1]

# Keep the 13 most frequent classes (the hidden class competes like any other)
class_counts = y.iloc[:,1:].sum(axis=0)
class_counts = class_counts.sort_values(ascending=False)
class_counts_sub = class_counts.head(13)
retained_classes = class_counts_sub.index.values
y2 = y.iloc[:,1:]

### The following creates 'c' binary target vectors saved in a list: 'binary_vector_list'
# Save indices that contain the class
class_index_list = [[c, y2.index[y2[c] == 1].values] for c in retained_classes]

# One single-column 0/1 frame per retained class, built with a vectorized comparison
# instead of testing every row index for membership
n = len(y)
binary_vector_list = [
    pd.DataFrame({c: (y2[c] == 1).astype(int).tolist()})
    for c in retained_classes
]

In [None]:
### Loop through OVR classes
for i in binary_vector_list:
    y_temp = copy.deepcopy(i)
    class_name = y_temp.columns[0]
    X = pd.read_csv('../input/train_features.csv')
    X.drop(X.columns[0], axis=1, inplace=True)
    
    # Save the column names
    X_col_names = X.columns.tolist()

    cat_cols = ['cp_type', 'cp_time', 'cp_dose'] # Identify categorical columns

    ohe = OneHotEncoder() # Load OHE

    # Get the column names after OHE
    # Reference: https://stackoverflow.com/questions/54570947/feature-names-from-onehotencoder
    _ = ohe.fit_transform(X[cat_cols])
    ohe_names = ohe.get_feature_names(cat_cols)
    ohe_names = ohe_names.tolist()

    # Fix new column names to include OHE names and normal feature names
    X_col_names = [col for col in X_col_names if col\
        not in cat_cols]
    ohe_names.extend(X_col_names)

    # Transform the data with OHE on the indices of the cat variables
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), list(range(0,3)))],
        remainder='passthrough')
    X = pd.DataFrame(ct.fit_transform(X))
    X.columns = ohe_names
    
    y_temp.columns = ['target']
    
    df = pd.concat([X, y_temp], axis=1) # Recombine into single df

    df['kfold'] = -1 # Create k-folds column

    df = df.sample(frac=1).reset_index(drop=True) # Randomize the dataset

    y = df.target.values # Subset the target column

    # Initialize the stratified k-fold module from sklearn
    kf = StratifiedKFold(n_splits=5)

    # Fill the 'kfold' column with the assigned folds
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f
        
    print(class_name)
    for fold_ in range(5):
        run(fold_)

In [None]:
fold = 4

In [None]:
class_name

In [None]:
df_train = df[df.kfold != fold].reset_index(drop=True)
df_valid = df[df.kfold == fold].reset_index(drop=True)

# Drop the target and the fold id, then convert to numpy
x_train = df_train.drop(['target', 'kfold'], axis=1).values
y_train = df_train.target.values

# Repeat for validation data
x_valid = df_valid.drop(['target', 'kfold'], axis=1).values
y_valid = df_valid.target.values

# Apply feature scaling to the numeric attributes (columns from index 7 onward);
# the scaler is fitted on the training fold only
sc = StandardScaler()
x_train[:,7:] = sc.fit_transform(x_train[:,7:])
x_valid[:,7:] = sc.transform(x_valid[:,7:])

# Initialize the classifier. max_iter must be an integer: newer scikit-learn
# releases reject the float 1e10 during parameter validation.
model = linear_model.LogisticRegression(random_state=0, max_iter=int(1e10))

# Fit the model
model.fit(x_train, y_train)

# Create predictions
y_pred_probs = model.predict_proba(x_valid)

In [None]:
# Load the non-scored targets here: this cell needs them, but the notebook only
# reads the file in the following cell
nonscored_targets = pd.read_csv("../input/train_targets_nonscored.csv")
y_pred_probs.shape, nonscored_targets.shape

In [None]:
nonscored_targets = pd.read_csv("../input/train_targets_nonscored.csv")

In [None]:
nonscored_targets.head()

# End test

# Test 2

In [None]:
# ovr_method.py
import os
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
# import config
import copy

In [None]:
def preprocess_X(train_path="../input/train_features.csv",
                 test_path="../input/test_features.csv",
                 cat_cols=('cp_type', 'cp_time', 'cp_dose')):
    """Load, one-hot encode and scale the train and test feature matrices.

    The first column of each file (the unique identifier) is dropped. The columns in
    `cat_cols` are one-hot encoded and all remaining columns are standardized. Both
    the encoder and the scaler are fitted on the training data only and then applied
    to the test data, so the two matrices share the same column layout and scale.

    Parameters
    ----------
    train_path, test_path : str
        CSV files holding the training and test features.
    cat_cols : sequence of str
        Names of the categorical columns to one-hot encode.

    Returns
    -------
    (X, X_test) : tuple of numpy.ndarray
        Encoded feature matrices: one-hot columns first, followed by the scaled
        remaining columns in their original order.
    """
    X = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)
    X = X.drop(columns=X.columns[0])
    X_test = X_test.drop(columns=X_test.columns[0])

    # Select the categorical columns by name rather than by position, and force a
    # dense result (sparse_threshold=0) so the in-place scaling below always works
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), list(cat_cols))],
        remainder='passthrough',
        sparse_threshold=0)
    X = ct.fit_transform(X)
    # transform (not fit_transform) is the right call for the test set: it re-uses
    # the categories learned from the training data
    X_test = ct.transform(X_test)

    # The one-hot block comes first; derive its width from the fitted encoder
    # instead of hard-coding it
    n_ohe = sum(len(cats) for cats in ct.named_transformers_['encoder'].categories_)

    # Apply feature scaling to the numeric attributes
    if X.shape[1] > n_ohe:
        sc = StandardScaler()
        X[:, n_ohe:] = sc.fit_transform(X[:, n_ohe:])
        X_test[:, n_ohe:] = sc.transform(X_test[:, n_ohe:])

    return X, X_test

In [None]:
def generate_OVR_targets(n_classes=13,
                         targets_path="../input/train_targets_scored.csv"):
    """Generate the list of binary OVR target vectors that will be tested.

    A ``hidden_class`` label is added that is 1 for every row without any
    positive label. The ``n_classes`` most frequent labels (``hidden_class``
    included) are then kept.

    Args:
        n_classes (int): Number of most frequent classes to keep.
        targets_path (str): CSV whose first column is the row id and whose
            remaining columns are 0/1 labels.

    Returns:
        list: One single-column ``DataFrame`` per retained class, ordered by
        decreasing class frequency. The column is named after the class and
        holds 1 where the row is positive for that class, 0 otherwise.
    """
    y = pd.read_csv(targets_path)

    # Add hidden class. Assigning the whole column at once avoids the chained
    # ``y['hidden_class'].iloc[...] = 1`` write, which pandas does not
    # guarantee to reach ``y``.
    y['hidden_class'] = (y.iloc[:, 1:].sum(axis=1) == 0).astype('int64')

    class_counts = y.iloc[:, 1:].sum(axis=0)
    class_counts = class_counts.sort_values(ascending=False)
    retained_classes = class_counts.head(n_classes).index.values

    # One vectorised comparison per class instead of testing every row
    # number against an array of positive rows.
    return [
        pd.DataFrame({c: (y[c] == 1).astype('int64').tolist()})
        for c in retained_classes
    ]

In [None]:
# Train one binary classifier per retained class and write its predicted
# probabilities into a frame shaped like the sample submission.
ovr_targets = generate_OVR_targets()

# Edit to use it for sample submission
# (despite its name, `nonscored_targets` is loaded from sample_submission.csv here)
nonscored_targets = pd.read_csv("../input/sample_submission.csv")
nonscored_targets.replace(0.5, 0, inplace=True)  # every value equal to 0.5 becomes 0
nonscored_targets['hidden_class'] = 0  # extra column for the "no positive label" class

### Can OHE and standard scale the X_train first
X_train, X_test = preprocess_X()

for i in ovr_targets:
    y_temp = copy.deepcopy(i)
    class_name = y_temp.columns[0]  # each OVR frame has a single column named after its class

    # Initialize the classifier
### Need to later check the correct model for a given feature
    model = linear_model.LogisticRegression(random_state=0, max_iter=1e10)

    # Fit the model
    model.fit(X_train, y_temp.values.ravel())

    # Create predictions
    y_pred_probs = model.predict_proba(X_test)

    # Update predicted probabilities (column 1 of predict_proba is stored)
    nonscored_targets.loc[:,class_name] = y_pred_probs[:,1]

# Go through each row and find the column with the largest value
# (iloc[:,1:] skips the first column, which is the row id in the submission file)
chosen_classes_per_row = nonscored_targets.iloc[:,1:].idxmax(axis=1)

# chosen_classes_per_row.value_counts()

# hidden_class_indices = chosen_classes_per_row[chosen_classes_per_row == 'hidden_class'].index

# Reference: https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
# Keep only the winning class (and the id) in every row; all other columns are set to 0.
for index, row in nonscored_targets.iterrows():
    max_class = chosen_classes_per_row[index] # Subset the selected class
    row[row.index.isin([max_class, 'sig_id']) == False] = 0
    nonscored_targets.iloc[index,:] = row

# nonscored_targets.to_csv(config.OUTPUT_FILE, index=False)
# the hidden_class seems to have dominated the probabilities
nonscored_targets.drop(['hidden_class'], axis=1, inplace=True) # drop the hidden_class column

In [None]:
nonscored_targets.head()

# End Test 2

In [None]:
y.head()

In [None]:
class_counts = y.iloc[:,1:].sum(axis=0)
class_counts.head()

In [None]:
class_counts = class_counts.sort_values(ascending=False)
class_counts_sub = class_counts.head(12)
class_counts_sub

In [None]:
retained_classes = class_counts_sub.index.values

In [None]:
retained_classes

Create the $c=12$ target vectors. For each of the $c$ classes, there should be a binary vector that is '1' if a subject is positive for that class and '0' otherwise.

In [None]:
y2 = y.iloc[:,1:]
y2.head() # create a binary target vector for each class, it should check if there are any

In [None]:
### The following creates 'c' binary target vectors saved in a list: 'binary_vector_list'
# Inputs come from earlier cells: `retained_classes`, `y2` and `y`.
class_index_list = [] # Save indices that contain the class
for c in retained_classes:
    # Index labels of the rows where column `c` equals 1
    c_indices = y2.loc[:,c][y2.loc[:,c] == 1].index.values
    class_index_list.append([c, c_indices])

binary_vector_list = []
n = len(y)
for i in class_index_list: # Loop through class/index pairs
    zeros = [0] * n
    for j in range(n): # Loop through all rows
        # Check if the index should be one instead
        # (the row number j is looked up among the index labels collected above)
        if j in i[1]:
            zeros[j] = 1
    # One single-column DataFrame per class, named after the class
    binary_vector_list.append(pd.DataFrame({i[0]: zeros}))

In [None]:
len(binary_vector_list)

# ovr_output.py

In [None]:
def preprocess_X():
    """Load the train/test features, one-hot encode the categorical columns
    and standardise the remaining columns.

    Both CSV files are read from ``../input`` and their first column is
    dropped before any transformation. The encoder and the scaler are fitted
    on the training features only and then re-used, unchanged, on the test
    features.

    Returns:
        tuple: ``(X, X_test)`` as NumPy arrays. The one-hot encoded columns
        come first, followed by the standardised pass-through columns (in
        their original order).
    """
    X = pd.read_csv("../input/train_features.csv")
    X_test = pd.read_csv("../input/test_features.csv")
    X = X.drop(X.columns[0], axis=1)
    X_test = X_test.drop(X_test.columns[0], axis=1)

    cat_cols = ['cp_type', 'cp_time', 'cp_dose'] # Identify categorical columns

    # Transform the data with OHE on the categorical columns (selected by
    # name, so the result does not depend on their position in the file)
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), cat_cols)],
        remainder='passthrough')
    X = ct.fit_transform(X)
    # Re-using the transformer fitted on the training data is intentional:
    # the test set must be encoded with the categories learned from train.
    X_test = ct.transform(X_test)

    # Number of one-hot columns produced by the fitted encoder. Taken from
    # the encoder itself instead of a hard-coded offset, and without calling
    # the get_feature_names() helper that newer scikit-learn no longer has.
    n_encoded = sum(
        len(categories)
        for categories in ct.named_transformers_['encoder'].categories_)

    # Apply feature scaling to the numeric attributes
    sc = StandardScaler()
    X[:, n_encoded:] = sc.fit_transform(X[:, n_encoded:])
    X_test[:, n_encoded:] = sc.transform(X_test[:, n_encoded:])

    return X, X_test

In [None]:
def generate_OVR_targets(n_classes=12,
                         targets_path="../input/train_targets_scored.csv"):
    """Generate the list of binary OVR target vectors that will be tested.

    Args:
        n_classes (int): Number of most frequent classes to keep.
        targets_path (str): CSV whose first column is the row id and whose
            remaining columns are 0/1 labels.

    Returns:
        list: One single-column ``DataFrame`` per retained class, ordered by
        decreasing class frequency. The column is named after the class and
        holds 1 where the row is positive for that class, 0 otherwise.
    """
    y = pd.read_csv(targets_path)
    y2 = y.iloc[:, 1:]  # label columns only (column 0 is the id)

    class_counts = y2.sum(axis=0).sort_values(ascending=False)
    retained_classes = class_counts.head(n_classes).index.values

    # One vectorised comparison per class instead of testing every row
    # number against an array of positive rows.
    return [
        pd.DataFrame({c: (y2[c] == 1).astype('int64').tolist()})
        for c in retained_classes
    ]

In [None]:
ovr_targets = generate_OVR_targets()

# Edit to use it for sample submission
nonscored_targets = pd.read_csv("../input/sample_submission.csv")
nonscored_targets.replace(0.5, 0, inplace=True)

In [None]:
### Can OHE and standard scale the X_train first
X_train, X_test = preprocess_X()

In [None]:
# Fit one binary classifier per OVR target and store its positive-class
# probability in the submission frame.
for i in ovr_targets:
    y_temp = copy.deepcopy(i)
    class_name = y_temp.columns[0]  # each OVR frame has a single column named after its class

    # Initialize the classifier
    ### Need to later check the correct model for a given feature
    # max_iter is passed as an integer: scikit-learn's parameter validation
    # (1.2+) rejects a float such as 1e10.
    model = linear_model.LogisticRegression(random_state=0, max_iter=int(1e10))

    # Fit the model
    model.fit(X_train, y_temp.values.ravel())

    # Create predictions
    y_pred_probs = model.predict_proba(X_test)

    # Update predicted probabilities. predict_proba orders its columns like
    # model.classes_, so for the 0/1 targets column 1 is P(class == 1);
    # column 0 would store the probability of NOT belonging to the class.
    nonscored_targets.loc[:,class_name] = y_pred_probs[:,1]

In [None]:
# Go through each row and find the column with the larget value
chosen_classes_per_row = nonscored_targets.iloc[:,1:].idxmax(axis=1)

In [None]:
# Reference: https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
# Keep only the winning class (and the id) in every row; all other columns are set to 0.
for index, row in nonscored_targets.iterrows():
    max_class = chosen_classes_per_row[index] # Subset the selected class
    row[row.index.isin([max_class, 'sig_id']) == False] = 0
    # iterrows() yields copies, so the edited row has to be written back
    nonscored_targets.iloc[index,:] = row

In [None]:
nonscored_targets.to_csv('../output/submission.csv', index=False)

In [None]:
submission_csv = pd.read_csv('../output/submission.csv')

In [None]:
submission_csv.head()

# end ovr_output.py

# Cross-Validation

## ovr_cv.py

First, create a simple set of predictions so that the log-loss calculation can be developed and checked.

- CV process
    - the preprocessing is tricky here, we need to keep the correct pieces consistent + independent
        - We have X_train and y_train, these we will need to split into k-folds.
        - The OHE can be done globally. Simple transformations like x1^2 or log() can be done globally.
        - Scaling, however, needs to be done independently for each split: fit the scaler on the training folds only, then apply that fitted scaler to the validation fold.
    - Issues:
        - Find a stratified k-fold splitter that works with multilabel targets.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
import copy

In [None]:
def preprocess_data(n_classes=13, n_splits=5, random_state=None):
    """Build the cross-validation frame used by the OVR experiments.

    Reads the training features and scored targets from ``../input``, adds a
    ``hidden_class`` label for rows without any positive label, one-hot
    encodes the categorical features, collapses the labels into a single
    ``target`` column (the first column holding the row maximum), shuffles
    the rows and assigns a stratified fold number to each row.

    Args:
        n_classes (int): Number of most frequent classes to report.
        n_splits (int): Number of stratified folds.
        random_state: Seed forwarded to ``DataFrame.sample`` for the shuffle;
            ``None`` keeps the shuffle unseeded.

    Returns:
        tuple: ``(df, chosen_classes)`` where ``df`` holds the encoded
        features plus the ``target`` and ``kfold`` columns, and
        ``chosen_classes`` is an array with the names of the ``n_classes``
        most frequent classes.
    """
    X = pd.read_csv("../input/train_features.csv")
    y = pd.read_csv("../input/train_targets_scored.csv")

    # Add hidden class: 1 for rows without any positive label
    # (column 0 of y is the row id, hence iloc[:, 1:])
    y['hidden_class'] = (y.iloc[:, 1:].sum(axis=1) == 0).astype('int64')

    class_counts = y.iloc[:, 1:].sum(axis=0)
    class_counts = class_counts.sort_values(ascending=False)
    chosen_classes = class_counts.head(n_classes).index.values

    X = X.drop(X.columns[0], axis=1)

    cat_cols = ['cp_type', 'cp_time', 'cp_dose'] # Identify categorical columns
    other_cols = [col for col in X.columns if col not in cat_cols]

    # Transform the data with OHE on the categorical columns
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), cat_cols)],
        remainder='passthrough')
    encoded = ct.fit_transform(X)

    # Column names come from the encoder that actually produced the data.
    # get_feature_names() is missing from newer scikit-learn, so prefer its
    # replacement and fall back to the old name on old installs.
    encoder = ct.named_transformers_['encoder']
    if hasattr(encoder, 'get_feature_names_out'):
        ohe_names = encoder.get_feature_names_out(cat_cols).tolist()
    else:
        ohe_names = encoder.get_feature_names(cat_cols).tolist()
    X = pd.DataFrame(encoded, columns=ohe_names + other_cols)

    # Reverse the OHE labels
    y = y.iloc[:, 1:].idxmax(axis=1)
    y = pd.DataFrame(y)
    y.columns = ["target"]

    df = pd.concat([X, y], axis=1)  # Recombine into single df

    df["kfold"] = -1  # Create k-folds column

    # Randomize the dataset
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    y = df.target.values  # Subset the target column

    # Initialize the stratified k-fold module from sklearn
    kf = StratifiedKFold(n_splits=n_splits)

    # Fill the 'kfold' column with the assigned folds
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, "kfold"] = f

    return df, chosen_classes

In [None]:
df, chosen_classes = preprocess_data()

In [None]:
def y_arr_to_df(y_arr, template_path='../input/sample_submission.csv'):
    """Change the y array into a y dataframe.

    The columns are those of the template CSV without ``sig_id``, plus a
    trailing ``hidden_class`` column. Row ``k`` holds 1 in the column named
    ``y_arr[k]`` and 0 everywhere else.

    Args:
        y_arr: Sequence of class names, one per row.
        template_path (str): CSV whose header supplies the class columns.

    Returns:
        pandas.DataFrame: One-hot frame with exactly ``len(y_arr)`` rows and
        float columns.

    Raises:
        KeyError: If a label in ``y_arr`` is not one of the columns.
    """
    # Only the header is needed, so no data rows are read
    columns = pd.read_csv(template_path, nrows=0).columns.drop('sig_id').tolist()
    columns.append('hidden_class')

    n_rows = len(y_arr)
    col_positions = pd.Index(columns).get_indexer(y_arr)
    unknown = col_positions < 0
    if unknown.any():
        missing = sorted({str(label) for label, bad in zip(y_arr, unknown) if bad})
        raise KeyError(f"labels not present in the template columns: {missing}")

    # The frame is sized from y_arr itself, so the result has the right
    # number of rows whether y_arr is longer or shorter than the template
    # file (this also avoids DataFrame.append, which pandas 2 removed).
    one_hot = np.zeros((n_rows, len(columns)))
    one_hot[np.arange(n_rows), col_positions] = 1

    return pd.DataFrame(one_hot, columns=columns)

In [None]:
def binary_vector_fun(y_df, chosen_classes):
    """Create a OVR binary vector for a set of chosen classes.

    Args:
        y_df (pandas.DataFrame): One column per class; a value of 1 marks a
            positive row.
        chosen_classes: Iterable of column names of ``y_df``.

    Returns:
        list: One single-column ``DataFrame`` per chosen class, in the order
        given. The column is named after the class and holds 1 where the
        corresponding row of ``y_df`` equals 1, 0 otherwise. Rows are matched
        by position and the result always has a fresh 0..n-1 index.
    """
    # A single vectorised comparison per class replaces the nested loop that
    # searched an array of positive rows for every row number. Working on
    # the raw values also keeps the result correct when y_df does not have
    # a default 0..n-1 index.
    return [
        pd.DataFrame({c: (y_df.loc[:, c].values == 1).astype('int64').tolist()})
        for c in chosen_classes
    ]

In [None]:
def multilabel_log_loss(y_valid, y_pred):
    """Calculate the log-loss for the multilabel case.

    The binary log-loss is computed for every (row, label) cell, averaged
    per label and then averaged over the labels.

    Args:
        y_valid: 2-D array-like (e.g. DataFrame) of true 0/1 labels.
        y_pred: 2-D array-like of predicted probabilities with the same
            shape as ``y_valid``. It is not modified.

    Returns:
        float: The mean column-wise log-loss.
    """
    dummy_zero = 1e-15  # Compensate for 0's and 1's predictions

    y_true = np.asarray(y_valid, dtype=float)
    # Clipping works on a fresh array, so the caller's predictions are left
    # untouched (replacing values in place would overwrite them).
    y_hat = np.clip(np.asarray(y_pred, dtype=float), dummy_zero, 1 - dummy_zero)

    # Same formula as the per-cell version, evaluated for the whole matrix
    cell_log_loss = y_true * np.log(y_hat) + (1 - y_true) * np.log(1 - y_hat)

    log_loss_score = -cell_log_loss.mean(axis=0).mean()

    return log_loss_score

In [None]:
def run_cv(fold, chosen_classes=chosen_classes):
    """Train and score the OVR logistic regressions on one fold.

    Reads the module-level ``df`` (its ``kfold`` column selects the rows).
    Prints the fold number and its multilabel log-loss; nothing is returned.

    Args:
        fold: Fold number held out for validation.
        chosen_classes: Class names to model. The default is the value the
            global ``chosen_classes`` had when this ``def`` was executed.
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Drop the target and convert to numpy
    x_train = df_train.drop(["target", "kfold"], axis=1).values
    y_train = df_train.target.values
    y_train_df = y_arr_to_df(y_arr=y_train)

    # Do OVR encoding on the training targets
    ovr_targets = binary_vector_fun(y_df=y_train_df, chosen_classes=chosen_classes)

    # Repeat for validation data
    x_valid = df_valid.drop(["target", "kfold"], axis=1).values
    y_valid = df_valid.target.values
    y_valid = y_arr_to_df(y_arr=y_valid)

    # Apply feature scaling to the numeric attributes
    # (the scaler is fitted on the training fold only; columns 0-6 are left unscaled)
    sc = StandardScaler()
    x_train[:, 7:] = sc.fit_transform(x_train[:, 7:])
    x_valid[:, 7:] = sc.transform(x_valid[:, 7:])

    ### Need a non-scored df of dimensions equal to validation set
    # Copy of the validation targets with every 1 replaced by 0
    non_scored_y_valid = copy.deepcopy(y_valid)
    non_scored_y_valid.replace(1, 0, inplace=True)

    ### So interestingly, this needs to be repeated for each of the chosen classes.
    for i in ovr_targets:
        y_temp = copy.deepcopy(i)
        class_name = y_temp.columns[0]

        # Initialize the classifier
        model = linear_model.LogisticRegression(random_state=0, max_iter=1e10)

        # Fit the model
        model.fit(x_train, y_temp.values.ravel())

        # Create predictions
        y_pred_probs = model.predict_proba(x_valid)

        # Fill the non-scored y's (column 1 of predict_proba is stored)
        non_scored_y_valid.loc[:,class_name] = y_pred_probs[:,1]

    # Go through each row and find the column with the largest value
    # NOTE(review): y_arr_to_df() already drops 'sig_id', so iloc[:,1:] leaves
    # the first class column out of the argmax - confirm this is intended.
    chosen_classes_per_row = non_scored_y_valid.iloc[:,1:].idxmax(axis=1)

    # Keep only the winning class in every row; all other columns are set to 0
    for index, row in non_scored_y_valid.iterrows():
        max_class = chosen_classes_per_row[index] # Subset the selected class
        row[row.index.isin([max_class, 'sig_id']) == False] = 0
        non_scored_y_valid.iloc[index,:] = row

    # drop the hidden_class column
    non_scored_y_valid.drop(['hidden_class'], axis=1, inplace=True)
    y_valid.drop(['hidden_class'], axis=1, inplace=True)

    log_loss_score = multilabel_log_loss(y_valid=y_valid, y_pred=non_scored_y_valid)

    print(f"Fold={fold}, Log-Loss={log_loss_score}")

In [None]:
for fold_ in range(5):
    run_cv(fold_)

## ovr_cv.py end

## iterstrat

In [None]:
def preprocess_data(n_classes=13, n_splits=5, random_state=0):
    """Preprocess the data.

    Reads the training features and scored targets from ``../input``, drops
    the first (id) column of both, adds a ``hidden_class`` label for rows
    without any positive label, one-hot encodes the categorical features and
    computes multilabel-stratified fold indices.

    Args:
        n_classes (int): Number of most frequent classes to report.
        n_splits (int): Number of folds.
        random_state: Seed for ``MultilabelStratifiedKFold``.

    Returns:
        tuple: ``(X, y, train_idx_list, valid_idx_list, chosen_classes)``.
        ``X`` is the encoded feature frame, ``y`` the label frame including
        ``hidden_class``, the two lists hold one index array per fold, and
        ``chosen_classes`` names the ``n_classes`` most frequent classes.
    """
    X = pd.read_csv("../input/train_features.csv")
    X = X.drop(X.columns[0], axis=1)
    y = pd.read_csv("../input/train_targets_scored.csv")
    y = y.drop(y.columns[0], axis=1)

    # Add hidden class. The id column has already been dropped, so every
    # column of y is a label: slicing with iloc[:, 1:] here would ignore the
    # first label and flag rows that are positive only for it as "hidden".
    y['hidden_class'] = (y.sum(axis=1) == 0).astype('int64')

    class_counts = y.sum(axis=0)
    class_counts = class_counts.sort_values(ascending=False)
    class_counts_sub = class_counts.head(n_classes)
    chosen_classes = class_counts_sub.index.values

    cat_cols = ['cp_type', 'cp_time', 'cp_dose'] # Identify categorical columns
    other_cols = [col for col in X.columns if col not in cat_cols]

    # Transform the data with OHE on the categorical columns
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), cat_cols)],
        remainder='passthrough')
    encoded = ct.fit_transform(X)

    # Column names come from the encoder that actually produced the data.
    # get_feature_names() is missing from newer scikit-learn, so prefer its
    # replacement and fall back to the old name on old installs.
    encoder = ct.named_transformers_['encoder']
    if hasattr(encoder, 'get_feature_names_out'):
        ohe_names = encoder.get_feature_names_out(cat_cols).tolist()
    else:
        ohe_names = encoder.get_feature_names(cat_cols).tolist()
    X = pd.DataFrame(encoded, columns=ohe_names + other_cols)

    train_idx_list = []; valid_idx_list = []
    mskf = MultilabelStratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_index, valid_index in mskf.split(X, y):
        train_idx_list.append(train_index)
        valid_idx_list.append(valid_index)

    return X, y, train_idx_list, valid_idx_list, chosen_classes

In [None]:
X, y, train_idx_list, valid_idx_list, chosen_classes = preprocess_data()

In [None]:
def binary_msfk_fun(y_df, chosen_classes):
    """Build one 0/1 target frame per chosen class (one-vs-rest encoding).

    ``y_df`` itself is left untouched: a renumbered copy is used so that row
    labels coincide with row positions.

    Returns a list with one single-column DataFrame per entry of
    ``chosen_classes`` (same order). The column carries the class name and
    holds 1 where that class equals 1 in ``y_df``, 0 elsewhere.
    """
    # Fresh frame with a 0..n-1 index; the caller's frame keeps its own index
    renumbered = y_df.reset_index(drop=True)
    row_count = y_df.shape[0]

    vectors = []
    for label in chosen_classes:
        # Positions of the rows that are positive for this class
        positive_rows = set(renumbered.index[renumbered.loc[:, label] == 1])
        flags = [1 if position in positive_rows else 0
                 for position in range(row_count)]
        vectors.append(pd.DataFrame({label: flags}))

    return vectors

In [None]:
def run_cv(fold, X, y, train_idx_list, valid_idx_list, chosen_classes, log_loss_list):
    """Run the cross-validation for one fold.

    Fits one logistic regression per chosen class on the training rows,
    predicts the validation rows, keeps only the highest-scoring class in
    each row, then prints the multilabel log-loss and appends it to
    ``log_loss_list`` (modified in place). Nothing is returned.

    Args:
        fold: Position of the fold inside the two index lists.
        X: Feature DataFrame.
        y: Label DataFrame; must contain a ``hidden_class`` column.
        train_idx_list: Per-fold positional indices of the training rows.
        valid_idx_list: Per-fold positional indices of the validation rows.
        chosen_classes: Class names to model.
        log_loss_list: List that receives this fold's score.
    """
    train_idx = train_idx_list[fold]; valid_idx = valid_idx_list[fold]

    ### These have shifted row names
    # (the selected rows keep their original index labels)
    x_train = X.iloc[train_idx,:].values; y_train = y.iloc[train_idx,:]
    x_valid = X.iloc[valid_idx,:].values; y_valid = y.iloc[valid_idx,:]

    # Apply feature scaling to the numeric attributes
    # (the scaler is fitted on the training fold only; columns 0-6 are left unscaled)
    sc = StandardScaler()
    x_train[:, 7:] = sc.fit_transform(x_train[:, 7:])
    x_valid[:, 7:] = sc.transform(x_valid[:, 7:])

    ### Need a non-scored df of dimensions equal to validation set
    # Copy of the validation targets with every 1 replaced by 0
    non_scored_y_valid = copy.deepcopy(y_valid)
    non_scored_y_valid.replace(1, 0, inplace=True)

    # Do OVR encoding on the training targets
    ovr_targets = binary_msfk_fun(y_df=y_train, chosen_classes=chosen_classes)

    for i in ovr_targets: # Loop through the OVR targets and fit a model
        y_temp = copy.deepcopy(i)
        class_name = y_temp.columns[0]

        # Initialize the classifier
        model = linear_model.LogisticRegression(random_state=0, max_iter=1e10)

    # Only the 0/1 values reach fit(): y_temp.values.ravel() drops the column
    # label, so the class name is not part of the training data.
        # Fit the model
        model.fit(x_train, y_temp.values.ravel())

        # Create predictions
        y_pred_probs = model.predict_proba(x_valid)

        # Fill the non-scored y's (column 1 of predict_proba is stored)
        non_scored_y_valid.loc[:, class_name] = y_pred_probs[:, 1]

    # Go through each row and find the column with the largest value
    # NOTE(review): iloc[:, 1:] leaves column 0 of the label frame out of the
    # argmax; if `y` no longer carries an id column that is a real class - confirm.
    chosen_classes_per_row = non_scored_y_valid.iloc[:, 1:].idxmax(axis=1)

    # Keep only the winning class in every row; all other columns are set to 0.
    # Rows are addressed by label (.loc) because the original index was kept.
    for index, row in non_scored_y_valid.iterrows():
        max_class = chosen_classes_per_row[index]  # Subset the selected class
        row[row.index.isin([max_class]) == False] = 0
        non_scored_y_valid.loc[index, :] = row

    # drop the hidden_class column
    non_scored_y_valid.drop(["hidden_class"], axis=1, inplace=True)
    y_valid = y_valid.drop(["hidden_class"], axis=1)

    log_loss_score = multilabel_log_loss(y_valid=y_valid, y_pred=non_scored_y_valid)

    print(f"Fold={fold}, Log-Loss={log_loss_score}")
    log_loss_list.append(log_loss_score)

In [None]:
log_loss_list = []
for fold_ in range(5):
    run_cv(fold_, X, y, train_idx_list, valid_idx_list, chosen_classes, log_loss_list)

In [None]:
# Collect the train/test row indices of every split into two lists.
# NOTE(review): in the cells shown above `mskf` only exists as a local variable
# inside preprocess_data(); this cell needs a notebook-level `mskf` to run - confirm
# where it is defined. It also overwrites the `train_idx_list` returned earlier.
train_idx_list = []; test_idx_list = []
for train_index, test_index in mskf.split(X, y):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
#     y_train, y_test = y.iloc[train_index,:], y.iloc[test_index,:]
    train_idx_list.append(train_index)
    test_idx_list.append(test_index)

## end iterstrat

## cf_param_search.py

In [None]:
# from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import itertools

In [None]:
# from xgboost import XGBClassifier

In [None]:
# decision tree, kernelsvm

In [None]:
# Classifiers included in the search ('dt' and 'xgb' have grids below but
# are not searched yet).
# clf_list = ['log_reg', 'svm', 'rf', 'knn', 'nb', 'xgb']
clf_list = ['log_reg', 'svm', 'rf', 'knn', 'nb']

# Hyper-parameter grid per classifier name.
# NOTE: set_model_params() reads each combination by POSITION, so the key
# order inside every sub-dict must match the order it expects.
param_grid = {
    'log_reg' : {
        'Penalty': ['l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    'svm': {
        'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
        'gamma': ['auto'],
        'class_weight': ['balanced', None],
        'probability': [True]
    },
    'rf': {
        'n_estimators': [120, 300, 500, 800, 1200],
        'max_depth': [5, 8, 15, 25, 30, None],
        # an integer min_samples_split must be at least 2 in scikit-learn
        'min_samples_split': [2, 5, 10, 15, 100],
        'min_samples_leaf': [1, 2, 5, 10],
        'max_features': ['log2', 'sqrt', None]
    },
    # keyed 'dt' to match the name set_model_params() checks for
    'dt': {
        'max_depth': [5, 8, 15, 25, 30, None],
        # an integer min_samples_split must be at least 2 in scikit-learn
        'min_samples_split': [2, 5, 10, 15, 100],
        'min_samples_leaf': [1, 2, 5, 10],
        'max_features': ['log2', 'sqrt', None]
    },
    'knn': {
        'n_neighbors': [2, 4, 8, 16, 32, 64],
        'p': [2, 3]
    },
    'nb': {
        # GaussianNB is built without parameters; a single placeholder value
        # yields exactly one combination instead of two identical ones
        'dummy_param': [None]
    },
    'xgb': { # Reference: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
        'eta': [0.01, 0.015, 0.025, 0.05, 0.1],
        'gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
        'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
        'min_child_weight': [1, 3, 5, 7],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'lambda': [0.01, 0.1, 1.0],
        'alpha': [0, 0.1, 0.5, 1.0]
    }
}

In [None]:
def set_model_params(clf_name, params):
    """Set the parameters for a model during grid search.

    Args:
        clf_name (str): One of 'log_reg', 'svm', 'rf', 'dt', 'knn', 'nb',
            'xgb'.
        params: Sequence of parameter values, read by position:
            log_reg (penalty, C); svm (C, gamma, class_weight, probability);
            rf (n_estimators, max_depth, min_samples_split,
            min_samples_leaf, max_features); dt (max_depth,
            min_samples_split, min_samples_leaf, max_features);
            knn (n_neighbors, p); nb ignores ``params``;
            xgb (learning_rate, gamma, max_depth, min_child_weight,
            subsample, colsample_bytree, reg_lambda, reg_alpha).

    Returns:
        The unfitted estimator.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    if clf_name == 'log_reg':
        model = LogisticRegression(
            penalty=params[0],
            C=params[1],
            random_state=0,
            # passed as an integer: scikit-learn's parameter validation
            # (1.2+) rejects a float such as 1e10
            max_iter=int(1e10),
        )
    elif clf_name == 'svm':
        model = SVC(
            C=params[0],
            gamma=params[1],
            class_weight=params[2],
            probability=params[3]
        )
    elif clf_name == 'rf':
        model = RandomForestClassifier(
            n_estimators=params[0],
            max_depth=params[1],
            min_samples_split=params[2],
            min_samples_leaf=params[3],
            max_features=params[4]
        )
    elif clf_name == 'dt':
        model = DecisionTreeClassifier(
            max_depth=params[0],
            min_samples_split=params[1],
            min_samples_leaf=params[2],
            max_features=params[3]
        )
    elif clf_name == 'knn':
        model = KNeighborsClassifier(
            n_neighbors=params[0],
            p=params[1]
        )
    elif clf_name == 'nb':
        model = GaussianNB()
    elif clf_name == 'xgb':
        # NOTE(review): the XGBClassifier import is commented out in the
        # notebook, so this branch needs it restored before 'xgb' is used.
        model = XGBClassifier(
            learning_rate=params[0],
            gamma=params[1],
            max_depth=params[2],
            min_child_weight=params[3],
            subsample=params[4],
            colsample_bytree=params[5],
            reg_lambda=params[6],
            reg_alpha=params[7]
        )
    else:
        # Without this branch an unknown name fell through to `return model`
        # and failed with an unrelated UnboundLocalError.
        raise ValueError(f"Unknown classifier name: {clf_name!r}")

    return model

In [None]:
clf_idx = clf_list[1]
clf_param_grid = param_grid[clf_idx]
param_names = sorted(clf_param_grid) # Create parameter combinations

In [None]:
# Build one estimator for every parameter combination of every classifier.
# (the estimators are only constructed here, not fitted)
for clf_idx in clf_list: # Loop through models
    clf_param_grid = param_grid[clf_idx]

    # Keys are taken in the grid's own (insertion) order, so each combination
    # is ordered like the sub-dict in `param_grid`.
    param_names = [key for key in clf_param_grid.keys()] # Create parameter combinations
    param_combos = itertools.product(*(clf_param_grid[p_name] for p_name in param_names))
    param_combos_list = list(param_combos)

    for p_combo_idx in range(len(param_combos_list)): # Loop through parameters
        # Initialize the classifier
        param_combo_ = param_combos_list[p_combo_idx]
#         print(param_combo_)
        model_ = set_model_params(clf_name=clf_idx, params=param_combo_)

Loop through models

In [None]:
clf_idx = clf_list[0]
clf_param_grid = param_grid[clf_idx]
clf_param_grid

In [None]:
clf_name = 'log_reg'
params = param_combos_list[0]

In [None]:
# Build one estimator for every parameter combination of every classifier.
for clf_idx in clf_list: # Loop through models
    clf_param_grid = param_grid[clf_idx]

    # Keep the grid's own key order. set_model_params() reads the values by
    # position, and sorting the names alphabetically reshuffles them (e.g.
    # 'C' would come before 'Penalty', so C would be passed as the penalty).
    param_names = list(clf_param_grid) # Create parameter combinations
    param_combos = itertools.product(*(clf_param_grid[p_name] for p_name in param_names))
    param_combos_list = list(param_combos)

    for p_combo_idx in range(len(param_combos_list)): # Loop through model parameters
        temp_param_combo = param_combos_list[p_combo_idx]  # e.g. (Penalty, C) for 'log_reg'
        model = set_model_params(clf_name=clf_idx, params=temp_param_combo)

Loop through parameters

In [None]:
param_test = parameters['log_reg']
param_test

In [None]:
param_names = sorted(param_test)
param_combos = itertools.product(*(param_test[p_name] for p_name in param_names))
param_combos_list = list(param_combos)
param_combos_list

In [None]:
model = LogisticRegression(
    penalty=temp_param_combo[1],
    C=temp_param_combo[0],
    random_state=0,
    max_iter=1e10,
)

In [None]:
log_reg = LogisticRegression()
nb = GaussianNB()
svm = SVC()

# pipeline = Pipeline([
#     ('logreg', log_reg),
#     ('nb', nb),
#     ('svm', svm)
# ])

In [None]:
log_reg

In [None]:
parameters[0]

## end cf_param_search.py

## model_selection.py

In [None]:
# from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import itertools

Currently:
- loop through models
    - loop through each model's parameter combinations

## end model_selection.py

# End CV

## 4.1 Test a model
<a id="test1"></a>
<a href="#top">Back to top</a>

In [None]:
y_df_template[y_arr[0]].iloc[3]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [None]:
X = pd.read_csv('../input/train_features.csv')
X.head()

test add a new feature

In [None]:
X.shape[0]

In [None]:
new_feature = list(range(X.shape[0]))
X['new_feature'] = new_feature

In [None]:
X.head()

In [None]:
X.tail()

end test to add feature

In [None]:
y3 = binary_vector_list[0]
y3.head()

In [None]:
X.drop(X.columns[0], axis=1, inplace=True)
# Save the column names
X_col_names = X.columns.tolist()

cat_cols = ['cp_type', 'cp_time', 'cp_dose'] # Identify categorical columns

ohe = OneHotEncoder() # Load OHE

# Get the column names after OHE
# Reference: https://stackoverflow.com/questions/54570947/feature-names-from-onehotencoder
_ = ohe.fit_transform(X[cat_cols])
ohe_names = ohe.get_feature_names(cat_cols)
ohe_names = ohe_names.tolist()

# Fix new column names to include OHE names and normal feature names
X_col_names = [col for col in X_col_names if col\
    not in cat_cols]
ohe_names.extend(X_col_names)

# Transform the data with OHE on the indices of the cat variables
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), list(range(0,3)))],
    remainder='passthrough')
X = pd.DataFrame(ct.fit_transform(X))
X.columns = ohe_names

In [None]:
X.head()

In [None]:
y3.columns = ['target']

In [None]:
df = pd.concat([X, y3], axis=1) # Recombine into single df

df['kfold'] = -1 # Create k-folds column

df = df.sample(frac=1).reset_index(drop=True) # Randomize the dataset

y = df.target.values # Subset the target column

# Initialize the stratified k-fold module from sklearn
kf = StratifiedKFold(n_splits=5)

# Fill the 'kfold' column with the assigned folds
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f

In [None]:
df.head()

In [None]:
from sklearn import linear_model
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [None]:
def run(fold):
    """Fit and evaluate a logistic regression on one fold of the global ``df``.

    Rows whose ``kfold`` equals ``fold`` form the validation set; all other
    rows are used for training. Prints the fold's log-loss, ROC AUC and
    accuracy; nothing is returned.

    Args:
        fold: Fold number held out for validation.
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Drop the target and convert to numpy
    x_train = df_train.drop(['target', 'kfold'], axis=1).values
    y_train = df_train.target.values

    # Repeat for validation data
    x_valid = df_valid.drop(['target', 'kfold'], axis=1).values
    y_valid = df_valid.target.values

    # Apply feature scaling to the numeric attributes
    # (the scaler is fitted on the training fold only; columns 0-6 are left unscaled)
    sc = StandardScaler()
    x_train[:,7:] = sc.fit_transform(x_train[:,7:])
    x_valid[:,7:] = sc.transform(x_valid[:,7:])

    # Initialize the classifier (max_iter as an integer: scikit-learn's
    # parameter validation (1.2+) rejects a float such as 1e10)
    model = linear_model.LogisticRegression(random_state=0, max_iter=int(1e10))

    # Fit the model
    model.fit(x_train, y_train)

    # Create predictions
    y_pred_probs = model.predict_proba(x_valid)
    y_preds = model.predict(x_valid)

    # Calculate and print the metrics
    log_loss_score = metrics.log_loss(y_valid, y_pred_probs,
        labels=model.classes_)
    # ROC AUC is a ranking metric, so it is computed from the positive-class
    # probability; hard 0/1 predictions collapse the curve to one point.
    auc = metrics.roc_auc_score(y_valid, y_pred_probs[:, 1])
    accuracy = metrics.accuracy_score(y_valid, y_preds)

    print(f"Fold={fold}, Log-Loss={log_loss_score}, AUC={auc}, Accuracy={accuracy}")

In [None]:
if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)

Do CV for all the $c$ classes:

In [None]:
import copy

In [None]:
# ---------------------------------------------------------------------------
# Loop-invariant preparation: the encoded feature matrix is identical for
# every class, so it is built once here instead of once per class.
# ---------------------------------------------------------------------------

# Load the features and drop the first column (the unique identifier)
X = pd.read_csv('../input/train_features.csv')
X = X.drop(columns=X.columns[0])

# Save the column names
X_col_names = X.columns.tolist()

cat_cols = ['cp_type', 'cp_time', 'cp_dose'] # Identify categorical columns

ohe = OneHotEncoder() # Load OHE

# Get the column names after OHE
# Reference: https://stackoverflow.com/questions/54570947/feature-names-from-onehotencoder
ohe.fit(X[cat_cols])
# get_feature_names was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_names = ohe.get_feature_names_out(cat_cols).tolist()
else:
    ohe_names = ohe.get_feature_names(cat_cols).tolist()

# Fix new column names to include OHE names and normal feature names
X_col_names = [col for col in X_col_names if col not in cat_cols]
ohe_names.extend(X_col_names)

# Transform the data with OHE on the categorical columns. Selecting them by
# name keeps the encoder in step with `cat_cols` (and with `ohe_names`)
# even if the column order of the CSV changes.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), cat_cols)],
    remainder='passthrough')
X = pd.DataFrame(ct.fit_transform(X))
X.columns = ohe_names

# ---------------------------------------------------------------------------
# Per-class cross-validation: one binary target vector per iteration
# ---------------------------------------------------------------------------
for i in binary_vector_list:
    # Work on a copy so renaming the column does not alter the list entry
    y_temp = copy.deepcopy(i)
    class_name = y_temp.columns[0]
    y_temp.columns = ['target']

    df = pd.concat([X, y_temp], axis=1) # Recombine into single df

    df['kfold'] = -1 # Create k-folds column

    # Randomize the dataset; the fixed seed makes the folds reproducible
    df = df.sample(frac=1, random_state=0).reset_index(drop=True)

    y = df.target.values # Subset the target column

    # Initialize the stratified k-fold module from sklearn
    kf = StratifiedKFold(n_splits=5)

    # Fill the 'kfold' column with the assigned folds
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f

    print(class_name)
    # run() reads the notebook-level `df` that was just rebuilt for this class
    for fold_ in range(kf.get_n_splits()):
        run(fold_)

### A simple goal: Try to create predictions for the chosen classes, and output 0's for the others. Make it scalable, so that the chosen classes can be altered.

### Need: Output the predicted probabilities (not hard class labels) so that the log-loss can be computed.

In [None]:
# Load train_targets_nonscored.csv from the input directory
blank_targets = pd.read_csv('../input/train_targets_nonscored.csv')

In [None]:
# Preview the first rows of the non-scored targets
blank_targets.head()

In [None]:
# Order the counts from largest to smallest, then display them.
# NOTE(review): `class_counts` is defined in an earlier cell -- presumably the
# number of positive examples per class; confirm against its definition.
class_counts = class_counts.sort_values(ascending=False)
class_counts
# class_counts_sub = class_counts.head(12)