In [None]:
import numpy as np
import pandas as pd

import pyarrow.csv as pv
import pyarrow.parquet as pq

import time

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder

---

* In this short notebook I demonstrate two things:

    1. How to load the data quickly with **parquet**.
    2. The **One-vs-Rest** strategy for multi-class classification.
    
---

**1. Parquet**

First I load the train data in `csv` format; this takes around **16 seconds**. After that I save the dataframe in `parquet` format using the `.to_parquet()` method of pandas. In the last step I load the `parquet` data file, which takes less than **1 second**.

In [None]:
# Step 1 (one-off): parse the competition CSVs, using `row_id` as the index,
# and immediately persist each frame as a parquet file in the working directory.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col='row_id',
    low_memory=True,
)
train.to_parquet('train_parquet.parquet')

test = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col='row_id',
    low_memory=True,
)
test.to_parquet('test_parquet.parquet')

# Step 2: reload both frames from the parquet files written above; the rest of
# the notebook works with these parquet-backed copies.
train_parquet = pd.read_parquet('train_parquet.parquet')
test_parquet = pd.read_parquet('test_parquet.parquet')

---
**2. One-Vs-Rest for Multi-Class Classification**

The One-vs-Rest strategy splits a multi-class classification into one binary classification problem per class.

One-vs-rest (OvR for short, also referred to as One-vs-All or OvA) is a heuristic method for using binary classification algorithms for multi-class classification.

It involves splitting the multi-class dataset into multiple binary classification problems. A binary classifier is then trained on each binary classification problem and predictions are made using the model that is the most confident.

This approach requires that each model predicts a class membership probability or a probability-like score. The argmax of these scores (class index with the largest score) is then used to predict a class.

One advantage of this approach is its interpretability. Since each class is represented by one and only one classifier, it is possible to gain knowledge about the class by inspecting its corresponding classifier.

A possible downside of this approach is that it requires one model to be created for each class.


In [None]:
# Seed shared by every per-class model so that a fresh Restart & Run All
# reproduces the same predictions (extra-trees are randomized).
SEED = 42

list_target = list(train_parquet["target"].unique())
list_features = [col for col in train_parquet.columns if col != "target"]

# One indicator column per class, named exactly after the class label (empty
# prefix and separator), so train_dummy[label] is that class's binary target.
train_dummy = pd.get_dummies(train_parquet, columns=["target"], prefix_sep="", prefix="")

# The feature matrices are the same for every class, so build them once.
# Selecting list_features on the test frame as well guarantees the models
# predict on the same columns, in the same order, as they were fitted on.
X = train_dummy[list_features]
X_test = test_parquet[list_features]

# One probability column per class, indexed like the test set.
pred = pd.DataFrame(index=test_parquet.index)

for target_cat in list_target:
    # Binary problem: "is this row target_cat?" versus all other classes.
    y = train_dummy[target_cat]

    clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, random_state=SEED)
    clf.fit(X, y)

    # predict_proba columns follow the sorted clf.classes_, so column 1 is the
    # positive ("is target_cat") class as long as both labels occur in y.
    pred[target_cat] = clf.predict_proba(X_test)[:, 1]

# Final label: the class whose one-vs-rest model is the most confident.
pred['target'] = pred[list_target].idxmax(axis=1)

In [None]:
# Keep only the predicted label column; the frame's index is written out as
# the first CSV column alongside it.
submission = pred.loc[:, "target"]
submission.to_csv("submission.csv", index=True)