# Tabular AutoML

# Viewing the data sources

In [None]:
from pathlib import Path

In [None]:
# notebook-wide constants
DATA_DIR = Path("/kaggle/input")
RANDOM_STATE = 24

# walk DATA_DIR recursively and print every entry found (files and folders)
for filepath in DATA_DIR.rglob("*"):
    print(filepath)

# Modelling

In [None]:
!pip install tabular-automl==0.2.0a1

In [None]:
import copy

from tabular_automl import TabularAutoML, TabularData

In [None]:
# utility functions
def get_file_paths(source_dir, file_patterns=None):
    """Recursively collect paths below ``source_dir``.

    Args:
        source_dir: ``pathlib.Path`` of the directory to search.
        file_patterns: Optional substring, or iterable of substrings. When
            omitted, every regular file below ``source_dir`` is returned
            (directories are dropped). When given, every path matching the
            glob ``*<pattern>*`` is returned; directories are kept in this
            mode.

    Returns:
        Sorted list of the matching paths, each path listed once.
    """
    if file_patterns is None:
        # no filter requested: keep regular files only
        return sorted(path for path in source_dir.rglob("*") if path.is_file())

    if isinstance(file_patterns, str):
        # a lone string would otherwise be iterated character by character
        file_patterns = [file_patterns]

    # a path can match several patterns; the set keeps it only once
    matches = set()
    for pattern in file_patterns:
        matches.update(source_dir.rglob(f"*{pattern}*"))
    return sorted(matches)


def _find_single_path(data_path, pattern):
    """Return the only path under ``data_path`` matching ``pattern``, or None."""
    matches = get_file_paths(data_path, file_patterns=[pattern])
    if len(matches) > 1:
        raise ValueError(
            f"expected one path matching '{pattern}' under {data_path}, "
            f"found {len(matches)}: {matches}"
        )
    return matches[0] if matches else None


def get_data(data_path, subsets=("train", "test"), index_col="id"):
    """Load the train and test tables found below ``data_path``.

    Each file is located by its own name pattern ("train" / "test") instead
    of relying on the alphabetical order of the combined matches.

    Args:
        data_path: ``pathlib.Path`` of the folder holding the data files.
        subsets: Currently unused; kept so existing callers keep working.
        index_col: Column passed to ``TabularData`` as the index.

    Returns:
        Tuple ``(train, test)`` built from the ``.data`` attribute of the
        loaded ``TabularData`` objects; ``test`` is ``None`` when no test
        file exists.

    Raises:
        ValueError: If no train file is found, or if more than one path
            matches either pattern.
    """
    train_file_path = _find_single_path(data_path, "train")
    if train_file_path is None:
        raise ValueError(f"no path matching 'train' found under {data_path}")
    train_data = TabularData(train_file_path, index_col=index_col)

    test_file_path = _find_single_path(data_path, "test")
    if test_file_path is None:
        # a dataset without a test split is still usable for training
        return train_data.data, None
    test_data = TabularData(test_file_path, index_col=index_col)
    return train_data.data, test_data.data

def create_pipeline(train_data, test_data, target_col="target", task_type="regression"):
    """Build a ``TabularAutoML`` pipeline for the given data, target and task type."""
    return TabularAutoML(
        train_data,
        test_data=test_data,
        target_col=target_col,
        task_type=task_type,
    )

def train_model(pipeline, config):
    """Return the best model the pipeline selects for ``config``.

    Only ``pipeline.get_best_model`` is run; the follow-up tuning
    (``pipeline.tune_model``) and finalisation (``pipeline.finalize_model``)
    steps are currently disabled.
    """
    return pipeline.get_best_model(config)

def get_predictions(model, test_data=None, predict_proba=False):
    """Predict with ``model`` through the notebook's current pipeline.

    Reads the notebook globals ``pipeline`` and ``task_type``, so both must
    be assigned before this is called. ``display`` is not imported in this
    file and is expected to come from the notebook environment.

    Args:
        model: Estimator handed to ``pipeline.predict_model``.
        test_data: Data to predict on, forwarded as ``data``.
        predict_proba: Forwarded as ``raw_score``; only used when the global
            ``task_type`` is "classification".

    Returns:
        The object returned by ``pipeline.predict_model``.
    """
    predict_kwargs = {"estimator": model, "data": test_data}
    if task_type == "classification":
        # raw scores are only requested for classification tasks
        predict_kwargs["raw_score"] = predict_proba
    predictions = pipeline.predict_model(**predict_kwargs)
    display(predictions.head())
    return predictions

def create_submission(model, test_data, label_col="Label", multiclass=False):
    """Predict on ``test_data`` and write ``<month>_submission.csv``.

    Reads the notebook globals ``month`` (file name prefix) and
    ``target_col`` (name given to the prediction column), so both must be
    assigned before this is called.

    Args:
        model: Estimator used for the predictions.
        test_data: Data to predict on.
        label_col: Prediction column to submit when ``multiclass`` is False.
        multiclass: When True, raw scores are requested and every column
            whose name contains "Score" is submitted, with the "Score_"
            prefix stripped from the column names.
    """
    predictions = get_predictions(
        model, test_data=test_data, predict_proba=multiclass
    )
    if multiclass:
        label_cols = [col for col in predictions.columns if "Score" in col]
        col_mapper = {col: col.replace("Score_", "") for col in label_cols}
        submission = predictions[label_cols].rename(columns=col_mapper).reset_index()
    else:
        submission = predictions[label_col].rename(target_col).reset_index()

    display(submission.head())
    # reset_index() already moved the ids into a regular column, so the
    # leftover positional index must not be written as an extra column
    submission.to_csv(f"{month}_submission.csv", index=False)

In [None]:
# configuration shared by every month unless a cell overrides it
config = {
    # sampling fraction of 1/10
    "sampling": {"sample_frac": 0.1},
    "setup": {"silent": True},
}

In [None]:
# Jan 2021 TPS
# NOTE: `month`, `target_col`, `task_type` and `pipeline` are module-level on
# purpose: `get_predictions` and `create_submission` read them as globals, so
# they must be assigned before those helpers run.
month = "jan"
index_col = "id"
target_col = "target"
task_type = "regression"

# first sorted path under DATA_DIR matching the month tag
# (presumably the month's dataset folder; verify against the attached inputs)
data_path = get_file_paths(DATA_DIR, file_patterns=[month])[0]
train_data, test_data = get_data(data_path, index_col=index_col)
pipeline = create_pipeline(
    train_data, test_data, target_col=target_col, task_type=task_type
)
model = train_model(pipeline, config)
# writes "jan_submission.csv" to the working directory
create_submission(model, test_data)

In [None]:
# Feb 2021 TPS
# NOTE: `month`, `target_col`, `task_type` and `pipeline` are module-level on
# purpose: `get_predictions` and `create_submission` read them as globals, so
# they must be assigned before those helpers run.
month = "feb"
index_col = "id"
target_col = "target"
task_type = "regression"

# first sorted path under DATA_DIR matching the month tag
# (presumably the month's dataset folder; verify against the attached inputs)
data_path = get_file_paths(DATA_DIR, file_patterns=[month])[0]
train_data, test_data = get_data(data_path, index_col=index_col)
pipeline = create_pipeline(
    train_data, test_data, target_col=target_col, task_type=task_type
)
model = train_model(pipeline, config)
# writes "feb_submission.csv" to the working directory
create_submission(model, test_data)

In [None]:
# March 2021 TPS
# NOTE: `month`, `target_col`, `task_type` and `pipeline` are module-level on
# purpose: `get_predictions` and `create_submission` read them as globals, so
# they must be assigned before those helpers run.
month = "mar"
index_col = "id"
target_col = "target"
task_type = "classification"

# first sorted path under DATA_DIR matching the month tag
# (presumably the month's dataset folder; verify against the attached inputs)
data_path = get_file_paths(DATA_DIR, file_patterns=[month])[0]
train_data, test_data = get_data(data_path, index_col=index_col)
pipeline = create_pipeline(
    train_data, test_data, target_col=target_col, task_type=task_type
)
model = train_model(pipeline, config)
# submits the "Score" prediction column instead of the default "Label" one
# NOTE(review): confirm "Score" holds the value this competition expects
create_submission(model, test_data, label_col="Score")

In [None]:
# April 2021 TPS
# deep copy so the shared `config` stays untouched for the other months;
# only the sampling fraction is overridden (1/20 instead of 1/10)
april_config = copy.deepcopy(config)
april_config["sampling"] = dict(sample_frac=round(1/20, 2))

# NOTE: `month`, `target_col`, `task_type` and `pipeline` are module-level on
# purpose: `get_predictions` and `create_submission` read them as globals, so
# they must be assigned before those helpers run.
month = "apr"
index_col = "PassengerId"
target_col = "Survived"
task_type = "classification"

# first sorted path under DATA_DIR matching the month tag
# (presumably the month's dataset folder; verify against the attached inputs)
data_path = get_file_paths(DATA_DIR, file_patterns=[month])[0]
train_data, test_data = get_data(data_path, index_col=index_col)
pipeline = create_pipeline(
    train_data, test_data, target_col=target_col, task_type=task_type
)
# train with the April-specific config defined at the top of this cell
model = train_model(pipeline, april_config)
# writes "apr_submission.csv" to the working directory
create_submission(model, test_data)

In [None]:
# May 2021 TPS
# NOTE: `month`, `target_col`, `task_type` and `pipeline` are module-level on
# purpose: `get_predictions` and `create_submission` read them as globals, so
# they must be assigned before those helpers run.
month = "may"
index_col = "id"
target_col = "target"
task_type = "classification"

# first sorted path under DATA_DIR matching the month tag
# (presumably the month's dataset folder; verify against the attached inputs)
data_path = get_file_paths(DATA_DIR, file_patterns=[month])[0]
train_data, test_data = get_data(data_path, index_col=index_col)
pipeline = create_pipeline(
    train_data, test_data, target_col=target_col, task_type=task_type
)
model = train_model(pipeline, config)
# multiclass=True: raw scores are requested and every "Score*" column is
# submitted with the "Score_" prefix stripped
create_submission(model, test_data, multiclass=True)

In [None]:
# June 2021 TPS
# NOTE: `month`, `target_col`, `task_type` and `pipeline` are module-level on
# purpose: `get_predictions` and `create_submission` read them as globals, so
# they must be assigned before those helpers run.
month = "jun"
index_col = "id"
target_col = "target"
task_type = "classification"

# first sorted path under DATA_DIR matching the month tag
# (presumably the month's dataset folder; verify against the attached inputs)
data_path = get_file_paths(DATA_DIR, file_patterns=[month])[0]
train_data, test_data = get_data(data_path, index_col=index_col)
pipeline = create_pipeline(
    train_data, test_data, target_col=target_col, task_type=task_type
)
model = train_model(pipeline, config)
# multiclass=True: raw scores are requested and every "Score*" column is
# submitted with the "Score_" prefix stripped
create_submission(model, test_data, multiclass=True)

In [None]:
# Aug 2021 TPS
# NOTE: `month`, `target_col`, `task_type` and `pipeline` are module-level on
# purpose: `get_predictions` and `create_submission` read them as globals, so
# they must be assigned before those helpers run.
month = "aug"
index_col = "id"
target_col = "loss"
task_type = "regression"

# first sorted path under DATA_DIR matching the month tag
# (presumably the month's dataset folder; verify against the attached inputs)
data_path = get_file_paths(DATA_DIR, file_patterns=[month])[0]
train_data, test_data = get_data(data_path, index_col=index_col)
pipeline = create_pipeline(
    train_data, test_data, target_col=target_col, task_type=task_type
)
model = train_model(pipeline, config)
# writes "aug_submission.csv" to the working directory
create_submission(model, test_data)

# 