In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from src.logger import get_logger
from src.preprocess import Preprocess

from src.models.rf import RF
from src.models.knn import KNN
from src.models.svc import SVC
from src.models.adaboost import Adaboost
from src.models.mlp import MLP

import logging
import warnings

# Ask the project helper for the logger named 'src', passing INFO as the level.
logger = get_logger('src', logging.INFO)

# Blanket filter: every Python warning raised after this point is ignored.
warnings.filterwarnings('ignore')

In [3]:
# Training table: the first CSV column is used as the row index.
logger.info('Reading train data')
raw_train_df: pd.DataFrame = pd.read_csv('data/train.csv', index_col=0)
# Labels come straight from the raw training frame, before any preprocessing.
y_train: pd.Series = raw_train_df['Survived']

# Test table, read the same way as the training one.
logger.info('Reading test data')
raw_test_df: pd.DataFrame = pd.read_csv('data/test.csv', index_col=0)

# Both raw frames go through the project's Preprocess pipeline, configured
# with a StandardScaler; it hands back the train and test feature sets.
logger.info('Preprocessing data')
x_train, x_test = Preprocess(
    scaler=StandardScaler(),
).apply(raw_train_df, raw_test_df)

2022-07-10 22:27:42,269 - INFO - Reading train data
2022-07-10 22:27:42,291 - INFO - Reading test data
2022-07-10 22:27:42,293 - INFO - Preprocessing data
2022-07-10 22:27:42,295 - INFO - Selecting only the desired columns
2022-07-10 22:27:42,296 - INFO - Filling gaps from Age and Embarked columns
2022-07-10 22:27:42,299 - INFO - Applying one-hot encoding to categorical columns
2022-07-10 22:27:42,304 - INFO - Dropping low variance columns
2022-07-10 22:27:42,305 - INFO - Filling gaps from Age and Embarked columns
2022-07-10 22:27:42,307 - INFO - Applying one-hot encoding to categorical columns
2022-07-10 22:27:42,311 - INFO - Dropping low variance columns
2022-07-10 22:27:42,313 - INFO - Scaling the data


In [4]:
logger.info('Calculating predictions')
# One column of test-set predictions per model. Each project wrapper's
# predict() receives the training features, the training labels and the test
# features.
model_predictions: pd.DataFrame = pd.DataFrame({
    'rf': RF().predict(x_train, y_train, x_test),
    'knn': KNN().predict(x_train, y_train, x_test),
    # 'svc': SVC().predict(x_train, y_train, x_test),
    'adaboost': Adaboost().predict(x_train, y_train, x_test),
    'mlp': MLP().predict(x_train, y_train, x_test),
})

# workaround for a SVC unknown error: the live SVC call above is disabled and
# its predictions are loaded from a previously saved CSV instead.
# NOTE(review): assumes the file holds a single column of predictions in the
# same row order as data/test.csv - confirm.
model_predictions['svc'] = pd.read_csv(
    'grid_search/suport_vector_machine/svc_predictions.csv',
)

# Majority vote across the model columns only. The vote is taken before
# PassengerId is attached, so the id can never take part in it.
# DataFrame.mode(axis=1) yields one column per tied mode, sorted ascending;
# keeping the first column means a tie resolves to the lowest label instead of
# failing on a multi-column assignment.
votes: pd.DataFrame = model_predictions.mode(axis=1)

# Final two-column submission frame. The cast keeps Survived integral even if
# the mode frame was upcast to float - assumes labels are integer classes
# (0/1), TODO confirm.
predictions: pd.DataFrame = pd.DataFrame({
    'PassengerId': raw_test_df.index,
    'Survived': votes.iloc[:, 0].astype(int).to_numpy(),
})

logger.info('Saving predictions to csv')
predictions.to_csv('data/output.csv', index=False)

2022-07-10 22:27:42,321 - INFO - Calculating predictions
2022-07-10 22:27:42,322 - INFO - Predicting with Random Forest model
2022-07-10 22:27:42,395 - INFO - Predicting with KNN model
2022-07-10 22:27:42,403 - INFO - Predicting with Adaboost model
2022-07-10 22:27:42,407 - INFO - Predicting with MLP model
2022-07-10 22:27:42,562 - INFO - Saving predictions to csv
