In [None]:
import pandas as pd
import numpy as np
import sklearn

# Record the versions of the core libraries this notebook was run with,
# one per line (pandas, numpy, scikit-learn).
print(
    pd.__version__,
    np.__version__,
    sklearn.__version__,
    sep="\n",
)

In [None]:
import sys
from pathlib import Path

# Make the project root importable so the `src` package used by later cells
# can be resolved. Path() is the current working directory; its parent is
# taken as the project root (assumes the notebook is run from a first-level
# subdirectory of the project — TODO confirm).
PROJECT_ROOT = Path().resolve().parent

# Guarded so that re-running this cell does not pile duplicate entries
# onto sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
print(PROJECT_ROOT)

import importlib

In [None]:
from src.data_loader import load_data
import src.data_loader as dl
# Print the file the module was loaded from — presumably a sanity check that
# the project's own `src` package was picked up rather than another one on
# sys.path.
print(dl.__file__)

In [None]:
# Read the raw train and test CSVs (paths are relative to the notebook's
# working directory); train is loaded first, then test.
train_data, test_data = (
    load_data('../data/raw/train.csv'),
    load_data('../data/raw/test.csv'),
)

In [None]:
import src.preprocessing
# Re-execute the module so that edits made to its source since the first
# import are picked up without restarting the kernel. `importlib` is imported
# in an earlier cell. The from-import must come after the reload so the names
# bind to the reloaded definitions.
importlib.reload(src.preprocessing)
from src.preprocessing import fill_age, compute_missing_params, preprocess_data

In [None]:
import src.features
# Re-execute the module so that edits made to its source since the first
# import are picked up without restarting the kernel. `importlib` is imported
# in an earlier cell. The from-import must come after the reload so
# `get_features` binds to the reloaded definition.
importlib.reload(src.features)
from src.features import get_features

In [None]:
from src.models import train_lr, train_rf

In [None]:
# Missing-value parameters are computed from the training frame only and then
# handed to preprocess_data together with both frames.
params = compute_missing_params(train_data)

X_train, X_test = preprocess_data(train_data, test_data, params)
# Capture the ids before get_features runs; they are passed to
# make_submission in a later cell.
passenger_ids = X_test["PassengerId"]
X_train = get_features(X_train)
X_test = get_features(X_test)
y_train = train_data["Survived"].astype(int)

# y_train is taken from the raw frame and passenger_ids from the pre-feature
# frame, so check that no upstream step changed the row count and left them
# out of step with the final matrices.
assert len(X_train) == len(y_train), "X_train and y_train have different lengths"
assert len(X_test) == len(passenger_ids), "X_test and passenger_ids have different lengths"

In [None]:
# Random forest: fit, then print its score on the same data it was fitted on.
# NOTE(review): both scores below are computed on the training set, so they
# say nothing about performance on unseen data.
rf_model = train_rf(
    X_train,
    y_train,
    n_estimators=300,
    min_samples_leaf=5,
)
rf_train_score = rf_model.score(X_train, y_train)
print(rf_train_score)

# Logistic regression: same procedure with the helper's default settings.
lr_model = train_lr(X_train, y_train)
lr_train_score = lr_model.score(X_train, y_train)
print(lr_train_score)

In [None]:
import src.submit
# Re-execute the module so that edits made to its source since the first
# import are picked up without restarting the kernel. `importlib` is imported
# in an earlier cell. The from-import must come after the reload so
# `make_submission` binds to the reloaded definition.
importlib.reload(src.submit)
from src.submit import make_submission

In [None]:
# Produce one submission per model (random forest first, then logistic
# regression) and keep what make_submission returns for the comparison below.
rf_preds, lr_preds = (
    make_submission(rf_model, X_test, passenger_ids, "../submissions/submission_rf_iter1.csv"),
    make_submission(lr_model, X_test, passenger_ids, "../submissions/submission_lr_iter1.csv"),
)

In [None]:
print(type(lr_preds))
print(type(rf_preds))

# Work on plain arrays so the indexing below is positional whether
# make_submission returned NumPy arrays or pandas Series (indexing a Series
# with an integer array is label-based and can select the wrong rows or fail).
lr_arr = np.asarray(lr_preds)
rf_arr = np.asarray(rf_preds)

# Compute the disagreement mask once and reuse it for the count and the rows.
disagree = lr_arr != rf_arr
diff = disagree.sum()
print(diff)

# Positions of the first ten rows where the two models disagree.
idx = np.where(disagree)[0][:10]

# Take the ids from passenger_ids — the series captured from X_test and passed
# to make_submission — rather than from the raw test frame, so the ids are
# guaranteed to correspond to the rows that were actually predicted.
pd.DataFrame({
    "PassengerId": passenger_ids.iloc[idx].values,
    "LR_Pred": lr_arr[idx],
    "RF_Pred": rf_arr[idx],
})