This file creates the training and test datasets for all other notebooks and scripts to reuse. This is the file I would typically use for an exploratory analysis.

In [None]:
%matplotlib inline

import os
from pathlib import Path
import helpsk as hlp
import numpy as np
import pandas as pd

from helpers.utilities import Timer, get_logger

def get_project_directory():
    """Return the project root directory as a string.

    When the current working directory is ``<root>/source/executables``, the two
    trailing components are dropped and ``<root>`` is returned. For any other
    working directory the current working directory is returned unchanged.

    Returns:
        str: path of the project root (or the current working directory).
    """
    cwd = os.getcwd()
    # Compare whole trailing path components instead of doing a substring replace:
    # a substring replace would also mangle paths such as `.../source/executables_old`
    # and only recognises `/` as the separator.
    if Path(cwd).parts[-2:] == ('source', 'executables'):
        return str(Path(cwd).parents[1])
    return cwd

# Data

In [None]:
# Load the raw credit dataset inside a Timer block (Timer comes from helpers.utilities;
# presumably it reports the elapsed time under the given label).
with Timer("Loading Data"):
    # Resolve the path from the project root, like the other load cell in this notebook,
    # rather than relying on a hard-coded '../..' relative to the working directory.
    raw_data_path = os.path.join(get_project_directory(), 'artifacts/data/raw/credit.pkl')
    # `pd.read_pickle` is the public API; `pd.pandas` only resolves through an incidental
    # attribute of the pandas package.
    # NOTE(review): unpickling can execute arbitrary code; only load trusted artifacts.
    credit_data = pd.read_pickle(raw_data_path)

In [None]:
# Display helpsk's numeric summary for the raw credit data loaded above.
hlp.pandas.numeric_summary(credit_data)

In [None]:
# Display helpsk's non-numeric summary for the raw credit data loaded above.
hlp.pandas.non_numeric_summary(credit_data)

# Training and Test Data

In [None]:
# Load the training/test feature and target pickles from the processed-data directory.
with Timer("Loading training/test datasets"):
    # Build the directory once instead of repeating the join for every file.
    processed_dir = os.path.join(get_project_directory(), 'artifacts/data/processed')
    # `pd.read_pickle` is the public API; `pd.pandas` only resolves through an incidental
    # attribute of the pandas package.
    # NOTE(review): unpickling can execute arbitrary code; only load trusted artifacts.
    X_train = pd.read_pickle(os.path.join(processed_dir, 'X_train.pkl'))
    X_test = pd.read_pickle(os.path.join(processed_dir, 'X_test.pkl'))
    y_train = pd.read_pickle(os.path.join(processed_dir, 'y_train.pkl'))
    y_test = pd.read_pickle(os.path.join(processed_dir, 'y_test.pkl'))

In [None]:
# For each split, print the feature shape and then the number of targets (one value per line).
print(X_train.shape, len(y_train), sep='\n')

print(X_test.shape, len(y_test), sep='\n')

In [None]:
# Unique values in the training target together with how often each occurs.
np.unique(y_train, return_counts=True)

In [None]:
# Share of the training target taken by each unique value (counts divided by their total).
train_value_counts = np.unique(y_train, return_counts=True)[1]
train_value_counts / np.sum(train_value_counts)

In [None]:
# Share of the test target taken by each unique value (counts divided by their total).
test_value_counts = np.unique(y_test, return_counts=True)[1]
test_value_counts / np.sum(test_value_counts)

# Exploratory

Typically I would do an exploratory analysis here.

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first ten entries of the training target.
y_train[0:10]

In [None]:
# Display helpsk's numeric summary for the training features.
hlp.pandas.numeric_summary(X_train)