## README:

App : **Sample Application**

Stage : **Data preparation**

This is a sample notebook for loading data from the warehouse.

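The notebook expects the required inputs in the `config/sample.yaml` file and the warehouse credentials in the `credentials.yaml` file, both resolved relative to `code_path` (`../` by default).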

In [None]:
import os
import sys
import time
import logging
import datetime

import pandas as pd

from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split
from IPython.display import display
from pprint import pprint

#Rudderlab data utilities imports
from rudderlabs.data.apps.log import setup_file_logger
from rudderlabs.data.apps.config import read_yaml

# Lift the cap on the number of columns pandas shows, then switch on tqdm's pandas integration.
pd.set_option("display.max_columns", None)
tqdm.pandas()

In [None]:
# Papermill parameters cell: each value below is only a default and may be
# overridden by parameters passed by papermill.
job_id = f"{int(time.time())}"  # whole-second Unix timestamp, as a string
local_input_path = None
local_output_path = None
code_path = "../"

In [None]:
# Any path papermill did not supply falls back to the per-job folder under ../data
local_input_path = f"../data/{job_id}" if local_input_path is None else local_input_path
local_output_path = f"../data/{job_id}" if local_output_path is None else local_output_path

In [None]:
# Echo the effective run parameters, one per line
print(
    job_id,
    f"local_input_path {local_input_path}",
    f"local_output_path {local_output_path}",
    sep="\n",
)

In [None]:
#Local imports
# code_path is appended to the module search path before the import below so that
# data_loader can be resolved; presumably data_loader.py lives in that directory
# (not visible from this notebook — confirm).
sys.path.append(code_path)
from data_loader import DataIO

In [None]:
# Constants
# Notebook-wide constants are collected in this cell.
IMAGE_FORMAT = "png"

In [None]:
#Logging setup
# NOTE: on success the name `logging` is rebound to whatever setup_file_logger
# returns, shadowing the standard-library module imported at the top. The name is
# kept because the cells below call `logging.info(...)` on it.
log_file_path = os.path.join(local_output_path, "logs", "sample_notebook.log")
try:
    logging = setup_file_logger(log_file_path)
except Exception as err:
    # Best effort: keep using whatever `logging` is already bound to, but report the
    # failure instead of silently discarding it (a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit).
    print(
        f"WARNING: could not set up file logger at {log_file_path}: {err!r}",
        file=sys.stderr,
    )

logging.info("\n\n\t\tSTARTING FEATURE PREPROCESSING")

In [None]:
#Configurations
# Read the notebook settings from config/sample.yaml under code_path and echo them
# into the notebook output for reference.
notebook_config = read_yaml(os.path.join(code_path, "config/sample.yaml"))
print("Notebook config:")
pprint(notebook_config)

In [None]:
# Load the warehouse credentials from credentials.yaml under code_path.
creds_config = read_yaml(os.path.join(code_path, "credentials.yaml"))
print("Credentials config:")
# Cell output is stored with the notebook and ends up in any HTML export, so never
# dump the credential values themselves: show only the top-level entry names with
# every value masked.
if isinstance(creds_config, dict):
    pprint({entry: "<redacted>" for entry in creds_config})
else:
    # Unexpected shape (e.g. an empty file): report only the type, not the content.
    pprint(f"<{type(creds_config).__name__}: redacted>")

In [None]:
# Everything this notebook writes goes under a step folder inside local_output_path:
#
#   <local_output_path>
#     └── sample_data_step
#           ├── visuals
#           └── model_artifacts
output_directory = os.path.join(local_output_path, "sample_data_step")
visuals_dir, model_artifacts_dir = (
    os.path.join(output_directory, sub_dir)
    for sub_dir in ("visuals", "model_artifacts")
)

logging.info(f"All the output files will be saved to following location: {output_directory}")

# Create the step folder and both sub-folders; folders that already exist are left as they are.
for output_path in (output_directory, visuals_dir, model_artifacts_dir):
    Path(output_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Data splitting: pull the train / validation / test sizes out of the `data` section of the config
train_split, val_split, test_split = (
    notebook_config['data'][size_key]
    for size_key in ('train_size', 'val_size', 'test_size')
)

In [None]:
# Build the project's DataIO helper from the notebook and credentials configs, then
# fetch the data through it.
# NOTE(review): presumably get_data() queries the warehouse using creds_config — confirm in data_loader.
print("Getting data from warehouse")
dataIO = DataIO(notebook_config, creds_config)
data = dataIO.get_data()

In [None]:
# Preview the first rows of the loaded data (assumes get_data() returned a pandas DataFrame — TODO confirm)
data.head()

In [None]:
## Cell to hide code while converting to a html page
# The HTML object is the last expression of the cell, so it is rendered as the cell's
# output; the embedded script calls hide() on every element matching `div.input`.
# NOTE(review): presumably relies on the exported page providing jQuery as `$` — confirm.
from IPython.display import HTML

HTML('''<script>
$('div.input').hide();
</script>''')