In [1]:
# Testing pprint

from pprint import pprint

# Sample record used to compare print() with pprint()
data = {
    'name': 'John',
    'age': 30,
    'city': 'New York',
    'interests': ['reading', 'coding', 'hiking'],
}

# Built-in print: the whole dictionary is written on a single line
print(data)



{'name': 'John', 'age': 30, 'city': 'New York', 'interests': ['reading', 'coding', 'hiking']}


In [2]:
# Pretty-print with pprint
pprint(data)

{'age': 30,
 'city': 'New York',
 'interests': ['reading', 'coding', 'hiking'],
 'name': 'John'}


### Testing config file creation

In [4]:
from pprint import pprint
# Nested dictionary of project file paths, grouped into data, models, and images.
FPATHS = {
    "data": {
        "raw": {
            # Original full dataframe (already on disk)
            "full": "data/ames-housing-dojo-for-ml.csv",
            # Not saved yet
            "eda": "data/ames-housing-dojo-for-ml-eda.csv",
        },
        "ml": {
            # (X_train, y_train) -- not saved yet
            "train": "data/training-data.joblib",
            # (X_test, y_test) -- not saved yet
            "test": "data/testing-data.joblib",
        },
    },
    "models": {
        # Neither model file has been saved yet
        "linear_regression": "models/linear_regression/linreg.joblib",
        "random_forest": "models/random_forest/rf_reg.joblib",
    },
    "images": {
        # Not saved yet
        "banner": "images/app-banner.png",
    },
}

# Show the nested structure in a readable, indented layout
pprint(FPATHS)


{'data': {'ml': {'test': 'data/testing-data.joblib',
                 'train': 'data/training-data.joblib'},
          'raw': {'eda': 'data/ames-housing-dojo-for-ml-eda.csv',
                  'full': 'data/ames-housing-dojo-for-ml.csv'}},
 'images': {'banner': 'images/app-banner.png'},
 'models': {'linear_regression': 'models/linear_regression/linreg.joblib',
            'random_forest': 'models/random_forest/rf_reg.joblib'}}


In [5]:
 ## Save the filepaths 
import os, json
# Location of the JSON file that stores the FPATHS dictionary.
FPATHS_FILE = 'config/filepaths.json'

# Derive the folder from FPATHS_FILE so the directory that gets created always
# matches the file that gets written, even if the path is changed later.
os.makedirs(os.path.dirname(FPATHS_FILE), exist_ok=True)

# Explicit encoding avoids depending on the platform default; indent=4 writes
# one key per line so the config file is easy to read and edit by hand.
with open(FPATHS_FILE, 'w', encoding='utf-8') as f:
    json.dump(FPATHS, f, indent=4)



In [7]:
FPATHS_FILE

'config/filepaths.json'

### The function below will create all of the directories (folders) specified in our dictionary

- `isinstance(value, dict)` checks whether the variable `value` is an instance of the `dict` (dictionary) class.

- If the result is `True`, `value` is itself a dictionary, so the function calls itself on it; if `value` is a string instead, it is treated as a file path.

In [8]:
import os
def create_directories_from_paths(nested_dict):
    """Recursively create the parent directories for file paths in a nested dictionary.

    Every string value is treated as a file path, and the directory portion of
    that path is created when it does not exist yet. Dictionary values are
    walked recursively. Values of any other type are ignored, as are paths
    that have no directory component (e.g. "file.txt").

    Parameters:
    nested_dict (dict): The nested dictionary containing file paths.

    Returns:
    None. A "Directory created: <path>" line is printed for each directory
    that did not exist before the call.

    Source: OpenAI. (2023). ChatGPT [Large language model]. https://chat.openai.com
    """
    # Only the values matter here; the keys are just labels for the paths.
    for value in nested_dict.values():
        if isinstance(value, dict):
            # If the value is a dictionary, recurse into it
            create_directories_from_paths(value)
        elif isinstance(value, str):
            # If the value is a string, treat it as a file path and get the directory path
            directory_path = os.path.dirname(value)
            # Skip bare file names (empty directory part) and paths that already exist
            if directory_path and not os.path.exists(directory_path):
                # exist_ok=True keeps this safe if the directory appears between
                # the existence check above and this call.
                os.makedirs(directory_path, exist_ok=True)
                print(f"Directory created: {directory_path}")





In [9]:
# Use the function on your FPATHS dictionary
create_directories_from_paths(FPATHS)

Directory created: models/linear_regression
Directory created: models/random_forest
Directory created: images


In [10]:
# We can access a file using our dictionary
FPATHS['data']['raw']['full']



'data/ames-housing-dojo-for-ml.csv'

### Saving Files with Dictionary

In [11]:
# Import standard packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',100)
# Import modeling tools
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Set DataFrames as default output
from sklearn import set_config
import joblib
set_config(transform_output='pandas')



In [13]:
import os, sys
%load_ext autoreload 
%autoreload 2
import custom_functions as fn


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


NameError: name 'FPATHS' is not defined