In [1]:
from pprint import pprint
# Central registry of every file path the project uses, grouped by role
# (data / models / images). Entries flagged "not saved yet" do not exist on
# disk at this point in the notebook.
FPATHS = {
    "data": {
        "raw": {
            # The original full dataframe (already on disk)
            "full": "data/ames-housing-dojo-for-ml.csv",
            # Not saved yet
            "eda": "data/ames-housing-dojo-for-ml-eda.csv",
        },
        "ml": {
            # (X_train, y_train) -- not saved yet
            "train": "data/training-data.joblib",
            # (X_test, y_test) -- not saved yet
            "test": "data/testing-data.joblib",
        },
    },
    "models": {
        # Neither model file has been saved yet
        "linear_regression": "models/linear_regression/linreg.joblib",
        "random_forest": "models/random_forest/rf_reg.joblib",
    },
    "images": {
        # Not saved yet
        "banner": "images/app-banner.png",
    },
}
# Pretty-print the nested structure for a quick visual check
pprint(FPATHS)



{'data': {'ml': {'test': 'data/testing-data.joblib',
                 'train': 'data/training-data.joblib'},
          'raw': {'eda': 'data/ames-housing-dojo-for-ml-eda.csv',
                  'full': 'data/ames-housing-dojo-for-ml.csv'}},
 'images': {'banner': 'images/app-banner.png'},
 'models': {'linear_regression': 'models/linear_regression/linreg.joblib',
            'random_forest': 'models/random_forest/rf_reg.joblib'}}


In [2]:
 ## Save the filepaths 
import os, json
# Location of the JSON copy of the filepath dictionary
FPATHS_FILE = 'config/filepaths.json'
# Make sure the config folder exists (no error if it is already there)
os.makedirs('config/', exist_ok=True)
# Write the nested dictionary out as JSON
with open(FPATHS_FILE, 'w') as fpaths_json:
    json.dump(FPATHS, fpaths_json)



In [3]:
# Import standard packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show up to 100 columns when displaying wide DataFrames
pd.set_option('display.max_columns',100)
# Import modeling tools
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Set DataFrames as default output
from sklearn import set_config
# joblib is used below to save the train/test splits and the fitted pipelines
import joblib
# Ask scikit-learn transformers to return pandas DataFrames from transform()
set_config(transform_output='pandas')



In [4]:
import os, sys
%load_ext autoreload 
%autoreload 2
import custom_functions as fn


In [5]:
# Using function from Creating a File Structure Lesson
# NOTE(review): the helper lives in custom_functions (not visible here); presumably it
# creates the parent folders for every path stored in FPATHS -- confirm against its source.
fn.create_directories_from_paths(FPATHS)

In [6]:
# We can access a file using our dictionary:
# drill down 'data' -> 'raw' -> 'full' to get the path of the original dataset
FPATHS['data']['raw']['full']



'data/ames-housing-dojo-for-ml.csv'

In [7]:
# Model paths sit one level shallower in the dictionary:
# 'models' -> 'random_forest' gives the path for the random forest file
FPATHS['models']['random_forest']



'models/random_forest/rf_reg.joblib'

In [8]:
# Confirm the image is in the correct location
from IPython.display import display, Markdown
# Render an <img> tag whose src is the banner path stored in the dictionary
Markdown(f"<img src='{FPATHS['images']['banner']}'>")



<img src='images/app-banner.png'>

[source](https://drive.google.com/file/d/1Ryh99E5iCIvR70UQXS_EehdzYGi7gURY/view)

In [9]:
# NOTE(review): this cell repeats the earlier import/config cell line for line.
# It is harmless on re-run, but if this is a single notebook the imports
# should live in one cell at the top -- confirm whether the repeat is intentional.
# Import standard packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show up to 100 columns when displaying wide DataFrames
pd.set_option('display.max_columns',100)
# Import modeling tools
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Set DataFrames as default output
from sklearn import set_config
import joblib
# Ask scikit-learn transformers to return pandas DataFrames from transform()
set_config(transform_output='pandas')



In [10]:
# Reload the filepath dictionary from its JSON file on disk
import json

with open('config/filepaths.json') as fpaths_json:
    # Read the whole file as text and parse it into nested dicts
    FPATHS = json.loads(fpaths_json.read())
# Display the loaded structure to confirm it matches what was saved
FPATHS



{'data': {'raw': {'full': 'data/ames-housing-dojo-for-ml.csv',
   'eda': 'data/ames-housing-dojo-for-ml-eda.csv'},
  'ml': {'train': 'data/training-data.joblib',
   'test': 'data/testing-data.joblib'}},
 'models': {'linear_regression': 'models/linear_regression/linreg.joblib',
  'random_forest': 'models/random_forest/rf_reg.joblib'},
 'images': {'banner': 'images/app-banner.png'}}

In [11]:
# Look up the raw dataset's location in the filepath dictionary
fpath = FPATHS['data']['raw']['full']
# Load the CSV and make the PID column the index in one chained step
df_full = pd.read_csv(fpath).set_index("PID")
# Preview the first rows
df_full.head()



Unnamed: 0_level_0,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Utilities,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remodeled,Exter Qual,Exter Cond,Bsmt Unf Sqft,Total Bsmnt Sqft,Central Air,Living Area Sqft,Bedroom,Kitchen,Total Rooms,Garage Type,Garage Yr Blt,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Fence,SalePrice,Month,Year,Total Full Baths,Total Half Baths
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
907227090,RL,60.0,7200,Pave,,AllPub,CollgCr,1Fam,1Story,5,8,1972,1972,TA,TA,427.0,864.0,Y,864.0,3,1,5,Detchd,1977.0,1.0,297.0,TA,TA,Y,MnPrv,119900.0,3,2006,1.0,0.0
527108010,RL,134.0,19378,Pave,,AllPub,Gilbert,1Fam,2Story,7,5,2005,2006,Gd,TA,1335.0,1392.0,Y,2462.0,4,1,9,Attchd,2006.0,2.0,576.0,TA,TA,Y,,320000.0,3,2006,3.0,1.0
534275170,RL,,12772,Pave,,AllPub,NAmes,1Fam,1Story,6,8,1960,1998,TA,Gd,460.0,958.0,Y,958.0,2,1,5,Attchd,1960.0,1.0,301.0,TA,TA,Y,,151500.0,4,2007,1.0,0.0
528104050,RL,114.0,14803,Pave,,AllPub,NridgHt,1Fam,1Story,10,5,2007,2008,Ex,TA,442.0,2078.0,Y,2084.0,2,1,7,Attchd,2007.0,3.0,1220.0,TA,TA,Y,,385000.0,6,2008,3.0,0.0
533206070,FV,32.0,3784,Pave,Pave,AllPub,Somerst,TwnhsE,1Story,8,5,2006,2007,Gd,TA,1451.0,1511.0,Y,1565.0,2,1,5,Attchd,2006.0,2.0,476.0,TA,TA,Y,,193800.0,2,2007,3.0,0.0


In [12]:
# Column the models will predict
target = "SalePrice"
# Predictor columns carried forward into the EDA / modeling dataset
features_to_use = [
    'Living Area Sqft',
    'Lot Frontage',
    'Bldg Type',
    'Bedroom',
    'Total Full Baths',
    'MS Zoning',
    'Street',
    'Alley',
    'Utilities',
]


In [13]:
# Columns for the EDA frame: the target first, then the selected features
eda_columns = [target] + list(features_to_use)
# Copy the selection so df_eda is independent of df_full
df_eda = df_full[eda_columns].copy()
df_eda



Unnamed: 0_level_0,SalePrice,Living Area Sqft,Lot Frontage,Bldg Type,Bedroom,Total Full Baths,MS Zoning,Street,Alley,Utilities
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
907227090,119900.0,864.0,60.0,1Fam,3,1.0,RL,Pave,,AllPub
527108010,320000.0,2462.0,134.0,1Fam,4,3.0,RL,Pave,,AllPub
534275170,151500.0,958.0,,1Fam,2,1.0,RL,Pave,,AllPub
528104050,385000.0,2084.0,114.0,1Fam,2,3.0,RL,Pave,,AllPub
533206070,193800.0,1565.0,32.0,TwnhsE,2,3.0,FV,Pave,Pave,AllPub
...,...,...,...,...,...,...,...,...,...,...
903400030,109000.0,816.0,50.0,1Fam,2,1.0,RL,Pave,Pave,AllPub
533234020,223000.0,1789.0,79.0,1Fam,3,3.0,FV,Pave,,AllPub
908188140,137500.0,1709.0,24.0,Twnhs,3,2.0,RM,Pave,,AllPub
909254050,231000.0,1512.0,54.0,1Fam,3,3.0,RL,Pave,,AllPub


In [14]:
# Define file path with dictionary: 'data' -> 'raw' -> 'eda'
eda_file_path = FPATHS['data']['raw']['eda']
# Echo the path as the cell output to confirm the lookup
eda_file_path



'data/ames-housing-dojo-for-ml-eda.csv'

In [15]:
# Now save the eda version of the dataset using the dictionary
# (to_csv writes the index by default, so the PID index is stored as the first column)
df_eda.to_csv(eda_file_path)



In [16]:
# Split the EDA frame into the target vector and the feature matrix
target = "SalePrice"
y = df_eda[target]
X = df_eda.drop(columns=target)
# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Preview the training features
X_train



Unnamed: 0_level_0,Living Area Sqft,Lot Frontage,Bldg Type,Bedroom,Total Full Baths,MS Zoning,Street,Alley,Utilities
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
905475520,943.0,,1Fam,3,1.0,RL,Pave,,AllPub
909254010,1228.0,53.0,1Fam,3,1.0,RL,Pave,,AllPub
531450090,1294.0,,1Fam,3,3.0,RL,Pave,,AllPub
903400040,1824.0,60.0,1Fam,3,1.0,RL,Pave,Pave,AllPub
527107130,1628.0,60.0,1Fam,3,2.0,RL,Pave,,AllPub
...,...,...,...,...,...,...,...,...,...
527402220,1002.0,,1Fam,3,2.0,RL,Pave,,AllPub
528435030,1578.0,78.0,1Fam,3,2.0,RL,Pave,,AllPub
528218090,1456.0,63.0,1Fam,3,2.0,RL,Pave,,AllPub
535353130,924.0,,1Fam,2,1.0,RL,Pave,,AllPub


In [17]:
# Defining the filepath for the training data using the dictionary
# ('data' -> 'ml' -> 'train')
joblib_train_path = FPATHS['data']['ml']['train']
# Echo the path as the cell output to confirm the lookup
joblib_train_path



'data/training-data.joblib'

In [18]:
# Save X_train and y_train together (as a two-item list) in a file, using the path from the dictionary
# joblib.dump returns the list of filenames it wrote, which is what the cell displays
joblib.dump([X_train, y_train], joblib_train_path)



['data/training-data.joblib']

In [19]:
# Defining the filepath for the testing data using the dictionary
# ('data' -> 'ml' -> 'test')
joblib_test_path = FPATHS['data']['ml']['test']
# Echo the path as the cell output to confirm the lookup
joblib_test_path



'data/testing-data.joblib'

In [20]:
# Save X_test and y_test together (as a two-item list) in a file, using the path from the dictionary
# joblib.dump returns the list of filenames it wrote, which is what the cell displays
joblib.dump([X_test, y_test], joblib_test_path)



['data/testing-data.joblib']

In [21]:
# Build the preprocessing ColumnTransformer
# Column groups, selected by dtype from the training features
num_cols = X_train.select_dtypes('number').columns.tolist()
ohe_cols = X_train.select_dtypes('object').columns.tolist()

# Numeric branch: fill missing values with the column mean, then standardize
impute_nums = SimpleImputer(strategy='mean')
scaler = StandardScaler()
num_pipe = make_pipeline(impute_nums, scaler)

# Categorical branch: label missing values 'MISSING', then one-hot encode.
# Categories unseen during fit are ignored at transform time.
impute_missing = SimpleImputer(strategy='constant', fill_value='MISSING')
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe_pipe = make_pipeline(impute_missing, ohe_encoder)

# Route each column group through its branch; output column names are not
# prefixed with the transformer name
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', ohe_pipe, ohe_cols),
    ],
    verbose_feature_names_out=False,
)
preprocessor



In [22]:
# Linear regression: preprocessing followed by an ordinary least-squares regressor
lin_reg_pipe = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('reg', LinearRegression()),
    ]
)
# Train on the training split only
lin_reg_pipe.fit(X_train, y_train)
# Score the fitted pipeline on both splits with the project helper and show the table
results = fn.evaluate_regression(lin_reg_pipe, X_train, y_train, X_test, y_test)
results



Split,R^2,MAE,RMSE
Train,0.69,30404.72,46356.11
Test,0.65,28349.28,41518.79


In [23]:
# Look up where the linear regression model should be stored
linreg_path = FPATHS['models']['linear_regression']

# Persist the fitted linear regression pipeline to that path
joblib.dump(lin_reg_pipe, linreg_path)



['models/linear_regression/linreg.joblib']

In [24]:
# Random forest: same preprocessing, with a depth-limited forest regressor on top
rf_pipe = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        (
            'reg',
            RandomForestRegressor(max_depth=8, min_samples_leaf=2, random_state=42),
        ),
    ]
)
# Train on the training split only
rf_pipe.fit(X_train, y_train)
# Score the fitted pipeline on both splits with the project helper and show the table
results = fn.evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test)
results



Split,R^2,MAE,RMSE
Train,0.86,21389.21,31035.14
Test,0.69,26732.12,38828.04


In [25]:
# Look up where the random forest model should be stored
rf_path = FPATHS['models']['random_forest']

# Persist the fitted random forest pipeline (preprocessing + regressor).
# The object dumped here must be rf_pipe: dumping lin_reg_pipe would store a
# linear regression model under the random forest filename, so anything that
# later loads this file would silently get the wrong model.
joblib.dump(rf_pipe, rf_path)



['models/random_forest/rf_reg.joblib']