In [1]:
# Import standard packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns',100)
# Import modeling tools
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Set DataFrames as default output
from sklearn import set_config
import joblib
set_config(transform_output='pandas')

In [3]:
import os, sys
%load_ext autoreload 
%autoreload 2
import exam_functions as fn

In [4]:
import json
# Load the project's file-path registry (data splits and saved models).
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding.
with open('config/filepaths.json', encoding='utf-8') as f:
    FPATHS = json.load(f)
FPATHS

{'data': {'ml': {'train': 'data/part2-training-data.joblib',
   'test': 'data/part2-test-data.joblib'}},
 'models': {'linear_regression': 'models/part2-model-pipeline.joblib'}}

In [5]:
# NOTE(review): this reads the file registered under 'test'. Later cells split
# the loaded data again and dump new train/test files to the paths in
# FPATHS['data']['ml'] -- including this same 'test' path. A re-run therefore
# starts from the previously written test split; confirm this is the intended
# source file.
fpath = FPATHS['data']['ml']['test'] 

# Despite the name, the loaded object is indexed with [0] and [1] in the next cell
df_full = joblib.load(fpath)

In [6]:
# Element 0 of the loaded object is used as the features, element 1 as the prices
df_features = df_full[0] 
prices_features = df_full[1]  

# Place features and prices side by side (axis=1) in a single table
df_dataset = pd.concat([df_features, prices_features], axis=1)

# NOTE(review): df_dataset presumably already is a DataFrame, which would make
# this wrapper redundant -- confirm before removing
df= pd.DataFrame(df_dataset)
df

Unnamed: 0_level_0,bathrooms,bedrooms,sqft_lot,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8651443480,1.0,3,5200,282000.0
2600100370,2.0,4,8793,723000.0
1771110720,1.0,3,9126,330000.0
8085400490,2.5,5,8100,1306000.0
3580900090,2.0,3,9855,300000.0
...,...,...,...,...
8151601090,2.0,4,9099,445000.0
4031700030,2.5,3,9719,299999.0
3812400854,2.0,4,6360,352800.0
5419800330,2.5,3,10652,240000.0


In [7]:
# Names of the feature columns.
# NOTE(review): no cell visible in this notebook reads this list; X is built
# in the next cell by dropping the target column instead.
features_to_use =['bathrooms', 'bedrooms', 'sqft_lot']

In [8]:
# Separate the target column from the predictors
target = "price"
X = df.drop(columns = target)
y = df[target]
# A fixed random_state makes the partition reproducible for the same input.
# NOTE(review): df was loaded from the 'test' file path, so this splits data
# that was read from the test file -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
X_train

Unnamed: 0_level_0,bathrooms,bedrooms,sqft_lot
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
415100015,1.00,3,9241
8911000425,1.00,2,8081
3438502290,1.50,3,47743
3625059109,3.00,4,33976
6629300120,1.00,3,6825
...,...,...,...
952004725,1.00,2,5750
1524059027,1.00,2,36478
1787600224,2.50,3,6991
2781250400,2.50,2,4950


In [46]:
# Largest bedroom count present in the loaded data
df['bedrooms'].max()

10

In [9]:
# Destination for the training split, taken from the file-path config
joblib_train_path = FPATHS['data']['ml']['train']
joblib_train_path

'data/part2-training-data.joblib'

In [10]:
# Persist the training split as a two-element list: [features, target]
joblib.dump([X_train, y_train], joblib_train_path)

['data/part2-training-data.joblib']

In [11]:
# Destination for the test split, taken from the file-path config
joblib_test_path = FPATHS['data']['ml']['test']
joblib_test_path

'data/part2-test-data.joblib'

In [12]:
# Persist the test split as a two-element list: [features, target].
# NOTE(review): joblib_test_path is the same config entry the data was loaded
# from earlier (FPATHS['data']['ml']['test']), so this replaces the source
# file with the test subset of it -- confirm this overwrite is intended.
joblib.dump([X_test, y_test], joblib_test_path)

['data/part2-test-data.joblib']

In [13]:
# Preprocessing
# Build a ColumnTransformer that handles numeric and categorical columns separately
# Separate lists of columns by dtype (taken from the training features only)
num_cols = list(X_train.select_dtypes('number').columns)
# NOTE(review): the X_train shown above has only numeric columns, so this list
# is presumably empty here -- confirm
ohe_cols = list(X_train.select_dtypes('object').columns)
# Categorical preprocessing pipeline (OHE):
# fill missing values with the literal 'MISSING', then one-hot encode;
# categories unseen during fit are ignored instead of raising
impute_missing = SimpleImputer(strategy='constant', fill_value='MISSING')
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe_pipe = make_pipeline(impute_missing, ohe_encoder)
# Numeric preprocessing pipeline: mean-impute missing values, then standardize
impute_nums = SimpleImputer(strategy='mean')
scaler = StandardScaler()
num_pipe = make_pipeline(impute_nums, scaler)
# Route each column group to its pipeline; verbose_feature_names_out=False
# keeps output column names free of the 'num'/'cat' transformer prefixes
preprocessor = ColumnTransformer([('num', num_pipe, num_cols),
                                     ('cat',ohe_pipe, ohe_cols)],
                                    verbose_feature_names_out=False)
preprocessor

In [14]:
# Add custom functions (from snippets if you have saved it)

def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  """Compute MAE, MSE, RMSE and R^2 for a set of regression predictions.

  Args:
    y_true: Ground-truth target values.
    y_pred: Predicted target values, aligned with ``y_true``.
    label: Text shown in the printed header and stored under 'Label'.
    verbose: When truthy, print a formatted report of the four metrics.
    output_dict: When truthy, return the metrics as a dictionary.

  Returns:
    A dict with keys 'Label', 'MAE', 'MSE', 'RMSE' and 'R^2' when
    ``output_dict`` is truthy; otherwise None.
  """
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # The `squared=False` argument of mean_squared_error was deprecated in
  # scikit-learn 1.4 and removed in 1.6. Prefer the dedicated RMSE function
  # and fall back to the old spelling only on versions that lack it.
  try:
    from sklearn.metrics import root_mean_squared_error
  except ImportError:
    rmse = mean_squared_error(y_true, y_pred, squared=False)
  else:
    rmse = root_mean_squared_error(y_true, y_pred)
  r_squared = r2_score(y_true, y_pred)
  if verbose:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict:
    metrics = {'Label':label, 'MAE':mae,
               'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
    return metrics

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  """Score a fitted regressor on the training and test splits.

  Args:
    reg: Fitted estimator or pipeline exposing ``predict``.
    X_train: Training features.
    y_train: Training target values.
    X_test: Test features.
    y_test: Test target values.
    verbose: Forwarded to ``regression_metrics``; when truthy a metrics
      report is printed for each split.
    output_frame: When truthy, return the metrics as a DataFrame.

  Returns:
    A DataFrame with one row per split ('Training Data', 'Test Data') and the
    metric values rounded to 3 decimals when ``output_frame`` is truthy;
    otherwise None.
  """
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)

  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  # The blank separator belongs to the printed report, so emit it only when
  # the report itself is printed (previously it appeared even in quiet mode)
  if verbose:
    print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                    output_dict=output_frame,
                                    label='Test Data')

  # Store results in a dataframe if output_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

In [15]:
# Linear regression model
# Define model pipeline for linear regression: the preprocessing step runs
# first, then its output is passed to the regressor
lin_reg_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('reg',LinearRegression())])
# Fit the model (preprocessing and regressor are both fitted on the training split only)
lin_reg_pipe.fit(X_train, y_train)
# Make predictions and evaluate the model on both splits;
# output_frame=True also returns the metrics as a DataFrame for display
results = evaluate_regression(lin_reg_pipe, X_train, y_train, X_test, y_test, output_frame=True)
results

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 170,388.476
- MSE = 49,808,798,180.877
- RMSE = 223,178.848
- R^2 = 0.250

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 180,540.810
- MSE = 51,089,808,921.098
- RMSE = 226,030.549
- R^2 = 0.204




Unnamed: 0,MAE,MSE,RMSE,R^2
Training Data,170388.476,49808800000.0,223178.848,0.25
Test Data,180540.81,51089810000.0,226030.549,0.204


In [16]:
# Linear Regression: destination path for the fitted pipeline, from the file-path config
linreg_path = FPATHS['models']['linear_regression']
linreg_path

'models/part2-model-pipeline.joblib'

In [17]:
# Save the whole fitted pipeline (preprocessing + regressor) as a single file
joblib.dump(lin_reg_pipe, linreg_path)

['models/part2-model-pipeline.joblib']

In [18]:
# Random Forest Pipeline
from sklearn.base import clone

# Use an unfitted copy of the preprocessor: passing the same `preprocessor`
# instance that lin_reg_pipe already holds would make this fit() refit that
# pipeline's preprocessing step in place.
rf_pipe = Pipeline([
    ('preprocess', clone(preprocessor)),
    ('reg', RandomForestRegressor(max_depth=8,
                                  min_samples_leaf=2,
                                  random_state=42))])
rf_pipe.fit(X_train, y_train)

# Evaluate on both splits; note this rebinds `results` to the random-forest metrics
results = evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test, output_frame=True)
results

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 128,451.788
- MSE = 28,269,371,628.306
- RMSE = 168,134.980
- R^2 = 0.574

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 174,710.673
- MSE = 50,728,666,742.327
- RMSE = 225,230.253
- R^2 = 0.209




Unnamed: 0,MAE,MSE,RMSE,R^2
Training Data,128451.788,28269370000.0,168134.98,0.574
Test Data,174710.673,50728670000.0,225230.253,0.209


In [19]:
# Helper that reads a saved model back from a joblib file on disk
def load_model_ml(fpath):
    """Load and return the object stored in the joblib file at ``fpath``.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    loaded_model = joblib.load(fpath)
    return loaded_model

In [20]:
# Load model: read the linear-regression pipeline back from the path it was
# saved to, so predictions below use the persisted artifact
linreg = load_model_ml(fpath = FPATHS['models']['linear_regression'])

In [22]:
# Inspect the feature columns present in the training data
X_train.columns

Index(['bathrooms', 'bedrooms', 'sqft_lot'], dtype='object')

In [24]:
# Numerical features: summary statistics for lot size in the loaded data
df['sqft_lot'].describe()

count    1.326000e+03
mean     1.576259e+04
std      5.911189e+04
min      6.090000e+02
25%      5.065000e+03
50%      7.517000e+03
75%      1.037650e+04
max      1.651359e+06
Name: sqft_lot, dtype: float64

In [38]:
# Example input values; get_X_to_predict reads these module-level names.
# NOTE(review): selected_sqft is written to the 'sqft_lot' column (lot size),
# and selected_full_baths is written to 'bathrooms', a column that contains
# fractional values such as 2.5 in the data shown above -- the names may
# mislead; confirm before renaming.
selected_sqft = 2000
selected_bedrooms= 2
selected_full_baths= 2

In [39]:
# Show one training row as a reference for the column layout
X_train.head(1)

Unnamed: 0_level_0,bathrooms,bedrooms,sqft_lot
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
415100015,1.0,3,9241


In [40]:
# Build the single-row DataFrame of feature values used for a prediction
def get_X_to_predict(sqft_lot=None, bedrooms=None, bathrooms=None):
    """Return a one-row DataFrame of feature values for a single house.

    Any argument left as None falls back to the matching notebook-level
    selection (``selected_sqft``, ``selected_bedrooms``,
    ``selected_full_baths``), so calling the function with no arguments
    behaves exactly as before.

    Args:
        sqft_lot: Value for the 'sqft_lot' column.
        bedrooms: Value for the 'bedrooms' column.
        bathrooms: Value for the 'bathrooms' column.

    Returns:
        DataFrame with columns 'sqft_lot', 'bedrooms', 'bathrooms' and a
        single row indexed 'House'.
    """
    # Globals are looked up only when a value was not supplied, so the
    # function can be called without any notebook-level state
    if sqft_lot is None:
        sqft_lot = selected_sqft
    if bedrooms is None:
        bedrooms = selected_bedrooms
    if bathrooms is None:
        bathrooms = selected_full_baths

    X_to_predict = pd.DataFrame({'sqft_lot': sqft_lot,
                                 'bedrooms': bedrooms,
                                 'bathrooms': bathrooms,
                                 },
                                index=['House'])
    return X_to_predict

In [41]:
# Call the function to create the one-row dataframe of selected feature values
X_to_predict = get_X_to_predict()
X_to_predict

Unnamed: 0,sqft_lot,bedrooms,bathrooms
House,2000,2,2


In [42]:
# Get Prediction: the reloaded pipeline returns an array with one value per input row
linreg.predict(X_to_predict)

array([472262.36309145])

In [43]:
# Get Prediction value: index [0] takes the single entry out of the returned array
linreg.predict(X_to_predict)[0]

472262.363091452

In [44]:
# Wrap single-row inference so the caller gets one value instead of an array
def get_prediction(model,X_to_predict):
    """Return the first element of ``model.predict(X_to_predict)``."""
    predictions = model.predict(X_to_predict)
    first_prediction = predictions[0]
    return first_prediction

In [45]:
# Obtain the prediction for the selected feature values using the reloaded pipeline
prediction = get_prediction(linreg, X_to_predict)
prediction

472262.363091452