In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy import stats
from feature_engine.encoding import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

Last step is to scale the data, once that is done we will make the same changes to the test set that we did on the training set.

## End of preprocessing notebook, next notebook is modeling

In [2]:
# Import df from EDA & Preprocessing Notebook
X_train = pd.read_csv('Data_orleans/X_train_encoded.csv')
X_test = pd.read_csv('Data_orleans/X_test_encoded.csv')
y_train = pd.read_csv('Data_orleans/y_train.csv')
y_test = pd.read_csv('Data_orleans/y_test.csv')

# model

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [4]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [5]:
import statsmodels.api as sm

Next we will make a funtion to evaluate our model's performance. A breif overview of the different metrics:
- Mean Absolute Error (MAE): A basic metric that gives us the absolute difference between actual and predicted values. Idealy the final model would have a smaller MAE than the baseline model since we aim to lower the amount of error by tuning models.
- Mean Squared Error (MSE): Finds the squared difference between the actual and predicted values. The MSE is helpful since it avoids the cancellation of negative terms (unlike MAE). However, outliers will be more damaging since it penalizes the outliers more than MAE. 
- Root Mean Square Error(RMSE): Is the square root of the MSE. RMSE is easier to interpret than MSE since the output is in the same units as the variable
- R Squared (R2): A metric that tells the performance of the model, unlike the others which show the loss. This is a metric that is best for comparing the different models. R2 will be a value betweet 0 and 1, with 1 being a perfect fit.

In [6]:
# function to create displot, Q-Q plot and boxplot
from sklearn.metrics import r2_score

def model_diagnostic(model, X_test, y_test):

    test_pred = model.predict(X_test)
    train_pred = model.predict(X_train)
    test_res = y_test - test_pred
    r2 = r2_score(y_test,test_pred)
    #MAE = mean_absolute_error(y_test,test_pred)
    #MSE = mean_squared_error(y_test,test_pred)
    #RMSE = np.sqrt(MSE)
    
    print(f'Test Mean Absolute Error : {mean_absolute_error(y_test,test_pred)}')
    print(f'Test Mean Squared Error: {mean_squared_error(y_test,test_pred)}')
    print(f'Test Root Mean Square Error : {np.sqrt(mean_squared_error(y_test,test_pred))}')
    print(f'Test R2 : {r2_score(y_test,test_pred)}')
    print(f'Train R2 : {r2_score(y_train,train_pred)}')
    
    # define figure size
    #plt.figure(figsize=(16, 4))
    #fig, (ax1, ax2) = plt.subplots(1, 2)
    #fig, axs = plt.subplots(ncols=2, sharey=False, figsize=(18, 6))
    #ax=axs[0]

    # Q-Q plot
    #plt.subplot(1, 3)
    #stats.probplot(test_res, plot=plt, ax=axs[0])

    # res
    #sns.scatterplot(x=y_test,y=test_res)
    #plt.axhline(y=0, color='r', linestyle='--')
    #plt.title('Residuals')

    #plt.show()

In [7]:
model_diagnostic(model, X_test, y_test)

Test Mean Absolute Error : 46.229388612267655
Test Mean Squared Error: 21861.718155385428
Test Root Mean Square Error : 147.85708692986424
Test R2 : 0.6252643789454987
Train R2 : 0.7871046511117143


We see the baseline model has a R2 score of 0.25 which indicates it is a very poor performing model. 

## Model 2
In regression there are many different model types to choose from. Next, I am going to try a Random Forrest Model which is a little more robust to outliers than a Linear Regression model is. 

In [8]:
# Import Random Forest
# We see the baseline randomforest did even worse, so we do some hyperameter tuning to see if we can improve the model
from sklearn.ensemble import RandomForestClassifier

# first make a baseline random forest model to do a gridsearch on
forest_baseline = RandomForestClassifier()
forest_baseline.fit(X_train, y_train)

model_diagnostic(forest_baseline, X_test, y_test)

Test Mean Absolute Error : 547.0923588039867
Test Mean Squared Error: 1127523.9641196013
Test Root Mean Square Error : 1061.8493132830106
Test R2 : 0.21778532403629974
Train R2 : 1.0


The RandomForest model did much worse than the linear regression model

In [9]:
# Use gridsearch to find a better RandomForest model
# I will start with a spread of different parameter options

param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 50, 75],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [10, 30, 50],
    'min_samples_split': [2, 5, 10],
}


gridsearch_rf1 = GridSearchCV(estimator=forest_baseline,
                          param_grid=param, cv= 3, scoring='r2')

# Run search & takes a min or two
gridsearch_rf1.fit(X_train, y_train)
gridsearch_rf1.best_params_

KeyboardInterrupt: 

In [None]:
results = pd.DataFrame(gridsearch_rf1.cv_results_)
results.sort_values(by='mean_test_score', ascending=False, inplace=True)
results.reset_index(drop=True, inplace=True)
print(results.shape)

results.head(2)

In [None]:
# plot model performance and error

results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)

plt.ylabel('Mean test score')
plt.xlabel('Hyperparameter combinations')

In [None]:
model_diagnostic(model, X_test_encoded, y_test)

In [None]:
# take a look at how normal the data is
diagnostic_plots(y_train, 'price')

In [None]:
# log transform price and see if that helps the model
y_train['price_log'] = np.log(y_train['price'])
diagnostic_plots(y_train, 'price_log')

In [None]:
# take a look at how much the log transformation helped with the skewness
print(f'''The skewness of the price is \
 {y_train["price"].skew():.2f}''')
print(f'''The skewness of the price is \
 {y_train["price_log"].skew():.2f}''')

In [None]:
y_test['price_log'] = np.log(y_test['price'])

We see that all variations of the random forest model do poorly. We know that the data is skewed so we can try to prep the training data some more and see if that helps. We can try to do a log transformation.