In [1]:
import pandas as pd
import numpy as np
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression


VA_df = pd.read_csv("VA_Jobs_Mortgage_Rates.csv")
VA_county_df = pd.read_csv("VA_Home_Sales_Count_Prices_Merged.csv")

In [2]:
VA_county_df

Unnamed: 0,County_City,Month,Year,Sale_Count,Median_Sale_Value
0,Accomack County,Apr,2016,42,160000.0
1,Accomack County,Aug,2016,37,159950.0
2,Accomack County,Dec,2016,31,140100.0
3,Accomack County,Feb,2016,15,154000.0
4,Accomack County,Jan,2016,21,145000.0
...,...,...,...,...,...
11394,York County,Mar,2023,101,401270.0
11395,York County,May,2023,114,410000.0
11396,York County,Nov,2023,66,411025.0
11397,York County,Oct,2023,50,410000.0


In [3]:
VA_county_df.isnull().sum()

County_City          0
Month                0
Year                 0
Sale_Count           0
Median_Sale_Value    0
dtype: int64

In [4]:
VA_county_df[VA_county_df['Month'].str.contains('December')]

Unnamed: 0,County_City,Month,Year,Sale_Count,Median_Sale_Value


In [5]:
VA_df

Unnamed: 0,Date,Sale_Count,Median_Sale_Value,va_mortage_jobs.30-Year_Fixed,va_mortage_jobs.15-Year_Fixed,va_mortage_jobs.5-1_Year_Adj
0,5/1/2016,90.465116,202588.1202,3.61,2.86,2.80
1,4/1/2017,83.023622,211624.6339,4.10,3.36,3.19
2,6/1/2016,104.333333,208685.5754,3.66,2.92,2.88
3,4/1/2018,84.859375,210415.1445,4.40,3.87,3.62
4,7/1/2016,89.685039,206065.3189,3.41,2.74,2.68
...,...,...,...,...,...,...
59,9/1/2018,70.186047,214747.5000,4.54,3.99,3.93
60,9/1/2019,80.834646,226680.2362,3.49,3.00,3.30
61,9/1/2020,104.000000,258596.8692,2.93,2.42,2.93
62,9/1/2021,104.204724,284160.3268,2.87,2.18,2.43


In [6]:
#Aggregate the averages of all counties per month in the county df

average_by_month = VA_county_df.groupby(['Month', 'Year']).agg({'Sale_Count':'mean', 'Median_Sale_Value':'mean'}).reset_index()

average_by_month

Unnamed: 0,Month,Year,Sale_Count,Median_Sale_Value
0,Apr,2016,78.992188,194065.780547
1,Apr,2017,83.023622,211624.633858
2,Apr,2018,84.859375,210415.144531
3,Apr,2019,86.218750,222617.195312
4,Apr,2020,80.578125,243615.238281
...,...,...,...,...
84,Sep,2020,104.000000,258596.869231
85,Sep,2021,104.204724,284160.326772
86,Sep,2022,79.283465,305636.385827
87,Sep,2023,61.715385,327606.992308


In [7]:
avg_month_sorted = average_by_month.sort_values(by=['Month','Year'])

va_avg_home_sales_prices = avg_month_sorted.copy()
va_avg_home_sales_prices.to_csv("va_avg_home_sales_prices.csv", index=False)

avg_month_sorted 


Unnamed: 0,Month,Year,Sale_Count,Median_Sale_Value
0,Apr,2016,78.992188,194065.780547
1,Apr,2017,83.023622,211624.633858
2,Apr,2018,84.859375,210415.144531
3,Apr,2019,86.218750,222617.195312
4,Apr,2020,80.578125,243615.238281
...,...,...,...,...
84,Sep,2020,104.000000,258596.869231
85,Sep,2021,104.204724,284160.326772
86,Sep,2022,79.283465,305636.385827
87,Sep,2023,61.715385,327606.992308


In [8]:
#Sort them chronologically
VA_county_df["Year"] = VA_county_df["Year"].astype(str)
# Create date column with the date data already there
VA_county_df['Date'] = pd.to_datetime(VA_county_df['Month'] + ' ' + VA_county_df['Year'])

# Group the data by 'Date' and calculate the mean of the sale values
average_by_date = VA_county_df.groupby(['Month','Year'])['Median_Sale_Value'].mean().round(2).reset_index()

# Sort the DataFrame by 'Date'
average_by_date_sorted = average_by_date.sort_values(by=['Year','Month'])

# Display the resulting DataFrame
average_by_date_sorted


  VA_county_df['Date'] = pd.to_datetime(VA_county_df['Month'] + ' ' + VA_county_df['Year'])


Unnamed: 0,Month,Year,Median_Sale_Value
0,Apr,2016,194065.78
7,Aug,2016,209099.80
15,Dec,2016,199408.13
23,Feb,2016,182094.36
30,Jan,2016,189552.01
...,...,...,...
55,Mar,2023,321243.17
63,May,2023,331314.14
71,Nov,2023,324663.20
79,Oct,2023,328599.38


In [9]:
avg_month_sorted['Month'] = avg_month_sorted['Month'].astype(str)
avg_month_sorted['Year'] = avg_month_sorted['Year'].astype(int)
avg_month_sorted.dtypes


Month                 object
Year                   int32
Sale_Count           float64
Median_Sale_Value    float64
dtype: object

In [11]:
#VA_house_prediction_data.to_csv('VA_house_prediction_data.csv', index=False)


In [12]:
#Linear Regression Model

In [13]:
# Read housing data

df_va_housing_prediction = pd.read_csv("VA_sales_prices_mortage_rates_merged.csv")

# Display sample data
df_va_housing_prediction.head()

Unnamed: 0,Date,Sale_Count,Median_Sale_Value,va_mortage_jobs.30-Year_Fixed,va_mortage_jobs.15-Year_Fixed,va_mortage_jobs.5-1_Year_Adj
0,5/1/2016,90.465116,202588.1202,3.61,2.86,2.8
1,4/1/2017,83.023622,211624.6339,4.1,3.36,3.19
2,6/1/2016,104.333333,208685.5754,3.66,2.92,2.88
3,4/1/2018,84.859375,210415.1445,4.4,3.87,3.62
4,7/1/2016,89.685039,206065.3189,3.41,2.74,2.68


In [14]:
# Create a scatter plot with the median sale information
housing_prediction_plot = df_va_housing_prediction.hvplot.scatter(
    x= "va_mortage_jobs.30-Year_Fixed",
    y= "Median_Sale_Value",
    title="VA Housing Prediction Scatter Plot"
)
housing_prediction_plot

In [15]:
# Reformat data of the independent variable X as a single-column array
X = df_va_housing_prediction["va_mortage_jobs.30-Year_Fixed"].values.reshape(-1, 1)
# Display sample data
X[:5]

array([[3.61],
       [4.1 ],
       [3.66],
       [4.4 ],
       [3.41]])

In [16]:
# The shape of X is 30 samples, with a single feature (column)

X.shape

(64, 1)

In [17]:
# Create an array for the dependent variable y

y = df_va_housing_prediction["Median_Sale_Value"]

In [18]:
# Create a model with scikit-learn

model = LinearRegression()

In [19]:
# Fit the data into the model

model.fit(X, y)

In [20]:
# Display the slope

print(f"Model's slope: {model.coef_}")

Model's slope: [-34458.74184845]


In [21]:
# Display the y-intercept

print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 359019.48544522526


In [22]:
# Display the model's best fit line formula

print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 359019.48544522526 + -34458.741848452635X


In [23]:
# Display the formula to predict the housing prices...
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}")

# Predict the housing prices for
y_7 = model.intercept_ + model.coef_[0]

# Display the prediction
print(f"Predicted housing prices given 30 year fixed mortgage: ${y_7:.2f}") 

Model's formula: y = 359019.48544522526 + -34458.741848452635
Predicted housing prices given 30 year fixed mortgage: $324560.74


In [24]:
# Make predictions using the X set
predicted_y_values = model.predict(X) 


In [25]:
# Create a copy of the original data
df_va_housing_prediction_predicted = df_va_housing_prediction.copy()

# Add a column with the predicted housing prices
df_va_housing_prediction_predicted["predicted_housing_prices"] = predicted_y_values


# Display sample data
df_va_housing_prediction_predicted.head() 

Unnamed: 0,Date,Sale_Count,Median_Sale_Value,va_mortage_jobs.30-Year_Fixed,va_mortage_jobs.15-Year_Fixed,va_mortage_jobs.5-1_Year_Adj,predicted_housing_prices
0,5/1/2016,90.465116,202588.1202,3.61,2.86,2.8,234623.427372
1,4/1/2017,83.023622,211624.6339,4.1,3.36,3.19,217738.643867
2,6/1/2016,104.333333,208685.5754,3.66,2.92,2.88,232900.49028
3,4/1/2018,84.859375,210415.1445,4.4,3.87,3.62,207401.021312
4,7/1/2016,89.685039,206065.3189,3.41,2.74,2.68,241515.175742


In [26]:
# Create a line plot of the predicted housing prices
best_fit_line = df_va_housing_prediction_predicted.hvplot.line(
    x = "va_mortage_jobs.30-Year_Fixed",
    y = "predicted_housing_prices",
    color = "red"
)
best_fit_line

In [27]:
# Superpose the original data and the best fit line

housing_prediction_plot * best_fit_line 

In [28]:
#Linear Regression Model Assessment

In [29]:
# Import relevant metrics - score, r2, mse, rmse, std - from Scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [30]:
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.4844173925374461.
The r2 is 0.4844173925374461.
The mean squared error is 423896038.35249364.
The root mean squared error is 20588.73571525201.
The standard deviation is 28673.489550821574.


In [31]:
#Linear Regression 2

In [32]:
# Create a scatter plot with the sale count vs 30 year information
housing_prediction_plot2 = df_va_housing_prediction.hvplot.scatter(
    x= "Sale_Count",
    y= "Median_Sale_Value",
    title="VA Housing Prediction Scatter Plot 2"
)
housing_prediction_plot2

In [33]:
# Reformat data of the independent variable X as a single-column array
X = df_va_housing_prediction["Sale_Count"].values.reshape(-1, 1)
# Display sample data
X[:5]

array([[ 90.46511628],
       [ 83.02362205],
       [104.3333333 ],
       [ 84.859375  ],
       [ 89.68503937]])

In [34]:
# The shape of X is 30 samples, with a single feature (column)

X.shape

(64, 1)

In [35]:
# Create an array for the dependent variable y

y = df_va_housing_prediction["Median_Sale_Value"]

In [36]:
# Create a model with scikit-learn

model = LinearRegression()

In [37]:
# Fit the data into the model

model.fit(X, y)

In [38]:
# Display the slope

print(f"Model's slope: {model.coef_}")

Model's slope: [949.40417584]


In [39]:
# Display the y-intercept

print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 148912.55644964258


In [40]:
# Display the model's best fit line formula

print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 148912.55644964258 + 949.4041758373412X


In [41]:
# Display the formula to predict the housing prices...
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}")

# Predict the housing prices for
y_7 = model.intercept_ + model.coef_[0]

# Display the prediction
print(f"Predicted housing prices given Sale Count: ${y_7:.2f}") 

Model's formula: y = 148912.55644964258 + 949.4041758373412
Predicted housing prices given Sale Count: $149861.96


In [42]:
# Make predictions using the X set
predicted_y_values = model.predict(X) 


In [43]:
# Create a copy of the original data
df_va_housing_prediction_predicted = df_va_housing_prediction.copy()

# Add a column with the predicted housing prices
df_va_housing_prediction_predicted["predicted_housing_prices"] = predicted_y_values


# Display sample data
df_va_housing_prediction_predicted.head() 

Unnamed: 0,Date,Sale_Count,Median_Sale_Value,va_mortage_jobs.30-Year_Fixed,va_mortage_jobs.15-Year_Fixed,va_mortage_jobs.5-1_Year_Adj,predicted_housing_prices
0,5/1/2016,90.465116,202588.1202,3.61,2.86,2.8,234800.515613
1,4/1/2017,83.023622,211624.6339,4.1,3.36,3.19,227735.529917
2,6/1/2016,104.333333,208685.5754,3.66,2.92,2.88,247967.058764
3,4/1/2018,84.859375,210415.1445,4.4,3.87,3.62,229478.401434
4,7/1/2016,89.685039,206065.3189,3.41,2.74,2.68,234059.907338


In [44]:
# Create a line plot of the predicted housing prices
best_fit_line2 = df_va_housing_prediction_predicted.hvplot.line(
    x = "Sale_Count",
    y = "predicted_housing_prices",
    color = "red"
)
best_fit_line2

In [45]:
# Superpose the original data and the best fit line

housing_prediction_plot2 * best_fit_line2 

In [46]:
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.35066233988905227.
The r2 is 0.35066233988905227.
The mean squared error is 533865296.6374548.
The root mean squared error is 23105.525240458282.
The standard deviation is 28673.489550821574.


In [47]:
#Linear Regression Model 3

In [48]:
# Create a scatter plot with the sale count vs 30 year information
housing_prediction_plot3 = df_va_housing_prediction.hvplot.scatter(
    x= "va_mortage_jobs.15-Year_Fixed",
    y= "Median_Sale_Value",
    title="VA Housing Prediction Scatter Plot 3"
)
housing_prediction_plot3

In [49]:
# Reformat data of the independent variable X as a single-column array
X = df_va_housing_prediction["va_mortage_jobs.15-Year_Fixed"].values.reshape(-1, 1)
# Display sample data
X[:5]

array([[2.86],
       [3.36],
       [2.92],
       [3.87],
       [2.74]])

In [50]:
# The shape of X is 30 samples, with a single feature (column)

X.shape

(64, 1)

In [51]:
# Create an array for the dependent variable y

y = df_va_housing_prediction["Median_Sale_Value"]

In [52]:
# Create a model with scikit-learn

model = LinearRegression()

In [53]:
# Fit the data into the model

model.fit(X, y)

In [54]:
# Display the slope

print(f"Model's slope: {model.coef_}")

Model's slope: [-32067.53761056]


In [55]:
# Display the y-intercept

print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 330407.0398273684


In [56]:
# Display the model's best fit line formula

print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 330407.0398273684 + -32067.53761056317X


In [57]:
# Display the formula to predict the housing prices...
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}")

# Predict the housing prices for
y_7 = model.intercept_ + model.coef_[0]

# Display the prediction
print(f"Predicted housing prices given 15 year fixed: ${y_7:.2f}") 

Model's formula: y = 330407.0398273684 + -32067.53761056317
Predicted housing prices given 15 year fixed: $298339.50


In [58]:
# Make predictions using the X set
predicted_y_values = model.predict(X) 

In [59]:
# Create a copy of the original data
df_va_housing_prediction_predicted = df_va_housing_prediction.copy()

# Add a column with the predicted housing prices
df_va_housing_prediction_predicted["predicted_housing_prices"] = predicted_y_values


# Display sample data
df_va_housing_prediction_predicted.head()

Unnamed: 0,Date,Sale_Count,Median_Sale_Value,va_mortage_jobs.30-Year_Fixed,va_mortage_jobs.15-Year_Fixed,va_mortage_jobs.5-1_Year_Adj,predicted_housing_prices
0,5/1/2016,90.465116,202588.1202,3.61,2.86,2.8,238693.882261
1,4/1/2017,83.023622,211624.6339,4.1,3.36,3.19,222660.113456
2,6/1/2016,104.333333,208685.5754,3.66,2.92,2.88,236769.830005
3,4/1/2018,84.859375,210415.1445,4.4,3.87,3.62,206305.669274
4,7/1/2016,89.685039,206065.3189,3.41,2.74,2.68,242541.986774


In [60]:
# Create a line plot of the predicted housing prices
best_fit_line3 = df_va_housing_prediction_predicted.hvplot.line(
    x = "va_mortage_jobs.15-Year_Fixed",
    y = "predicted_housing_prices",
    color = "red"
)
best_fit_line3

In [61]:
# Superpose the original data and the best fit line

housing_prediction_plot3 * best_fit_line3 

In [62]:
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.44314118693963955.
The r2 is 0.44314118693963955.
The mean squared error is 457832055.1573351.
The root mean squared error is 21397.010425695807.
The standard deviation is 28673.489550821574.


In [63]:
#Linear Regression Model 4

In [64]:
# Create a scatter plot with the sale count vs 30 year information
housing_prediction_plot4 = df_va_housing_prediction.hvplot.scatter(
    x= "va_mortage_jobs.5-1_Year_Adj",
    y= "Median_Sale_Value",
    title="VA Housing Prediction Scatter Plot 4"
)
housing_prediction_plot4

In [65]:
# Reformat data of the independent variable X as a single-column array
X = df_va_housing_prediction["va_mortage_jobs.5-1_Year_Adj"].values.reshape(-1, 1)
# Display sample data
X[:5]

array([[2.8 ],
       [3.19],
       [2.88],
       [3.62],
       [2.68]])

In [66]:
# The shape of X is 30 samples, with a single feature (column)

X.shape

(64, 1)

In [67]:
# Create an array for the dependent variable y

y = df_va_housing_prediction["Median_Sale_Value"]

In [68]:
# Create a model with scikit-learn

model = LinearRegression()

In [69]:
# Fit the data into the model

model.fit(X, y)

In [70]:
# Display the slope

print(f"Model's slope: {model.coef_}")

Model's slope: [-34026.98286981]


In [71]:
# Display the y-intercept

print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 340089.4443984932


In [72]:
# Display the model's best fit line formula

print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 340089.4443984932 + -34026.98286981396X


In [73]:
# Display the formula to predict the housing prices...
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}")

# Predict the housing prices for
y_7 = model.intercept_ + model.coef_[0]

# Display the prediction
print(f"Predicted housing prices given 5-1 year adjusted: ${y_7:.2f}") 

Model's formula: y = 340089.4443984932 + -34026.98286981396
Predicted housing prices given 5-1 year adjusted: $306062.46


In [74]:
# Make predictions using the X set
predicted_y_values = model.predict(X) 

In [75]:
# Create a copy of the original data
df_va_housing_prediction_predicted = df_va_housing_prediction.copy()

# Add a column with the predicted housing prices
df_va_housing_prediction_predicted["predicted_housing_prices"] = predicted_y_values


# Display sample data
df_va_housing_prediction_predicted.head()

Unnamed: 0,Date,Sale_Count,Median_Sale_Value,va_mortage_jobs.30-Year_Fixed,va_mortage_jobs.15-Year_Fixed,va_mortage_jobs.5-1_Year_Adj,predicted_housing_prices
0,5/1/2016,90.465116,202588.1202,3.61,2.86,2.8,244813.892363
1,4/1/2017,83.023622,211624.6339,4.1,3.36,3.19,231543.369044
2,6/1/2016,104.333333,208685.5754,3.66,2.92,2.88,242091.733733
3,4/1/2018,84.859375,210415.1445,4.4,3.87,3.62,216911.76641
4,7/1/2016,89.685039,206065.3189,3.41,2.74,2.68,248897.130307


In [76]:
# Create a line plot of the predicted housing prices
best_fit_line4 = df_va_housing_prediction_predicted.hvplot.line(
    x = "va_mortage_jobs.5-1_Year_Adj",
    y = "predicted_housing_prices",
    color = "red"
)
best_fit_line4

In [77]:
# Superpose the original data and the best fit line

housing_prediction_plot4 * best_fit_line4 

In [78]:
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.2752765372735504.
The r2 is 0.2752765372735504.
The mean squared error is 595845166.8157854.
The root mean squared error is 24409.939918315766.
The standard deviation is 28673.489550821574.


In [79]:
#Linear Regression Model 5

In [80]:
# Read housing data

df_va_housing_prediction_jobs = pd.read_csv("va_job_sales_mortgage_rates.csv")

# Display sample data
df_va_housing_prediction_jobs.head()

Unnamed: 0,Date,Job_Count,Sale_Count,Median_Sale_Value,30-Year_Fixed,15-Year_Fixed,5-1_Year_Adj
0,5/1/2016,3919.0,90.465116,202588.1202,3.61,2.86,2.8
1,6/1/2016,3944.9,104.333333,208685.5754,3.66,2.92,2.88
2,7/1/2016,3917.2,89.685039,206065.3189,3.41,2.74,2.68
3,8/1/2016,3921.1,91.070312,209099.8047,3.43,2.74,2.73
4,9/1/2016,3932.4,79.634921,199593.3095,3.46,2.77,2.83


In [81]:
# Create a scatter plot with the sale count vs 30 year information
housing_prediction_plot5 = df_va_housing_prediction_jobs.hvplot.scatter(
    x= "Job_Count",
    y= "Median_Sale_Value",
    title="VA Housing Prediction Scatter Plot 5"
)
housing_prediction_plot5

In [91]:
# Reformat data of the independent variable X as a single-column array
X = df_va_housing_prediction_jobs["Job_Count"].values.reshape(-1, 1)
# Display sample data
X[:5]

array([[3919. ],
       [3944.9],
       [3917.2],
       [3921.1],
       [3932.4]])

In [92]:
# The shape of X is 30 samples, with a single feature (column)

X.shape

(64, 1)

In [93]:
# Create an array for the dependent variable y

y = df_va_housing_prediction["Median_Sale_Value"]

In [94]:
# Create a model with scikit-learn

model = LinearRegression()

In [95]:
# Fit the data into the model

model.fit(X, y)

In [96]:
# Display the slope

print(f"Model's slope: {model.coef_}")

Model's slope: [-15.63809135]


In [97]:
# Display the y-intercept

print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 292354.43166526646


In [98]:
# Display the model's best fit line formula

print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 292354.43166526646 + -15.638091349914527X


In [99]:
# Display the formula to predict the housing prices...
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}")

# Predict the housing prices for
y_7 = model.intercept_ + model.coef_[0]

# Display the prediction
print(f"Predicted housing prices based on number of jobs: ${y_7:.2f}") 

Model's formula: y = 292354.43166526646 + -15.638091349914527
Predicted housing prices based on number of jobs: $292338.79


In [100]:
# Make predictions using the X set
predicted_y_values = model.predict(X) 

In [101]:
# Create a copy of the original data
df_va_housing_prediction_jobs_predicted = df_va_housing_prediction_jobs.copy()

# Add a column with the predicted housing prices
df_va_housing_prediction_jobs_predicted["predicted_housing_prices"] = predicted_y_values


# Display sample data
df_va_housing_prediction_jobs_predicted.head()

Unnamed: 0,Date,Job_Count,Sale_Count,Median_Sale_Value,30-Year_Fixed,15-Year_Fixed,5-1_Year_Adj,predicted_housing_prices
0,5/1/2016,3919.0,90.465116,202588.1202,3.61,2.86,2.8,231068.751665
1,6/1/2016,3944.9,104.333333,208685.5754,3.66,2.92,2.88,230663.725099
2,7/1/2016,3917.2,89.685039,206065.3189,3.41,2.74,2.68,231096.900229
3,8/1/2016,3921.1,91.070312,209099.8047,3.43,2.74,2.73,231035.911673
4,9/1/2016,3932.4,79.634921,199593.3095,3.46,2.77,2.83,230859.201241


In [102]:
# Create a line plot of the predicted housing prices
best_fit_line_5 = df_va_housing_prediction_jobs_predicted.hvplot.line(
    x = "Job_Count",
    y = "predicted_housing_prices",
    color = "red"
)
best_fit_line_5

In [103]:
# Superpose the original data and the best fit line

housing_prediction_plot5 * best_fit_line_5 

In [104]:
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.002592779716166116.
The r2 is 0.002592779716166116.
The mean squared error is 820037299.9067802.
The root mean squared error is 28636.293403769632.
The standard deviation is 28673.489550821574.
