In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression
import psycopg2
import matplotlib.pyplot as plt
import seaborn as sns
import configparser

# Notebook-wide pandas display settings: show up to 150 columns and 255 rows
# before truncating, and render floats with thousands separators and two decimals.
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 255)
pd.set_option("display.float_format", "{:,.2f}".format)

In [2]:
# Load the PostgreSQL password from a local file so the credential never
# appears in the notebook itself.
# NOTE(review): the file is named 'config.py' but is parsed as INI text by
# configparser; it must contain a [postgresql] section with a `password` key.
CONFIG_FILE = 'config.py'

config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file was not read.
# Fail here with a clear message instead of a bare KeyError further down.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(
        f"Could not read '{CONFIG_FILE}'; create it with a [postgresql] "
        "section containing a 'password' entry."
    )

# has_option() is False when either the section or the key is missing.
if not config.has_option('postgresql', 'password'):
    raise KeyError(
        f"'{CONFIG_FILE}' has no 'password' entry in a [postgresql] section."
    )

password = config['postgresql']['password']

In [3]:
# NOTE(review): psycopg2 is already imported in the first cell; this repeat is
# redundant and is kept only so the cell is unchanged apart from the connect call.
import psycopg2

# Open the PostgreSQL connection used to load the home-price table.
# The settings are passed as separate keyword arguments rather than being
# interpolated into a 'postgresql://user:password@host:port/db' URI: inside a
# URI, a password containing characters such as '@', ':', '/' or '%' must be
# percent-encoded, otherwise the URI is parsed incorrectly or rejected.
connection = psycopg2.connect(
    host='localhost',
    port=5432,
    dbname='home_price_post_db',
    user='postgres',
    password=password,
)

In [4]:
# Name of the PostgreSQL table to load
table_name = 'post_home_prices_22column'

# Read the whole table into a DataFrame. The connection is closed in a
# `finally` clause so it is released even when the query raises.
# NOTE(review): `table_name` is a hard-coded constant, so building the query
# with an f-string is acceptable here; do not do this with user-supplied input.
# NOTE(review): pandas documents SQLAlchemy connectables as the supported input
# for read_sql and emits a UserWarning for other DBAPI2 connections such as
# psycopg2 — consider switching if SQLAlchemy becomes available.
try:
    df = pd.read_sql(f"SELECT * FROM {table_name}", connection)
finally:
    connection.close()

  df = pd.read_sql(f"SELECT * FROM {table_name}", connection)


In [5]:
# Keep only the two columns used by the regression, copying them so later
# changes to home_df never touch the full table held in df.
cols = ['GrLivArea', 'SalePrice']
home_df = df.loc[:, cols].copy()

# Preview the first rows
home_df.head()

Unnamed: 0,GrLivArea,SalePrice
0,1710,208500
1,1262,181500
2,1786,223500
3,1717,140000
4,2198,250000


In [6]:
# Scatter plot of sale price (y) against living area (x)
scatter_options = {
    "x": "GrLivArea",
    "y": "SalePrice",
    "title": "Gross Living Area",
}
Gross_living_plot = home_df.hvplot.scatter(**scatter_options)
Gross_living_plot

In [7]:
# Build the feature matrix X from the GrLivArea column, reshaped into a
# single-column 2-D array (one row per home).
living_area_values = home_df["GrLivArea"].to_numpy()
X = living_area_values.reshape(-1, 1)

# Display sample data
X[:5]

array([[1710],
       [1262],
       [1786],
       [1717],
       [2198]], dtype=int64)

In [8]:
# Select the dependent variable y: the SalePrice column of home_df (a pandas Series)
y = home_df["SalePrice"]

In [9]:
# Instantiate an unfitted scikit-learn LinearRegression estimator with its default settings
model = LinearRegression()

In [10]:
# Fit the model on the feature matrix X (GrLivArea) and the target y (SalePrice)
model.fit(X, y)

In [11]:
# Display the fitted slope: coef_ is an array holding the coefficient for the
# single feature (GrLivArea)
print(f"Model's slope: {model.coef_}")

Model's slope: [107.13035897]


In [12]:
# Display the fitted y-intercept: the value of the fitted line where GrLivArea is 0
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 18569.02585648728


In [13]:
# Display the best-fit line as y = intercept + slope * X
# (y is SalePrice, X is GrLivArea)
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 18569.02585648728 + 107.13035896582517X


In [14]:
# Predict SalePrice for every row of X, the same data the model was fitted on
# (in-sample predictions)
predicted_y_values = model.predict(X)

In [15]:
# Build a new DataFrame from home_df with the model's predicted sale price
# appended; `assign` returns a new frame, so home_df itself is left untouched.
df_sales_predicted = home_df.assign(sales_predicted=predicted_y_values)

# Display sample data
df_sales_predicted.head()

Unnamed: 0,GrLivArea,SalePrice,sales_predicted
0,1710,208500,201761.94
1,1262,181500,153767.54
2,1786,223500,209903.85
3,1717,140000,202511.85
4,2198,250000,254041.55


In [16]:
# Line plot of the predicted sale price against living area (the best-fit line)
line_options = {
    "x": "GrLivArea",
    "y": "sales_predicted",
    "color": "red",
}
best_fit_line = df_sales_predicted.hvplot.line(**line_options)
best_fit_line

In [17]:
# Overlay the best-fit line on the scatter plot of the original data
Gross_living_plot * best_fit_line

In [18]:
# Living area (same units as the GrLivArea column) to predict a sale price for.
example_living_area = 2000

# Display the formula with the same value that the calculation below uses, so
# the printed formula and the printed prediction always agree.
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {example_living_area}")

# Predict the sale price by evaluating the fitted line at that living area
predicted_price = model.intercept_ + model.coef_[0] * example_living_area

# Alias kept so any code still referring to the earlier variable name keeps
# working; the value is a predicted sale price, not a room count.
room_ct = predicted_price

# Display the prediction
print(f"Predicted sale price for a living area of {example_living_area}: ${predicted_price:.2f}")

Model's formula: y = 18569.02585648728 + 107.13035896582517 * 100
Predicted sales by room ct: $232829.74


In [19]:
# Living-area values from 1000 to 3500 in steps of 500, shaped as a one-column
# array so they can be passed to model.predict
gross_sqft = np.arange(1000, 3501, 500).reshape(-1, 1)

# Display the array
gross_sqft

array([[1000],
       [1500],
       [2000],
       [2500],
       [3000],
       [3500]])

In [20]:
# Predict the sale price for each living-area value in gross_sqft
predicted_sales = model.predict(gross_sqft)

In [21]:
# Tabulate each living-area value next to its predicted sale price.
# gross_sqft is a one-column 2-D array, so it is flattened to 1-D for the column.
prediction_columns = {
    "gross_sqft": gross_sqft.ravel(),
    "predicted_sales": predicted_sales,
}
df_predicted_sales = pd.DataFrame(prediction_columns)

# Display data
df_predicted_sales

Unnamed: 0,gross_sqft,predicted_sales
0,1000,125699.38
1,1500,179264.56
2,2000,232829.74
3,2500,286394.92
4,3000,339960.1
5,3500,393525.28


In [22]:
from sklearn.metrics import mean_squared_error, r2_score

In [23]:
# Evaluate the fitted line on the same data it was trained on.
# Error metrics: mean squared error and its square root, from the in-sample predictions
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Goodness of fit, computed two ways: the estimator's own score method and
# r2_score on the in-sample predictions
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)

# Standard deviation of the observed target values y, for comparison with the RMSE
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.5021486502718042.
The r2 is 0.5021486502718042.
The mean squared error is 3139843209.6665273.
The root mean squared error is 56034.303865279944.
The standard deviation is 79415.29188606751.
