In [2]:
# imports
import pandas as pd

from sklearn.linear_model import LinearRegression

from sklearn.metrics import root_mean_squared_error

from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyRegressor

# Practice

In [3]:
# Load the cleaned housing data into a DataFrame.
# The path is relative, so 'houses_clean.csv' has to sit in the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='houses_clean.csv')


In [4]:
# Sanity check: preview the first five rows to confirm the file loaded
# with the expected columns.
df.head(n=5)

Unnamed: 0,beds,baths,size,lot_size,price
0,3,2.5,2590.0,6000.0,795000.0
1,4,2.0,2240.0,0.31,915000.0
2,4,3.0,2040.0,3783.0,950000.0
3,4,3.0,3800.0,5175.0,1950000.0
4,2,2.0,1042.0,0.0,950000.0


In [5]:
# Descriptive stats: count, mean, std, min, quartiles and max per numeric column.
# NOTE(review): in the recorded output lot_size has a 25th percentile of 1.0
# and a max of 9998.0, and the preview rows show 0.31 next to 6000.0 —
# possibly mixed units; confirm before relying on this feature.
# NOTE(review): max price (25,000,000) is far above the 75th percentile
# (1,115,000), so a few extreme listings may dominate RMSE — verify.
df.describe()

Unnamed: 0,beds,baths,size,lot_size,price
count,2521.0,2521.0,2521.0,2521.0,2521.0
mean,2.877033,2.171956,1758.998017,3240.947017,966821.7
std,1.247518,1.004397,921.608036,2872.985386,887011.1
min,1.0,0.5,250.0,0.0,159000.0
25%,2.0,1.5,1086.0,1.0,605000.0
50%,3.0,2.0,1580.0,3080.0,813000.0
75%,4.0,2.5,2270.0,5500.0,1115000.0
max,15.0,9.0,11010.0,9998.0,25000000.0


In [6]:
# Missing-value audit: number of null entries in each column
# (isnull is pandas' alias for isna).
df.isnull().sum()

beds        0
baths       0
size        0
lot_size    0
price       0
dtype: int64

In [9]:
# Target vector: the 'price' column.
y = df['price']
# Feature matrix: every remaining column (a new frame; df itself is untouched).
X = df.drop(columns=['price'])

In [10]:
# Train/test split using scikit-learn's default proportions (no test_size given).
# random_state is pinned so the same rows land in each partition on every
# Restart & Run All; without it the split — and every score computed from
# it further down — changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
# Compare partition sizes: (rows, columns) of the train and test feature frames.
tuple(part.shape for part in (X_train, X_test))

((1890, 4), (631, 4))

In [12]:
# Baseline model: DummyRegressor with its default strategy spelled out.
# It ignores the features and serves as the floor the real models must beat.
dum = DummyRegressor(strategy='mean')

In [13]:
# Fit the baseline on the training partition only.
dum.fit(X=X_train, y=y_train)

In [14]:
# Evaluate the baseline on the held-out test partition.
dum_preds = dum.predict(X_test)
# RMSE is expressed in the same units as price.
dum_rmse = root_mean_squared_error(y_true=y_test, y_pred=dum_preds)
# score() returns the coefficient of determination (R^2).
dum_r2 = dum.score(X_test, y_test)
print(f'The r^2 is {dum_r2} and the RMSE is {dum_rmse}')

The r^2 is -0.0028444031200667297 and the RMSE is 1158281.734700673


# ToDo

In [15]:
# Simple linear regression: build a feature matrix with a single feature.
# Selecting with a one-element list keeps the result a 2-D DataFrame,
# which is the shape the estimator is fitted and scored on below.
X_single_train = X_train.loc[:, ['size']]
X_single_test = X_test.loc[:, ['size']]

In [16]:
# Create the simple (one-feature) linear regression and train it in one step;
# fit() returns the estimator itself, so slr is the fitted model.
slr = LinearRegression().fit(X_single_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
slr

In [17]:
# Evaluate the one-feature model on the held-out test partition.
slr_preds = slr.predict(X_single_test)
# RMSE: typical prediction error, in the same units as price.
slr_rmse = root_mean_squared_error(y_true=y_test, y_pred=slr_preds)
# R^2: share of the test-set price variance explained by the model.
slr_r2 = slr.score(X_single_test, y_test)

print(f'The r^2 is {slr_r2:.4f} and the RMSE is {slr_rmse:,.2f}')

The r^2 is 0.1815 and the RMSE is 1,046,419.36


In [18]:
# Create the multiple linear regression (all feature columns) and train it;
# fit() returns the estimator itself, so mlr is the fitted model.
mlr = LinearRegression().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
mlr

In [19]:
# Evaluate the all-features model on the held-out test partition.
mlr_preds = mlr.predict(X_test)
# RMSE: typical prediction error, in the same units as price.
mlr_rmse = root_mean_squared_error(y_true=y_test, y_pred=mlr_preds)
# R^2: share of the test-set price variance explained by the model.
mlr_r2 = mlr.score(X_test, y_test)

print(f'The r^2 is {mlr_r2:.4f} and the RMSE is {mlr_rmse:,.2f}')

The r^2 is 0.1854 and the RMSE is 1,043,895.64


In [20]:
# Recap: one line per model, in the order they were built.
# Each line is labelled so the three results can be told apart, and the
# numbers use the same formatting as the per-model score cells
# (4-decimal R^2, thousands-separated RMSE).
print(f'Dummy baseline: the r^2 is {dum_r2:.4f} and the RMSE is {dum_rmse:,.2f}')
print(f'Simple linear regression (size): the r^2 is {slr_r2:.4f} and the RMSE is {slr_rmse:,.2f}')
print(f'Multiple linear regression (all features): the r^2 is {mlr_r2:.4f} and the RMSE is {mlr_rmse:,.2f}')

The r^2 is -0.0028444031200667297 and the RMSE is 1158281.734700673
The r^2 is 0.18150379977516573 and the RMSE is 1046419.355492563
The r^2 is 0.18544708398987497 and the RMSE is 1043895.6352810638


For any help, refer to the solution notebook present in the current directory. 