In [2]:
import pandas as pd

# Load the house-sales table (the first CSV column becomes the row index) and
# immediately discard every row that contains a missing value (NaN) in any column.
df = pd.read_csv('data/house_data_fs.csv', index_col=0).dropna()

# Multiple linear regression: two or more input features.
# The selected columns are converted to plain NumPy arrays, the form the
# scikit-learn estimators used below work with.
X = df[['bedrooms', 'sqft_living']].to_numpy()
# Target / output variable the model is trained to predict.
Y = df['price'].to_numpy()

In [3]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature column (bedrooms, sqft_living) so that it has
# zero mean and unit standard deviation:
#     X_scaled = (X - mean) / std
# Step 1 - fit(X): learn the per-column mean and standard deviation from X.
# Step 2 - transform(X): apply those learned statistics to the same data.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [4]:
from sklearn.linear_model import LinearRegression

# Train ("fit") the linear model on:
#   X_scaled - standardized input features (bedrooms, sqft_living)
#   Y        - target values (house prices)
# fit() returns the estimator itself, so construction and training are chained.
lr = LinearRegression().fit(X_scaled, Y)

# lr.coef_      : one weight per input feature, expressed per standard deviation
#                 of that feature (because the inputs were standardized).
# lr.intercept_ : the prediction when every standardized input is zero,
#                 i.e. for a house whose features sit at the training means.
print(lr.coef_, lr.intercept_)

# Test input: one house with 2 bedrooms and 860 sqft of living area.
X_test = [[2, 860]]
# The model was trained on standardized data, so the raw test input has to be
# put on the same scale using the scaler that was already fitted above
# (transform only - the scaler is not re-fitted on the test input).
X_test_scaled = scaler.transform(X_test)
# Predicted price for the test house, computed from the scaled input.
yp = lr.predict(X_test_scaled)
print("Prediction with test data :", yp)

# +1 unit (raw) sqft: 860 → 861
#yp_plus_1sqft = lr.predict(scaler.transform([[2, 861]])) 
#print("Effect of 1 sqft (raw):", yp_plus_1sqft - yp)

# +1 std increase in sqft
# yp_plus_1std = lr.predict(scaler.transform([[2, 860 + scaler.scale_[1]]])) 
# print("Effect of +1 std in sqft:", yp_plus_1std - yp)

# +1 unit (raw) bedrooms: 2 → 3
# yp_plus_1bedroom = lr.predict(scaler.transform([[3, 860]]))
# print("Effect of +1 bedroom (raw):", yp_plus_1bedroom - yp)

# # +1 std increase in bedrooms
# yp_plus_1stdBed = lr.predict(scaler.transform([[2+scaler.scale_[0], 860]])) 
# print("Effect of +1 std in bedrooms:", yp_plus_1stdBed - yp)


# print("predict when increase 1stdBed ",yp_plus_1stdBed)
# print("1std sqft increase ", 860 + scaler.scale_[1])
# print("1std bedroom increase ", 2 + scaler.scale_[0])


[-53567.44561984 288667.55112691] 540166.7341666666
Prediction with test data : [235932.46722026]


In [5]:
# Sanity check: compare the model's prediction with what comparable houses
# actually sold for.
# These two values must stay in sync with the test input X_test = [[2, 860]].
test_bedrooms, test_sqft_living = 2, 860

# Subset of real houses in the dataset that exactly match the test house.
df1 = df[(df['bedrooms'] == test_bedrooms) & (df['sqft_living'] == test_sqft_living)]
# Debug aid: print(df1) shows the matches and len(df1) gives the number of rows
# (df1.size would count cells, i.e. rows * columns, not rows).

if df1.empty:
    # The mean of an empty selection is NaN; report the situation explicitly
    # instead of printing "nan" as if it were a price.
    df1_avg = float('nan')
    print("Average price: no houses in the dataset match the test input exactly")
else:
    # Real-world average sale price of all houses with the same features.
    df1_avg = df1['price'].mean()
    print("Average price:", df1_avg)
print("Predicted price using test input data: ", yp[0])

# Interpretation: is the prediction close to what similar houses sold for?
# In the recorded run the prediction (about 235.9k) is roughly 22% BELOW the
# observed average (about 303.8k), so the model underestimates this kind of
# house - it is not a close match to the actual market data.
 

Average price: 303760.2105263158
Predicted price using test input data:  235932.46722026315


In [6]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
# Score the fitted model on the SAME data it was trained on (X_scaled), so all
# three numbers below are training metrics, not an estimate of how well the
# model generalizes to unseen houses.
Y_pred = lr.predict(X_scaled)

# Mean Absolute Error - average absolute gap between predicted and actual
# price. It is in the target's own units, which makes it easy to interpret,
# and it is less sensitive to outliers than MSE. Lower is better.
mae = mean_absolute_error(Y, Y_pred)

# Mean Squared Error - average squared gap between predicted and actual
# price. Squaring punishes large errors more heavily; useful as an
# optimization objective. Lower is better.
mse = mean_squared_error(Y, Y_pred)

# R^2 (coefficient of determination) - share of the variance in Y that the
# model explains:
#    1.0 -> perfect prediction
#    0.0 -> no better than always predicting the mean of Y
#   < 0  -> worse than always predicting the mean
# Example: 0.85 means 85% of the variation in house prices is explained.
rscore = r2_score(Y, Y_pred)

print('r2-score:', rscore, '\nmean squared error:', mse, '\nmean absolute error:', mae)

r2-score: 0.5068463227480232 
mean squared error: 66465040515.6965 
mean absolute error: 170271.09681228662


In [None]:
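#1. What does a 1-standard-deviation increase mean?
#   Because the features were standardized, each value in lr.coef_ is the change in the
#   predicted price when that feature rises by one standard deviation (scaler.scale_)
#   while the other feature is held fixed.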

In [21]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,Y, 
#                                     test_size = 0.3, random_state=42)
# ytrain_pred = lr.predict(X_scaled)
# print(X_train.shape, X_test.shape)
# X_test_scaleds = scaler.transform(X_test)
# #Uses the trained model to predict the price for the test house with scaled input.
# yp=lr.predict(X_test_scaleds)
# print(f"Shape of y_test: {y_test.shape}")
# print(f"Shape of yp: {yp.shape}")


In [22]:
# import matplotlib.pyplot as plt
# plt.scatter(y_test, yp)
# plt.xlabel("Actual Price")
# plt.ylabel("Predicted Price")
# plt.title("Prediction Error")

In [7]:
#This tells you which features are most related to price.
#Drop or ignore features with very low or 0 correlation.
# df.corr()['price'].sort_values(ascending=False)

In [None]:
#Handle Outliers
# df.describe()