In [2]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the file inside the dataset that should be loaded.
file_path = "data.csv"

# Load the latest version of the dataset as a pandas DataFrame.
# `dataset_load` is the current entry point; the older `load_dataset`
# spelling is deprecated and emits a DeprecationWarning when called.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "shree1992/housedata",
    file_path,
    # Optional extras such as sql_query or pandas_kwargs go here. See the
    # documentation for more information:
    # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)
df.head()

  df = kagglehub.load_dataset(


Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [3]:
pip install kagglehub pandas scikit-learn matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.


In [4]:
# Summarize the DataFrame: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [68]:
# Separate the prediction target ("price") from the predictor columns.
X = df.drop(columns="price")
y = df["price"]


In [69]:
# Group the predictor columns by dtype so each group can get its own preprocessing.
numeric_dtypes = ["int64", "float64"]
numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(include="object").columns


In [70]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression


In [71]:
# Numeric preprocessing: fill missing values with the column mean, then standardize.
mean_imputer = SimpleImputer(strategy="mean")
standardizer = StandardScaler()
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", mean_imputer),
        ("scaler", standardizer),
    ]
)


In [72]:
# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode.
mode_imputer = SimpleImputer(strategy="most_frequent")
# handle_unknown="ignore": categories not seen during fit do not raise at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", mode_imputer),
        ("encoder", one_hot_encoder),
    ]
)


In [73]:
# Route each column group through its matching preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)


In [74]:
# Full estimator: column preprocessing followed by a linear regression model.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", LinearRegression()),
    ]
)


In [75]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [76]:
# Fit the preprocessing steps and the regression model on the training split.
pipeline.fit(X_train, y_train)


In [77]:
# Predict prices for the held-out test split.
y_pred = pipeline.predict(X_test)


In [78]:
# Report the parameters learned by the regression step of the pipeline.
fitted_model = pipeline.named_steps["model"]

print("Coefficient (m):", fitted_model.coef_)

print("Intercept (b):", fitted_model.intercept_)


Coefficient (m): [-5.29522648e+03 -5.20554107e+03  9.32514183e+04 ... -2.19332233e+04
  2.05576031e+04 -1.36763891e-06]
Intercept (b): 447971.5892112515


In [79]:
# Recompute the test-set predictions ahead of the evaluation metrics.
# NOTE(review): this is the same call as in an earlier cell; one of the two
# is likely redundant.
y_pred = pipeline.predict(X_test)

In [80]:

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Error metrics on the held-out test split; RMSE is the square root of MSE.
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)

# Print the metrics one per line, in a fixed order.
for metric_label, metric_value in (
    ("MSE:", MSE),
    ("MAE:", MAE),
    ("RMSE:", RMSE),
    ("R2 Score:", R2),
):
    print(metric_label, metric_value)


MSE: 15953040648.51803
MAE: 86787.32952457291
RMSE: 126305.34687224461
R2 Score: 0.621204897325119


In [81]:
# Drop price outliers using the 1.5 * IQR fences.
price = df["price"]
Q1, Q3 = price.quantile(0.25), price.quantile(0.75)
IQR = Q3 - Q1

# Rows whose price falls outside [lower, upper] are discarded.
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
within_fences = (price >= lower) & (price <= upper)

df = df[within_fences]


In [82]:
from scipy import stats
import numpy as np

# Keep only rows whose price lies within 3 standard deviations of the mean.
price_zscores = stats.zscore(df["price"])
keep_rows = np.abs(price_zscores) < 3
df = df[keep_rows]


In [83]:
# Feature engineering.
# The new columns are built with `assign`, which returns a fresh DataFrame.
# At this point `df` is a filtered selection of the loaded data, and assigning
# columns onto such a selection in place can raise pandas'
# SettingWithCopyWarning.
# NOTE(review): `price_per_sqft` is derived from the target (`price`); using it
# as a predictor of price would leak the target — confirm how it is used.
df = df.assign(
    house_age=2025 - df["yr_built"],  # age relative to the fixed reference year 2025
    price_per_sqft=df["price"] / df["sqft_living"],
)
