In [25]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [26]:
# Load the raw dataset; `dataset` is kept as the untouched, as-loaded frame.
dataset = pd.read_csv("Housing.csv")
# The cleaning/encoding/modelling cells below all operate on a frame named
# `data`. Create it here as an independent working copy so the notebook runs
# top-to-bottom on a fresh kernel without relying on leftover kernel state.
data = dataset.copy()

In [27]:
# (rows, columns) of the frame loaded from Housing.csv
dataset.shape

(545, 13)

In [28]:
# Inspect the raw data (this cell only looks at the frame; it changes nothing).
# It reads `dataset`, the frame returned by pd.read_csv in the load cell, so it
# does not depend on any name created outside that cell.
print("First 5 rows of the data:")
print(dataset.head())

print("\nSummary statistics of the data:")
print(dataset.describe())

print("\nInformation about the data types and missing values:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line after the report.
dataset.info()

First 5 rows of the data:
      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  

Summary statistics of the data:
              price          area    bed

In [38]:
# LinearRegression cannot consume text values directly, so collect the names of
# the object-dtype (non-numeric) columns; they are encoded in a later cell.
object_columns_frame = data.select_dtypes(include="object")
categorical_cols = object_columns_frame.columns
print("\nCategorical columns:", categorical_cols)


Categorical columns: Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')


In [29]:
# Missing-value handling: discard every row that has at least one missing
# entry, then print the per-column non-null counts to confirm the result.
data = data.dropna(axis=0, how="any")
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 545 entries, 0 to 544
Data columns (total 13 columns):
price               545 non-null int64
area                545 non-null int64
bedrooms            545 non-null int64
bathrooms           545 non-null int64
stories             545 non-null int64
mainroad            545 non-null object
guestroom           545 non-null object
basement            545 non-null object
hotwaterheating     545 non-null object
airconditioning     545 non-null object
parking             545 non-null int64
prefarea            545 non-null object
furnishingstatus    545 non-null object
dtypes: int64(6), object(7)
memory usage: 44.7+ KB


In [39]:
# One-hot encode the categorical features, creating new binary (0 or 1) columns.
# drop_first=True keeps k-1 indicator columns for a feature with k levels.
#
# `data` is overwritten with the encoded frame, so on a second run of this cell
# the original categorical columns no longer exist and get_dummies would raise
# KeyError. Encoding only the columns that are still present makes a re-run a
# harmless no-op while the first run behaves exactly as before.
cols_to_encode = [col for col in categorical_cols if col in data.columns]
if cols_to_encode:
    data = pd.get_dummies(data, columns=cols_to_encode, drop_first=True)

In [40]:
# The target is the 'price' column; every other column is used as a feature.
target_column = 'price'
y = data[target_column]
X = data.drop(target_column, axis=1)


In [41]:
# Preview the first rows of the feature matrix and of the target series.
print("\nFeatures (X):", X.head(), sep="\n")

print("\nTarget Variable (y):", y.head(), sep="\n")


Features (X):
   area  bedrooms  bathrooms  stories  parking  mainroad_yes  guestroom_yes  \
0  7420         4          2        3        2             1              0   
1  8960         4          4        4        3             1              0   
2  9960         3          2        2        2             1              0   
3  7500         4          2        2        3             1              0   
4  7420         4          1        2        2             1              1   

   basement_yes  hotwaterheating_yes  airconditioning_yes  prefarea_yes  \
0             0                    0                    1             1   
1             0                    0                    1             0   
2             1                    0                    0             1   
3             1                    0                    1             1   
4             1                    0                    1             0   

   furnishingstatus_semi-furnished  furnishingstatus_unfurn

In [42]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [43]:
# Model choice: ordinary least-squares linear regression with an intercept term
# (fit_intercept=True is the library default, spelled out here for the reader).
model = LinearRegression(fit_intercept=True)

In [44]:
# Fit the regression on the training split. fit() returns the estimator, so
# leaving the call as the last expression displays the fitted model's repr.
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [45]:
# Predict prices for the held-out test rows
y_pred = model.predict(X=X_test)

In [46]:
# Score the test-set predictions against the true prices.
# MSE is expressed in squared price units; R^2 is unitless.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

report_lines = [
    "\nModel Evaluation:",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R^2): {r2:.2f}",
]
print("\n".join(report_lines))


Model Evaluation:
Mean Squared Error (MSE): 1754318687330.66
R-squared (R^2): 0.65
