In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import kagglehub

from sklearn.preprocessing import LabelEncoder

In [12]:
# Fetch the latest version of the dataset via kagglehub; the call returns
# the local directory that holds the downloaded files.
path = kagglehub.dataset_download("aravinii/house-price-prediction-treated-dataset")

# Training split shipped with the dataset
train_csv = path + '/df_train.csv'
df = pd.read_csv(train_csv)

In [22]:
# Discard the 'date' column before any further processing
df = df.drop(columns=['date'])

In [23]:
# Prepare the dataset: label-encode selected columns, then pull out the target

# A single encoder instance, re-fitted on each column in turn
label_encoder = LabelEncoder()

encoded_columns = (
    'has_basement',
    'renovated',
    'nice_view',
    'perfect_condition',
    'has_lavatory',
    'single_floor',
)
for column_name in encoded_columns:
    df[column_name] = label_encoder.fit_transform(df[column_name])

# Regression target
Y_df = df['price']

In [24]:
# Quick look at the target series taken from df['price'] in the previous cell
print(Y_df)

0        312000.0
1        310000.0
2        320000.0
3        264500.0
4        700000.0
           ...   
13598    353950.0
13599    289950.0
13600    625504.0
13601    730000.0
13602    383000.0
Name: price, Length: 13603, dtype: float64


In [25]:
#Convert to numpy array

X_data_array = df.to_numpy()
Y_data_array = Y_df.to_numpy()
print(Y_data_array)

[312000. 310000. 320000. ... 625504. 730000. 383000.]


In [26]:
#Test Train split

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_data_array,Y_data_array,test_size=0.05)


m = X_train.shape


In [27]:
print(X_train.shape)
print(Y_train.shape)


(12922, 13)
(12922,)


In [28]:
print(X_test.shape)
print(Y_test.shape)

(681, 13)
(681,)


In [29]:
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
X_train = X_train.reshape(-1,13)
Y_train = Y_train.reshape(-1,1)
lin_reg.fit(X_train,Y_train)

In [45]:
y_predict = lin_reg.predict(X_test.reshape(-1,13))
y_train_predict = lin_reg.predict(X_train)
print("Co-efficients : ",lin_reg.coef_,"\nIntercept : ",lin_reg.intercept_)

Co-efficients :  [[ 1.00000000e+00  1.54647065e-11 -3.86766482e-12  8.67959451e-12
  -4.84937154e-14  7.29141822e-12  7.54438058e-12  9.13990887e-12
   5.19256810e-12  1.56539549e-11 -1.00473680e-11 -2.38409942e-12
   3.67108223e-12]] 
Intercept :  [-1.62981451e-09]


In [31]:
y_train_predict[0]

array([345000.])

In [32]:
Y_train[0]

array([345000.])

In [34]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [35]:
# Compute Mean Squared Error (MSE)
mse = mean_squared_error(Y_test, y_predict)
print("Mean Squared Error (MSE):", mse)

# Compute Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# Compute Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, y_predict)
print("Mean Absolute Error (MAE):", mae)

# Compute R-squared (R²)
r2 = r2_score(Y_test, y_predict)
print("R-squared (R²):", r2)

Mean Squared Error (MSE): 4.266039149389299e-19
Root Mean Squared Error (RMSE): 6.531492286904501e-10
Mean Absolute Error (MAE): 5.334633827844031e-10
R-squared (R²): 1.0


-------- Running test data -----------

In [36]:
df = pd.read_csv(path+'/df_test.csv')
df = df.drop('date',axis = 1)

In [37]:
#Defining the dataset

# Initialize LabelEncoder
label_encoder = LabelEncoder()

df['has_basement'] = label_encoder.fit_transform(df['has_basement'])
df['renovated'] = label_encoder.fit_transform(df['renovated'])
df['nice_view'] = label_encoder.fit_transform(df['nice_view'])
df['perfect_condition'] = label_encoder.fit_transform(df['perfect_condition'])
df['has_lavatory'] = label_encoder.fit_transform(df['has_lavatory'])
df['single_floor'] = label_encoder.fit_transform(df['single_floor'])

Y_test_df = df['price']

In [38]:
#Convert to numpy array

X_test_data_array = df.to_numpy()
Y_test_data_array = Y_test_df.to_numpy()
print(Y_test_data_array)

[305000. 498000. 590000. ... 264000. 612125. 190000.]


In [39]:
print(X_test_data_array.shape)
print(Y_test_data_array.shape)

(6700, 13)
(6700,)


In [42]:
#Prediction using the new training data with the trained model 

y_predict = lin_reg.predict(X_test_data_array.reshape(-1,13))



In [43]:
# Score the predictions for the df_test.csv data against the true prices.
# All four metrics are computed first, then reported in a fixed order.
mse = mean_squared_error(Y_test_data_array, y_predict)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
mae = mean_absolute_error(Y_test_data_array, y_predict)
r2 = r2_score(Y_test_data_array, y_predict)

metric_report = (
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("Mean Absolute Error (MAE):", mae),
    ("R-squared (R²):", r2),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Mean Squared Error (MSE): 4.584593956093119e-19
Root Mean Squared Error (RMSE): 6.770962971463601e-10
Mean Absolute Error (MAE): 5.459769997300942e-10
R-squared (R²): 1.0
