#### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score

#### Pre-Processing

In [2]:
data = pd.read_csv('Housing.csv')
print(data.head())

print(data.isnull().sum()) # as no null values so no futher steps

X = data[['area', 'bedrooms', 'bathrooms']] # only used the 3 features as in the question
y = data['price']

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Experimented to find which values give best results
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
price               0
area                0
bedrooms            0
bathrooms           0
stories    

In [3]:
print(X)
print(X_poly)
print(y)

     area  bedrooms  bathrooms
0    7420         4          2
1    8960         4          4
2    9960         3          2
3    7500         4          2
4    7420         4          1
..    ...       ...        ...
540  3000         2          1
541  2400         3          1
542  3620         2          1
543  2910         3          1
544  3850         3          1

[545 rows x 3 columns]
[[ 1.04672629e+00  1.40341936e+00  1.42181174e+00 ...  1.96958589e+00
   1.99539811e+00  2.02154861e+00]
 [ 1.75700953e+00  1.40341936e+00  5.40580863e+00 ...  1.96958589e+00
   7.58661648e+00  2.92227670e+01]
 [ 2.21823241e+00  4.72783117e-02  1.42181174e+00 ...  2.23523876e-03
   6.72208584e-02  2.02154861e+00]
 ...
 [-7.05920661e-01 -1.30886273e+00 -5.70186712e-01 ...  1.71312166e+00
   7.46296139e-01  3.25112886e-01]
 [-1.03338891e+00  4.72783117e-02 -5.70186712e-01 ...  2.23523876e-03
  -2.69574651e-02  3.25112886e-01]
 [-5.99839399e-01  4.72783117e-02 -5.70186712e-01 ...  2.23523876e-03
  -2

#### Model (LR)

In [4]:
# Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#### Evaluating the model

In [5]:
# Tried to improve the below as much as I can but can't do more then this on 3 features
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 2577444243113.77
R-squared: 0.49007648178002317


#### Cross-validating and predicting

In [10]:
cv_scores = cross_val_score(model, X_poly, y, cv=3, scoring='r2')
print(f"Cross-validated R-squared: {np.mean(cv_scores)}")

new_data = pd.DataFrame({
    'area': [1500, 2000],
    'bedrooms': [3, 4],
    'bathrooms': [2, 3]
})
new_data_scaled = scaler.transform(new_data)
new_data_poly = poly.transform(new_data_scaled)
predicted_prices = model.predict(new_data_poly)
print(f"Predicted Prices: {predicted_prices}")

Cross-validated R-squared: -6.494824372173756
Predicted Prices: [3356432.02620671 4879327.78078459]
