#### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder

#### Pre-processing (improved as much as I could)

In [2]:
DF = pd.read_csv('Housing.csv')
print(DF.head())
print(DF.isnull().sum()) # no missing values found

# Feature Engineering
Features = ['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']
X = DF[Features]
y = DF['price']

# Converting categorical Features to numerical
CatFeatures = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
NumFeatures = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Scaling numerical Features
NumTrans = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# OneHot Encoding for categorical Features
CatTrans = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))  # drop='first' to avoid dummy variable trap
])

# Combining the Features again
Preprocessor = ColumnTransformer(
    transformers=[
        ('num', NumTrans, NumFeatures),
        ('cat', CatTrans, CatFeatures)
    ])

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
price               0
area                0
bedrooms            0
bathrooms           0
stories    

#### Creating a model pipeline and training the model

In [3]:
ModelPipeline = Pipeline(steps=[
    ('preprocessor', Preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', LinearRegression())
])
XTrain, XTest, YTrain, YTest = train_test_split(X, y, test_size=0.2, random_state=42)
ModelPipeline.fit(XTrain, YTrain)
YPred = ModelPipeline.predict(XTest)

In [4]:
print(YPred)

[ 4.70447939e+06  7.76828739e+06  3.80335939e+06  4.49148739e+06
  3.79721539e+06  3.27497539e+06  5.36393539e+06  5.70031939e+06
  2.63804739e+06  2.96997254e+06  1.21366714e+07  2.82031939e+06
  2.81827139e+06  3.06956081e+06  3.61494339e+06  2.05027139e+06
  3.54940739e+06  4.32764739e+06  4.65942339e+06  4.22934339e+06
  6.34697539e+06  5.92918339e+06  2.76707139e+06  5.15503939e+06
  4.18019139e+06  7.61468739e+06  4.46281539e+06  4.38089539e+06
  9.72822339e+06  3.17871939e+06  6.39817539e+06  3.63542339e+06
  6.01929539e+06  4.54063939e+06  3.27497539e+06  6.54358339e+06
  5.00553539e+06  2.46396739e+06  2.55612739e+06  5.00348739e+06
  4.09007939e+06  2.82031939e+06  7.72937539e+06  4.47715139e+06
  4.39523139e+06  3.63951939e+06  7.55939139e+06  4.31126339e+06
  3.54121539e+06  3.55759939e+06  7.44060739e+06  2.85308739e+06
  4.25596739e+06  3.62006339e+06  2.96777539e+06  3.01257086e+06
  6.31830339e+06  3.07836739e+06  4.03068739e+06  2.79164739e+06
  3.69481539e+06  2.53359

#### Evaluating the model

In [5]:
MSE = mean_squared_error(YTest, YPred)
R2 = r2_score(YTest, YPred)
print(f"Mean Squared Error: {MSE}")
print(f"R-squared: {R2}")
cvScores = cross_val_score(ModelPipeline, X, y, cv=5, scoring='r2')
print(f"Cross-validated R-squared: {np.mean(cvScores)}")

Mean Squared Error: 1.6115749540478978e+36
R-squared: -3.188352076437246e+23
Cross-validated R-squared: -1.0845265397448855e+18


#### Predicting prices

In [6]:
NewData = pd.DataFrame({
    'area': [1500, 2000],
    'bedrooms': [3, 4],
    'bathrooms': [2, 3],
    'stories': [2, 3],
    'mainroad': ['yes', 'yes'],
    'guestroom': ['no', 'no'],
    'basement': ['no', 'no'],
    'hotwaterheating': ['no', 'no'],
    'airconditioning': ['yes', 'yes'],
    'parking': [2, 3],
    'prefarea': ['yes', 'yes'],
    'furnishingstatus': ['furnished', 'furnished']
})

PredictedPrices = ModelPipeline.predict(NewData)
print(f"Predicted Prices: {PredictedPrices}")

Predicted Prices: [6105311.3853211 9062623.3853211]
