In [2]:
import pandas as pd
import numpy as np

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.feature_selection import SelectKBest, f_regression

data = {
    "Area_sqft": [1500, 1800, 2400, 3000, 3500, 4000, 1200, 2000],
    "Bedrooms": [3, 4, 4, 5, 5, 6, 2, 3],
    "Bathrooms": [2, 3, 3, 4, 4, 5, 1, 2],
    "Location": ["Urban", "Urban", "Suburban", "Suburban", "Rural", "Urban", "Rural", "Suburban"],
    "Condition": ["Good", "Excellent", "Good", "Excellent", "Average", "Good", "Average", "Good"],
    "Price": [300000, 400000, 450000, 600000, 500000, 700000, 200000, 420000]
}

df = pd.DataFrame(data)

print("Dataset Preview:")
print(df)


X = df.drop("Price", axis=1)
y = df["Price"]

label_encoder = LabelEncoder()
X["Condition"] = label_encoder.fit_transform(X["Condition"])

X = pd.get_dummies(X, columns=["Location"], drop_first=True)

print("\nAfter Encoding:")
print(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

print("\nTraining Data Size:", X_train.shape)
print("Testing Data Size:", X_test.shape)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

selector = SelectKBest(score_func=f_regression, k=4)

X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

print("\nSelected Features Count:", X_train_selected.shape[1])


model = LinearRegression()
model.fit(X_train_selected, y_train)

y_pred = model.predict(X_test_selected)


print("\nüìå Model Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score:", r2_score(y_test, y_pred))


new_house = pd.DataFrame({
    "Area_sqft": [2500],
    "Bedrooms": [4],
    "Bathrooms": [3],
    "Condition": ["Good"],
    "Location_Suburban": [1],
    "Location_Urban": [0]
})

new_house["Condition"] = label_encoder.transform(new_house["Condition"])

new_house_scaled = scaler.transform(new_house)
new_house_selected = selector.transform(new_house_scaled)

predicted_price = model.predict(new_house_selected)

print("\nüè† Predicted House Price:", predicted_price[0])

Dataset Preview:
   Area_sqft  Bedrooms  Bathrooms  Location  Condition   Price
0       1500         3          2     Urban       Good  300000
1       1800         4          3     Urban  Excellent  400000
2       2400         4          3  Suburban       Good  450000
3       3000         5          4  Suburban  Excellent  600000
4       3500         5          4     Rural    Average  500000
5       4000         6          5     Urban       Good  700000
6       1200         2          1     Rural    Average  200000
7       2000         3          2  Suburban       Good  420000

After Encoding:
   Area_sqft  Bedrooms  Bathrooms  Condition  Location_Suburban  \
0       1500         3          2          2              False   
1       1800         4          3          1              False   
2       2400         4          3          2               True   
3       3000         5          4          1               True   
4       3500         5          4          0              False 