In [5]:
# -----------------------------
# LINEAR REGRESSION WITH CATEGORICAL FEATURES
# -----------------------------

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# 1. Load dataset
df = pd.read_csv("House Price Prediction Dataset.csv")

print("Dataset Head:")
print(df.head())

# 2. Identify target column (change if your target name is different)
target_column = "Price"
y = df[target_column]

# Drop target column from feature set
X = df.drop(target_column, axis=1)

# 3. Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

# 4. Preprocessing: One-Hot Encoding for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ]
)

# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Build Pipeline (Preprocessing + Model)
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('lr', LinearRegression())
])

# 7. Train the model
model.fit(X_train, y_train)

# 8. Make predictions
y_pred = model.predict(X_test)

# 9. Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print("\nModel Performance:")
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)

# 10. Model is trained successfully
print("\nModel training completed with categorical encoding.")


Dataset Head:
   Id  Area  Bedrooms  Bathrooms  Floors  YearBuilt  Location  Condition  \
0   1  1360         5          4       3       1970  Downtown  Excellent   
1   2  4272         5          4       3       1958  Downtown  Excellent   
2   3  3592         2          2       3       1938  Downtown       Good   
3   4   966         4          2       2       1902  Suburban       Fair   
4   5  4926         1          4       2       1975  Downtown       Fair   

  Garage   Price  
0     No  149919  
1     No  424998  
2     No  266746  
3    Yes  244020  
4    Yes  636056  
Categorical Columns: ['Location', 'Condition', 'Garage']
Numerical Columns: ['Id', 'Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt']

Model Performance:
MAE: 242867.44926338634
MSE: 78279764120.86243
RMSE: 279785.21069002635

Model training completed with categorical encoding.
