# In [None]:
# ============================================
# HOUSING PRICE PREDICTION - END TO END FLOW
# ============================================

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# --------------------------------------------
# 2. Load Data
# --------------------------------------------
# Read the housing table into a DataFrame; point the path at your own copy.
data = pd.read_csv("housing.csv")

# First look at the input: its dimensions, then its column labels.
for label, value in (("Data Shape:", data.shape), ("Columns:", data.columns)):
    print(label, value)

# --------------------------------------------
# 3. Data Sanity Check
# --------------------------------------------
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None" line.
data.info()  # Check data types
print(data.describe())  # Summary stats
print("Missing Values:\n", data.isnull().sum())  # Per-column count of nulls

# Optional: Drop duplicates (modifies `data` in place)
data.drop_duplicates(inplace=True)

# --------------------------------------------
# 4. Exploratory Data Analysis (EDA)
# --------------------------------------------
# Univariate Analysis: one histogram with a KDE overlay per numeric column.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
numeric_cols = numeric_frame.columns
for col in numeric_cols:
    plt.figure(figsize=(6, 4))
    hist_ax = sns.histplot(numeric_frame[col], kde=True)
    hist_ax.set_title(f"Distribution of {col}")
    plt.show()

# Correlation Heatmap: pairwise correlations of the numeric columns, with each
# coefficient annotated in its cell.
plt.figure(figsize=(10, 8))
heat_ax = sns.heatmap(numeric_frame.corr(), annot=True, cmap='coolwarm')
heat_ax.set_title("Correlation Heatmap")
plt.show()

# --------------------------------------------
# 5. Feature Engineering
# --------------------------------------------
# Handle missing values: fill gaps in numeric columns with that column's median.
# numeric_only=True is needed because the frame may also hold text columns;
# without it, pandas >= 2.0 raises a TypeError when asked for the median of
# non-numeric data. Non-numeric columns are not filled by this step.
# NOTE(review): the medians are computed over every row, including rows that
# later land in the test split; consider imputing inside the model pipeline
# if that leakage matters for your evaluation.
numeric_medians = data.median(numeric_only=True)
data.fillna(numeric_medians, inplace=True)

# Separate features and target
X = data.drop("SalePrice", axis=1)
y = data["SalePrice"]

# Identify categorical and numeric columns (by dtype) among the features
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipeline: standardize numeric columns and one-hot encode the
# categorical ones; categories unseen during fitting are ignored at predict time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# --------------------------------------------
# 6. Split Data
# --------------------------------------------
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# --------------------------------------------
# 7. Model Training
# --------------------------------------------
# Chain the preprocessing and the regressor so both are fitted on the
# training rows in a single call.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

model.fit(X_train, y_train)

# --------------------------------------------
# 8. Model Evaluation
# --------------------------------------------
# Score the fitted pipeline on the held-out rows.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # root of the mean squared error
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
# \u00b2 is the superscript two; spelling it as an escape keeps the source
# ASCII-only so the label cannot be garbled by a wrong file encoding.
print(f"R\u00b2 Score: {r2}")

# --------------------------------------------
# 9. Save Model (Optional)
# --------------------------------------------
import joblib

# Persist the whole fitted pipeline (preprocessing + regressor) to disk.
model_path = "housing_price_model.pkl"
joblib.dump(model, model_path)
print("Model saved successfully!")
