# Housing Price Prediction Dataset
## RSK World - Free Programming Resources & Source Code
### Website: https://rskworld.in
### Contact: help@rskworld.in, support@rskworld.in
### Phone: +91 93305 39277
### Founder: Molla Samser
### Designer & Tester: Rima Khatun

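This notebook explores the housing price dataset interactively: it loads and summarizes the data, visualizes price distributions and feature correlations, and trains Linear Regression and Random Forest models to predict house prices.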


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Configure display defaults. Passing None lifts pandas' column-count limit
# and lets pandas work out the display width on its own.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

# Plot styling: seaborn's 'whitegrid' theme and a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})


## 1. Load and Explore the Dataset


In [None]:
# Read the housing data from the CSV file (path is relative to the working directory)
df = pd.read_csv('housing_prices.csv')

# Report the DataFrame's shape and its column names in a single print call
print(f"Dataset Shape: {df.shape}", f"\nColumns: {list(df.columns)}", sep="\n")

# Final expression of the cell, so the notebook renders the first rows
df.head()


In [None]:
# Statistical summary of the loaded DataFrame via pandas' describe().
# Kept as the cell's final expression so the notebook renders the result.
df.describe()


In [None]:
# Count the null entries in each column and print them under a heading
missing_counts = df.isnull().sum()
print("Missing Values:", missing_counts, sep="\n")


## 2. Data Visualization


In [None]:
# Histogram of the 'price' column, drawn through matplotlib's Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['price'], bins=30, edgecolor='black', alpha=0.7)
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of House Prices')

# Plain tick labels keep the price axis out of scientific notation
ax.ticklabel_format(style='plain', axis='x')
ax.grid(True, alpha=0.3)
plt.show()


In [None]:
# Pairwise correlations between the numeric columns of the dataset
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns
corr_matrix = numeric_frame.corr()

# Annotated heatmap on a diverging palette centred at zero
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {"shrink": 0.8},
}
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Heatmap of Housing Features')
fig.tight_layout()
plt.show()


In [None]:
# Scatter plot of 'price' against 'sqft_living', drawn through the Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    df['sqft_living'],
    df['price'],
    alpha=0.5,
    edgecolors='black',
    linewidth=0.5,
)
ax.set_xlabel('Square Feet Living')
ax.set_ylabel('Price ($)')
ax.set_title('Price vs Square Feet Living')

# Plain tick labels keep the price axis out of scientific notation
ax.ticklabel_format(style='plain', axis='y')
ax.grid(True, alpha=0.3)
plt.show()


## 3. Feature Selection


In [None]:
# Columns used as model inputs (order is preserved in the feature matrix)
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'sqft_living15', 'sqft_lot15',
]

# Feature matrix X and the 'price' target y
X, y = df[feature_columns], df['price']

print(f"Features shape: {X.shape}", f"Target shape: {y.shape}", sep="\n")


## 4. Model Training and Evaluation


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(
    f"Training set size: {X_train.shape[0]}",
    f"Test set size: {X_test.shape[0]}",
    sep="\n",
)


In [None]:
# Fit a linear regression on the training split (fit() returns the estimator)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict on both splits so training and test scores can be compared
y_train_pred, y_test_pred = (
    lr_model.predict(features) for features in (X_train, X_test)
)

# Assemble the R² / RMSE report and print it in one go
lr_report = [
    "Linear Regression Results:",
    f"Training R²: {r2_score(y_train, y_train_pred):.4f}",
    f"Test R²: {r2_score(y_test, y_test_pred):.4f}",
    f"Training RMSE: ${np.sqrt(mean_squared_error(y_train, y_train_pred)):,.2f}",
    f"Test RMSE: ${np.sqrt(mean_squared_error(y_test, y_test_pred)):,.2f}",
]
print("\n".join(lr_report))


In [None]:
# Fit a 100-tree random forest on the training split; the seed fixes the result
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on both splits so training and test scores can be compared
y_train_pred_rf, y_test_pred_rf = (
    rf_model.predict(features) for features in (X_train, X_test)
)

# Assemble the R² / RMSE report and print it in one go
rf_report = [
    "Random Forest Regression Results:",
    f"Training R²: {r2_score(y_train, y_train_pred_rf):.4f}",
    f"Test R²: {r2_score(y_test, y_test_pred_rf):.4f}",
    f"Training RMSE: ${np.sqrt(mean_squared_error(y_train, y_train_pred_rf)):,.2f}",
    f"Test RMSE: ${np.sqrt(mean_squared_error(y_test, y_test_pred_rf)):,.2f}",
]
print("\n".join(rf_report))


In [None]:
# Pair each feature name with the forest's importance score, highest first
feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': rf_model.feature_importances_,
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Horizontal bar chart with one bar per feature
bar_positions = range(len(feature_importance))
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, feature_importance['Importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_importance['Feature'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance (Random Forest)')
# Flip the y-axis so the first (most important) row is drawn at the top
ax.invert_yaxis()
fig.tight_layout()
plt.show()

print(feature_importance)


## 5. Predictions Visualization


In [None]:
# Side-by-side scatter plots of actual vs predicted test prices, one per model
plt.figure(figsize=(12, 5))

# Endpoints of the dashed red diagonal where predicted equals actual
price_range = [y_test.min(), y_test.max()]

panels = [
    ('Linear Regression: Actual vs Predicted', y_test_pred),
    ('Random Forest: Actual vs Predicted', y_test_pred_rf),
]
for panel_index, (panel_title, predicted) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.scatter(y_test, predicted, alpha=0.5)
    plt.plot(price_range, price_range, 'r--', lw=2)
    plt.xlabel('Actual Price')
    plt.ylabel('Predicted Price')
    plt.title(panel_title)
    # Plain tick labels keep both axes out of scientific notation
    plt.ticklabel_format(style='plain')

plt.tight_layout()
plt.show()
