## Project 1: House Price Prediction
### 1. Problem Definition
#### Predict house prices based on feature like size, location, and amenities. 

In [None]:
# We will use Boston data which is available in scikit-learn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# set random seed for reproducibility
np.random.seed(42)

In [None]:
# Load the california housing data
housing = fetch_california_housing()
data = pd.DataFrame(housing.data, columns=housing.feature_names)
data['PRICE'] = housing.target
#display(data)
print(f"Dataset shape: {data.shape}")
print(f"\nFeature names: {housing.feature_names}")
print(f"\nFirst 5 rows of our dataset: {data.head()}")
print(f"\nStatistical summary: {data.describe()}")

In [None]:
# Missing values -  NO MISSING VALUE
print(f"Missing values in each column: {data.isnull().sum()}")

In [None]:
# Visualize the distribution of house prices -HISTOGRAM
plt.figure(figsize=(10,6))
sns.histplot(data['PRICE'], kde=True, )
plt.title('Distribution of House Prices')
plt.xlabel('Prices (x $100k)')
plt.ylabel('Frequency')
plt.show()

In [None]:
#Correlation matrix to see relationship between features
correlation_matrix = data.corr()
plt.figure(figsize=(12,10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
#Split the data into features (X) and target (y)
X=data.drop('PRICE', axis=1)
y=data['PRICE']
# Split into tarining and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

In [None]:
#Train the model on training data
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
#Evaluate the model using test data
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Linear Regression Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# Let's see the coefficients to understand feature importance
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': linear_model.coef_
})
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)
print("\nFeature Coefficients:")
print(coefficients)

In [19]:
# Using LazyPredict to try multiple models quickly
import lazypredict
from lazypredict.Supervised import LazyRegressor

# Initialize LazyRegressor
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit and evaluate multiple models
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# Display performance comparison of various models
print("\nLazyPredict Model Comparison:")
print(models)

# Let's visualize the top performing models
plt.figure(figsize=(12, 6))
models_r2 = models.sort_values(by='R-Squared', ascending=False).head(10)
sns.barplot(x=models_r2.index, y=models_r2['R-Squared'])
plt.xticks(rotation=90)
plt.title('Top 10 Models by R-Squared')
plt.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'lazypredict'