In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mn

#preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.exceptions import NotFittedError
import sklearn
#models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import NuSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from lazypredict import Supervised
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso, Ridge


# machine learning library
from sklearn.linear_model import LinearRegression

# metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import(recall_score, accuracy_score, f1_score, precision_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve, auc)

#SHAP explainer
import shap
# Ensure your pipeline is defined
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
#LIME
import lime
import lime.lime_tabular

#widgets and display
import ipywidgets as widgets
from IPython.display import display

#utilities
import pickle
import os
from tqdm import tqdm

# Load Dataset

In [None]:
# Location of the input CSV, relative to the notebook's working directory.
file_path = 'data/top_10_features_dataset.csv'

# Load the whole file into the DataFrame that the rest of the notebook uses.
df = pd.read_csv(file_path)

# Preview the first rows as a quick sanity check of the load.
df.head()

# EDA

In [None]:
# Basic shape and schema of the loaded frame: rows, columns, per-column dtype.
num_record, num_features = df.shape
data_types = df.dtypes

summary_lines = (
    f'Number of records: {num_record}',
    f'\nNumber of features {num_features}',
    f'\nData types: \n{data_types}',
)
print(*summary_lines, sep='\n')

In [None]:
# Visualize missing values: draw missingno's matrix plot for every column of df.
mn.matrix(df, figsize=(10,5), width_ratios=(5,1), fontsize=12)

In [None]:
# Count fully duplicated rows and the number of NaN entries per column.
duplicated_rows = df.duplicated().sum()
missing_values = df.isna().sum()

print(f'Number of duplicated rows: {duplicated_rows}')
print(f'\nNumber of missing values: \n{missing_values}')

In [None]:
# Summary statistics as returned by DataFrame.describe() with default arguments.
statistics = df.describe()
print(f'\nStatistics: {statistics}')

In [None]:
def check_column_values(df):
    """Print the distinct values found in every column of ``df``.

    For each column a header line is printed, followed by the array returned
    by ``Series.unique()`` and a blank separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        The function only writes to standard output.
    """
    for column in df.columns:
        distinct_values = df[column].unique()
        # One print per column: header, values, then the '\n' separator.
        print(
            f'Unique Values in {column} column:',
            distinct_values,
            '\n',
            sep='\n',
        )

# Print the distinct values of every column in the loaded dataset.
check_column_values(df)

In [None]:
# Restrict to numeric columns, then compute their pairwise correlation matrix.
num_cols = df.select_dtypes(include='number')
corr = num_cols.corr()

# Annotated heatmap of the correlation matrix (two-decimal cell labels).
heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
sns.heatmap(corr, **heatmap_options)
plt.show()

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
target_corr = corr['SalePrice'].sort_values(ascending=False)

# Display top 10 features.
# The target is removed by label instead of by position: a positional slice
# ([1:11]) silently assumes 'SalePrice' sorts first, which breaks if another
# column ties with it at a correlation of 1.0.
top_10_features = target_corr.drop(labels='SalePrice').head(10)
print(top_10_features)

| Size of Correlation | Interpretation                        |
|---------------------|----------------------------------------|
| .90 to 1.00 (-.90 to -1.00) | Very high positive (negative) correlation |
| .70 to .90 (-.70 to -.90)   | High positive (negative) correlation      |
| .50 to .70 (-.50 to -.70)   | Moderate positive (negative) correlation  |
| .30 to .50 (-.30 to -.50)   | Low positive (negative) correlation       |
| .00 to .30 (.00 to -.30)    | Negligible correlation                    |




### Interpretation

1. **SqFtTotLiving (0.51):**
   - **Moderate positive correlation:** The total living space has a moderately strong relationship with sale price. As living space increases, sale price tends to increase.

2. **Latitude (0.41):**
   - **Low positive correlation:** Latitude has a weak positive relationship with sale price. As latitude increases, sale price tends to increase slightly.

3. **SqFt2ndFloor (0.37):**
   - **Low positive correlation:** The second floor's square footage has a weak positive relationship with sale price. As the second floor's size increases, sale price tends to increase slightly.

In [None]:
# Strip leading/trailing whitespace from every column header.
df.columns = df.columns.map(str.strip)

# check updated column names
print(df.columns)

# Drop exact duplicate rows.
df = df.drop_duplicates()

# check for duplicates
remaining_duplicates = df.duplicated().sum()
print(f'\nnumber of duplicate rows: {remaining_duplicates}')

# 2. Data Visualization

In [None]:
# Data overview: one 10-bin histogram per column that DataFrame.hist() plots.
df.hist(figsize=(15,12), bins=10, grid=False)

In [None]:
feature = 'SalePrice'

# Two stacked panels sharing the x-axis: histogram with KDE on top, box plot below.
fig, axes = plt.subplots(nrows=2, figsize=(6,6), sharex=True)
hist_ax, box_ax = axes

sns.histplot(df[feature], bins='auto', kde=True, ax=hist_ax)
hist_ax.tick_params(axis='x', rotation=45)
hist_ax.set_title(f'Histogram of {feature}')

sns.boxplot(data=df, x=feature, ax=box_ax)

# remove grids from both panels
for panel in axes:
    panel.grid(False)

plt.tight_layout()
plt.show()

### Observations
- The distribution of sale prices is right-skewed, meaning that most of the sale prices are concentrated on the lower end, with a long tail extending to the right.
- There is a peak around $500,000, indicating that this price range has the highest frequency of sales.
- The highest frequency of sale prices falls between $400,000 and $600,000, indicating that most houses are sold within this range.

In [None]:
# Plot the original (untransformed, unfiltered) distribution of SalePrice
# as a 50-bin histogram.
plt.figure(figsize=(10, 6))
plt.hist(df['SalePrice'], bins=50, color='blue', alpha=0.7)
plt.title('Original SalePrice Distribution')
plt.xlabel('SalePrice')
plt.ylabel('Frequency')
plt.show()

# Scatter Plots and Regression Lines for Features vs Sale Price

In [None]:
numerical_features = ['SqFtTotLiving', 'SqFt2ndFloor','SqFtOpenPorch', 'SqFtFinBasement']
sale_price = df['SalePrice']

# One figure per feature: raw points plus a red fitted regression line.
for feature in numerical_features:
    feature_values = df[feature]
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=feature_values, y=sale_price)
    # scatter=False: regplot draws only the fitted line on top of the points.
    sns.regplot(x=feature_values, y=sale_price, scatter=False, color='red')
    plt.title(f'Sale Price vs {feature}')
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()

## Interpretation

#### Sale Price vs SqFtTotLiving 🏠
- **Interpretation**: The scatter plot shows a moderate positive correlation (r ≈ 0.51) between the total living area (SqFtTotLiving) and the sale price. As the living area increases, the sale price also tends to increase. The red regression line reinforces this positive trend, indicating that larger living spaces are generally associated with higher sale prices.

#### Sale Price vs SqFt2ndFloor 🏢
- **Interpretation**: The scatter plot suggests a low positive correlation (r ≈ 0.37) between the second floor area (SqFt2ndFloor) and the sale price. Homes with more second-floor space tend to have somewhat higher sale prices. However, there is a significant number of homes with zero second-floor space, showing that many homes do not have a second floor. The regression line shows an upward trend.

#### Sale Price vs SqFtOpenPorch 🌞
- **Interpretation**: There is a weak positive correlation between the open porch area (SqFtOpenPorch) and the sale price. Homes with larger open porch areas tend to have slightly higher sale prices, but the relationship is not as strong as with living area or second-floor space. The regression line indicates a positive but less pronounced trend.

#### Sale Price vs SqFtFinBasement 🏡
- **Interpretation**: The scatter plot shows a weak to moderate positive correlation between the finished basement area (SqFtFinBasement) and the sale price. Homes with more finished basement space tend to have higher sale prices, but the relationship is not very strong. The regression line shows an upward trend, suggesting that finished basements do add value to homes.



## Removing Outliers

In [None]:
# 99th percentile of SalePrice, used below to inspect the most expensive sales.
max_threshold = df['SalePrice'].quantile(0.99)
max_threshold

In [None]:
# Rows whose SalePrice exceeds the current upper threshold.
df[df['SalePrice']>max_threshold]

In [None]:
# 1st percentile of SalePrice, used below to inspect the cheapest sales.
min_threshold = df['SalePrice'].quantile(0.01)
min_threshold

In [None]:
# Rows whose SalePrice falls below the current lower threshold.
df[df['SalePrice']<min_threshold]

In [None]:
# Recompute both thresholds in one call; this OVERWRITES the 1% / 99% values
# computed in the cells above with the 0.001% and 95.7% quantiles.
# NOTE(review): the rationale for these two specific cut-offs is not stated in
# the notebook — confirm and consider naming them as constants.
min_threshold, max_threshold = df['SalePrice'].quantile([0.00001, 0.957])
min_threshold, max_threshold

In [None]:
# Keep only rows strictly between the two thresholds (both bounds exclusive).
df2 = df[(df['SalePrice'] < max_threshold) & (df['SalePrice'] > min_threshold)]

In [None]:
# Show 10 random filtered rows; no random_state is passed, so the sample
# differs between runs.
df2.sample(10)

In [None]:
# Transposed summary statistics (one row per column).
# NOTE(review): this describes the unfiltered `df`, not the filtered `df2`
# created above — confirm which frame was intended.
df.describe().T

In [None]:
# Set the seaborn style before the axes are created; applied afterwards it has
# no effect on axes that already exist.
sns.set_style('white')
fig, axes = plt.subplots(nrows=2, figsize=(6,8), sharex=False)

#histogram of the filtered sale prices
sns.histplot(df2['SalePrice'], bins='auto', kde=True, ax=axes[0])
# `rotation` is an alias of `labelrotation`; passing both (90 and 45) was
# contradictory, so a single 45-degree label rotation is kept.
axes[0].tick_params(axis='x', labelrotation=45)

#box plot of the filtered sale prices
sns.boxplot(data=df2, x='SalePrice', ax=axes[1])
# The original referenced plt.tight_layout without calling it, so the layout
# was never adjusted.
plt.tight_layout()
plt.show()

### Addressing Skewness in dataset

In [None]:
# Add the natural log of SalePrice as a new column to reduce right skew.
# df2 was produced by boolean-filtering df, so assigning a column in place
# triggers pandas' SettingWithCopyWarning; assign() returns a new frame
# instead and keeps this cell safe to re-run.
df2 = df2.assign(LogSalePrice=np.log(df2['SalePrice']))

sns.histplot(df2['LogSalePrice'], kde=True)
plt.show()

In [None]:
# Number of fully duplicated rows remaining in df.
df.duplicated().sum()

# 3. Machine Learning


In [None]:
# Split data into target (y) and features (X).
y = df2['LogSalePrice']
# Drop the raw price together with its log: y is log(SalePrice), so leaving
# 'SalePrice' among the features leaks the target into the model inputs.
X = df2.drop(columns=['SalePrice', 'LogSalePrice'])

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
from sklearn.ensemble import IsolationForest
# Use Isolation Forest to flag outliers in the training features.
# random_state is fixed so the same rows are flagged on every run; the forest
# is randomized and was previously unseeded.
iso = IsolationForest(contamination=0.1, random_state=42)
yhat = iso.fit_predict(X_train)
# Rows labelled -1 are treated as outliers and masked out.
mask = yhat != -1

# Apply the mask to features and target so they stay row-aligned.
X_train_clean = X_train[mask]
y_train_clean = y_train[mask]
# Backward-compatible alias for the original misspelled variable name.
y_train_clearn = y_train_clean
# NOTE(review): the scaling and training cells that follow still use
# X_train / y_train, so this cleaned split is currently unused — confirm intent.

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Exclude QuantileRegressor from the set of estimators LazyRegressor will try.
# The exclusion is done by name and is idempotent: the original list.remove()
# raised ValueError when this cell was run a second time, and it relied on the
# private module path sklearn.linear_model._quantile.
EXCLUDED_REGRESSOR = "QuantileRegressor"
if EXCLUDED_REGRESSOR not in Supervised.removed_regressors:
    Supervised.removed_regressors.append(EXCLUDED_REGRESSOR)
# Slice assignment mutates the existing list object in place, as remove() did.
Supervised.REGRESSORS[:] = [
    entry for entry in Supervised.REGRESSORS if entry[0] != EXCLUDED_REGRESSOR
]
lazy_reg = Supervised.LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)

In [None]:


# Fit LazyRegressor on the scaled data.
# fit() returns two objects, bound here to `models` and `predictions`.
# NOTE(review): the name `models` is rebound to a dict of estimators in the
# hyperparameter-tuning section, so this result is lost after that cell runs.
models, predictions = lazy_reg.fit(X_train_scaled, X_test_scaled, y_train, y_test)

# Display the results
print(models)

In [None]:
# Keep a separate reference to the LazyRegressor results table.
models_df = models

# The 20 models with the highest R-Squared, best first.
top_20_models = models_df.sort_values(by='R-Squared', ascending=False).head(20)
top_20_models

# Hyperparameter Tuning 

In [None]:
def evaluate_regression(y_true, y_pred):
    """Compute four standard regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, r2)`` in that order, where ``rmse`` is the square
        root of ``mse``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        squared_error,
        np.sqrt(squared_error),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

In [None]:
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.svm import SVR

In [None]:
# Candidate regressors, keyed by the name used to look up their search grid.
models = dict(
    ElasticNet=ElasticNet(),
    BayesianRidge=BayesianRidge(),
    SVR=SVR(),
)

# Search grids per model. Every key carries the `regressor__` prefix so that it
# addresses the pipeline step named 'regressor'.
param_grid = dict(
    ElasticNet={
        'regressor__alpha': [0.1, 1.0, 10.0, 100.0],  # Regularization strength
        'regressor__l1_ratio': [0.1, 0.5, 0.7, 0.9, 1.0],  # Balance between L1 and L2 regularization
    },
    SVR={
        'regressor__C': [0.1, 1, 10],
        'regressor__epsilon': [0.01, 0.1, 0.2],
        'regressor__kernel': ['linear', 'rbf'],
    },
    # All four BayesianRidge hyperparameters share the same candidate values.
    BayesianRidge={
        f'regressor__{hyperparameter}': [1e-6, 1e-5, 1e-4]
        for hyperparameter in ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')
    },
)

In [None]:
# Module-level result holders filled by train_and_evaluate_models().
# They are reset at the start of every call (see below).
metrics_list = []
best_models = {}

def train_and_evaluate_models(X_train, y_train, X_test, y_test, preprocessor):
    """Grid-search every model in ``models`` and collect train/test metrics.

    For each entry of the module-level ``models`` dict a pipeline is built,
    tuned with 5-fold ``GridSearchCV`` (scored by R²) over
    ``param_grid[model_name]``, and the refitted best estimator is evaluated
    on both splits with ``evaluate_regression``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and target.
    X_test, y_test : array-like
        Hold-out features and target, used only for evaluation.
    preprocessor : transformer or None
        Transformer placed in front of the regressor inside the pipeline, so
        it is fitted on each cross-validation training fold. Pass ``None`` to
        train on the features as given. (This argument used to be ignored.)

    Returns
    -------
    tuple
        ``(metrics_df, best_models)``: a DataFrame with one row of metrics per
        model, and a dict mapping model name to its best fitted pipeline.
    """
    # Reset the shared containers so repeated calls do not accumulate
    # duplicate rows from earlier runs.
    metrics_list.clear()
    best_models.clear()

    for model_name, model in tqdm(models.items(), desc='Training Models'):
        # The step name 'regressor' must match the prefix used in param_grid.
        steps = []
        if preprocessor is not None:
            steps.append(('preprocessor', preprocessor))
        steps.append(('regressor', model))
        model_pipe = Pipeline(steps=steps)

        # Perform GridSearchCV
        grid_search = GridSearchCV(
            estimator=model_pipe,
            param_grid=param_grid[model_name],
            cv=5,
            n_jobs=-1,
            scoring='r2')

        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_models[model_name] = best_model

        # Predictions
        train_pred = best_model.predict(X_train)
        test_pred = best_model.predict(X_test)

        # Evaluate regression models
        train_mse, train_rmse, train_mae, train_r2 = evaluate_regression(y_train, train_pred)
        test_mse, test_rmse, test_mae, test_r2 = evaluate_regression(y_test, test_pred)

        # Save metrics
        row = {
            'Model Used': model_name,
            'Training MSE': train_mse,
            'Training RMSE': train_rmse,
            'Training MAE': train_mae,
            'Training R²': train_r2,
            'Testing MSE': test_mse,
            'Testing RMSE': test_rmse,
            'Testing MAE': test_mae,
            'Testing R²': test_r2,
            'Best Params': grid_search.best_params_
        }
        metrics_list.append(row)

    # Convert the metrics into a Dataframe
    metrics_df = pd.DataFrame(metrics_list)
    return metrics_df, best_models

In [None]:
# Tune and evaluate every candidate model on the unscaled split, passing the
# StandardScaler instance as the `preprocessor` argument.
metrics_df, best_models = train_and_evaluate_models(X_train, y_train, X_test, y_test, scaler)
metrics_df

In [None]:
# Create a function to display the evaluation results.
def display_evaluation_results(metrics_df):
    """Draw a grouped bar chart of training vs. testing R² for each model.

    Parameters
    ----------
    metrics_df : pandas.DataFrame
        Must contain the columns 'Model Used', 'Training R²' and 'Testing R²'.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``.
    """
    r2_columns = ['Training R²', 'Testing R²']
    r2_by_model = metrics_df.set_index('Model Used')[r2_columns]
    r2_by_model.plot(kind='bar', figsize=(12,8))

    plt.title('Model Comparison')
    plt.xlabel('Model')
    plt.ylabel('Score')
    # Keep model names horizontal.
    plt.xticks(rotation=0)
    plt.legend(loc='best')
    plt.show()

# Plot the R² comparison for the models evaluated above.
display_evaluation_results(metrics_df)

### Key Findings

#### 1. ExtraTree
- **Training Performance**: The model performs very well on the training set (R² = 0.98) but less so on the test set (R² = 0.73). This train–test gap, rather than the absolute MSE and RMSE values, is what indicates overfitting.
- **Testing Performance**: Although R² is relatively high, the high MSE and RMSE on the test set suggest variability in predictions.

#### 2. XGBoost
- **Training Performance**: Shows excellent fit on the training data (R² = 0.95).
- **Testing Performance**: Good performance on the test data (R² = 0.75). XGBoost balances performance between training and testing data, making it a reliable model.

#### 3. ElasticNet
- **Training Performance**: Moderate performance on the training set (R² = 0.47).
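- **Testing Performance**: Moderate performance on the test set (R² = 0.60). Because the test score is higher than the training score, the model is underfitting rather than overfitting, indicating it might not capture the underlying patterns effectively.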

#### 4. SVR
- **Training Performance**: Weak fit on the training data (R² = 0.31).
- **Testing Performance**: Poor performance on the test set (R² = 0.29). Similarly low scores on both sets suggest underfitting or model inadequacy rather than overfitting.

#### 5. GradientBoosting
- **Training Performance**: Good fit on the training data (R² = 0.89).
- **Testing Performance**: Strong performance on the test set (R² = 0.75). This model shows a good balance, similar to XGBoost, with lower test errors.

#### 6. Lasso
- **Training Performance**: Moderate fit on the training data (R² = 0.49).
- **Testing Performance**: Below-average performance on the test set (R² = 0.45), indicating it might struggle with capturing complex patterns.

#### 7. Ridge
- **Training Performance**: Similar to Lasso with moderate fit (R² = 0.49).
- **Testing Performance**: Same as Lasso (R² = 0.45). This indicates that regularization techniques like Ridge and Lasso might need further tuning.

#### 8. AdaBoost
- **Training Performance**: Lower fit on the training data (R² = 0.59).
- **Testing Performance**: Moderate performance on the test set (R² = 0.57). While not the best, it shows reasonable generalization capabilities.

### Best Performing Models
- **XGBoost and GradientBoosting** stand out as the best models based on the Testing R² values (both around 0.75) and relatively lower testing errors. These models demonstrate a good balance between bias and variance, making them suitable for housing price prediction.




## Learning Curves 

In [None]:
#Plot the learning curves
def plot_learning_curve(estimator, X, y, cv=5, n_jobs=None, train_sizes = np.linspace(0.1, 1.0, 5), scoring='r2'):
    """Plot mean training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : estimator object
        Model passed to ``sklearn.model_selection.learning_curve``.
    X, y : array-like
        Features and target used to build the curve.
    cv : int, default 5
        Cross-validation setting forwarded to ``learning_curve``.
    n_jobs : int or None, default None
        Parallelism setting forwarded to ``learning_curve``.
    train_sizes : array-like, default ``np.linspace(0.1, 1.0, 5)``
        Training-set sizes forwarded to ``learning_curve``.
    scoring : str, default 'r2'
        Scoring metric forwarded to ``learning_curve``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring)

    # Average the per-fold scores for each training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    plt.figure(figsize=(10,6))
    plt.plot(train_sizes, train_scores_mean, label='Training Scores')
    plt.plot(train_sizes, test_scores_mean, label='Testing Scores')
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.title('Learning Curve')
    plt.legend(loc='best')
    plt.grid(False)
    # The original referenced plt.show without calling it, so the figure was
    # not rendered at this point (e.g. right after the caller's printed heading).
    plt.show()

    

# Call the learning curve function for each tuned model, using the training
# split only (learning_curve performs its own cross-validation).
for model_name, best_model in best_models.items():
    print(f'Learning CUrve for {model_name}')
    plot_learning_curve(best_model, X_train, y_train)

In [None]:
shap.initjs()

# Train the regression model to be explained.
# The original assigned `model = gbr`, but `gbr` is never defined in this
# notebook, so the cell raised NameError on a fresh kernel. A seeded
# GradientBoostingRegressor (already imported at the top) is created instead.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Create a SHAP explainer, using the training features as background data
explainer = shap.Explainer(model, X_train)

# Calculate SHAP values for the hold-out rows
shap_values = explainer(X_test)

# Plot SHAP summary as a bar chart
shap.summary_plot(shap_values, X_test, plot_type="bar", feature_names=X.columns)

In [None]:
# Bar plot of the SHAP values computed for the test set.
shap.plots.bar(shap_values)

In [None]:


# Force plot for a single prediction: the first row of the test-set SHAP values.
shap.plots.force(shap_values[0])