# 1. Data Collection and Preparation

In [1]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import requests
from io import StringIO

In [2]:
# Read the FRED API key through google.colab's userdata helper instead of
# writing the key into the notebook.
from google.colab import userdata
api_key = userdata.get('api_key')

# Function to fetch data from FRED (CSV variant)
def fetch_fred_data(series_id, timeout=30):
    """Download one FRED series as CSV and return it as a DataFrame.

    NOTE: the next cell defines another ``fetch_fred_data`` with a
    three-argument, JSON-based signature. Once that cell runs, it replaces
    this definition in the notebook namespace.

    Parameters
    ----------
    series_id : str
        FRED series identifier, sent as the ``series_id`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The response body parsed as CSV, with the ``date`` column parsed as
        datetimes.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(
        'https://api.stlouisfed.org/fred/series/observations',
        # Passing params lets requests URL-encode the values; api_key is the
        # notebook-level variable, looked up when the function is called.
        params={'series_id': series_id, 'api_key': api_key, 'file_type': 'csv'},
        # Without a timeout a stalled connection would block the cell forever.
        timeout=timeout,
    )
    # Fail here with the HTTP status instead of later with an opaque CSV
    # parsing error on an error page.
    response.raise_for_status()
    return pd.read_csv(StringIO(response.text), parse_dates=['date'])

In [3]:
# Same API-key lookup as in the previous cell (In [2]).
from google.colab import userdata
api_key = userdata.get('api_key')

def fetch_fred_data(series_id, api_key, column_name, timeout=30):
    """Fetch one FRED series as JSON and return it as a two-column DataFrame.

    Parameters
    ----------
    series_id : str
        FRED series identifier, sent as the ``series_id`` query parameter.
    api_key : str
        FRED API key, sent as the ``api_key`` query parameter.
    column_name : str
        Name given to the numeric value column in the returned frame.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Two columns: ``date`` (converted with ``pd.to_datetime``) and
        ``column_name`` (numeric). Rows whose value cannot be parsed as a
        number are dropped; the surviving rows keep their original index.

    Raises
    ------
    ValueError
        If the HTTP status is not 200, the payload has no ``observations``
        entry, or the observations lack a ``date`` or ``value`` field.
    """
    response = requests.get(
        'https://api.stlouisfed.org/fred/series/observations',
        # Passing params lets requests URL-encode the values.
        params={'series_id': series_id, 'api_key': api_key, 'file_type': 'json'},
        # Without a timeout a stalled connection would block the cell forever.
        timeout=timeout,
    )
    if response.status_code != 200:
        raise ValueError(f"Error fetching data from FRED: {response.status_code}")

    payload = response.json()
    # Report an unexpected payload shape with the same exception type as the
    # other validation failures instead of a bare KeyError.
    if not isinstance(payload, dict) or 'observations' not in payload:
        raise ValueError("Missing 'observations' in the fetched data")
    df = pd.DataFrame(payload['observations'])

    # An empty observation list yields a frame with no columns, so these
    # checks also cover the "no data returned" case.
    if 'date' not in df.columns:
        raise ValueError("Missing 'date' column in the fetched data")
    if 'value' not in df.columns:
        raise ValueError("Missing 'value' column in the fetched data")

    df['date'] = pd.to_datetime(df['date'])
    # Convert to numeric, coercing anything unparseable to NaN ...
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    # ... then drop those rows so the returned column has no gaps.
    df = df.dropna(subset=['value'])

    return df[['date', 'value']].rename(columns={'value': column_name})


# Dictionary of series IDs and their descriptive column names
series_dict = {
    'CSUSHPINSA': 'Case_Shiller_Home_Price_Index',
    'UNRATE': 'Unemployment_Rate',
    'CPIAUCSL': 'Inflation',
    'GDP': 'Gross_Domestic_Product',
    'MORTGAGE30US': '30_Year_Fixed_Mortgage_Rate',
    'MEHOINUSA672N': 'Median_Household_Income',
    'HOUST': 'Housing_Starts',
    'POPTHM': 'Population',
    'FEDFUNDS': 'Interest_rates'
}

# Fetch each series once; every frame has a 'date' column plus one value column
dataframes = [
    fetch_fred_data(series_id, api_key, column_name)
    for series_id, column_name in series_dict.items()
]

# Merge all DataFrames on the 'date' column. The outer join keeps every date
# that appears in any series, leaving NaN where a series has no observation.
combined_df = dataframes[0]
for series_df in dataframes[1:]:
    combined_df = combined_df.merge(series_df, on='date', how='outer')

# Forward fill only carries the most recent earlier value if the rows are in
# chronological order, so sort explicitly rather than relying on whatever
# order the merge produced.
combined_df = combined_df.sort_values('date', kind='mergesort', ignore_index=True)

# Handle missing values by forward filling.
# NOTE(review): the fill is applied to every column, including the target
# 'Case_Shiller_Home_Price_Index'. If the series are published at different
# frequencies, the target is repeated on dates where only another series has
# an observation -- confirm this is intended before modelling.
combined_df = combined_df.ffill()

# After a forward fill the only NaNs left sit before a column's first
# observation; drop those rows so every remaining row is complete.
combined_df = combined_df.dropna()

# 2. Model Building

In [4]:
# Splitting the data into training and testing sets
# Features: every column except the date and the target index.
X = combined_df.drop(columns=['date', 'Case_Shiller_Home_Price_Index'])
# Target: the Case-Shiller home price index column.
y = combined_df['Case_Shiller_Home_Price_Index']

# 80/20 split; random_state pins the shuffle so the split is reproducible.
# NOTE(review): train_test_split shuffles rows by default, so training and test
# rows are interleaved in time. If combined_df contains forward-filled values,
# near-duplicate rows can land on both sides of the split and inflate the test
# scores -- consider a chronological split (shuffle=False) and confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (2892, 8)
Testing data shape: (724, 8)


## 1. Linear Regression Model

In [5]:
# Fit the linear regression baseline on the training split
linear_model = LinearRegression().fit(X_train, y_train)

# Score the held-out test split
y_pred = linear_model.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred)
rmse_linear = math.sqrt(mse_linear)  # square root of the MSE, i.e. in target units
r2_linear = r2_score(y_test, y_pred)

# Report the three metrics, one per line
print(
    f"Mean Squared Error: {mse_linear}",
    f"R-squared: {r2_linear}",
    f"Root Mean Squared Error: {rmse_linear}",
    sep="\n",
)

Mean Squared Error: 1363.1730054704492
R-squared: 0.6820660501606239
Root Mean Squared Error: 36.92117286152282


## 2. Random Forest Regression Model

In [6]:
# Fit a 100-tree random forest; the fixed seed keeps the fitted model reproducible
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out test split
y_pred = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred)
rmse_rf = math.sqrt(mse_rf)  # square root of the MSE, i.e. in target units
r2_rf = r2_score(y_test, y_pred)

# Report the three metrics, one per line
print(
    f"Mean Squared Error: {mse_rf}",
    f"R-squared: {r2_rf}",
    f"Root Mean Squared Error: {rmse_rf}",
    sep="\n",
)

Mean Squared Error: 1.1663734787178766
R-squared: 0.9997279657640017
Root Mean Squared Error: 1.0799877215588503


## 3. Decision Tree Regression Model

In [7]:
# Fit a single decision tree; the fixed seed keeps the fitted model reproducible
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out test split
y_pred_dt = dt_model.predict(X_test)
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = math.sqrt(mse_dt)  # square root of the MSE, i.e. in target units
r2_dt = r2_score(y_test, y_pred_dt)

# Report the three metrics, one per line
print(
    f"Mean Squared Error: {mse_dt}",
    f"R-squared: {r2_dt}",
    f"Root Mean Squared Error: {rmse_dt}",
    sep="\n",
)

Mean Squared Error: 1.500401243093923
R-squared: 0.9996500601965808
Root Mean Squared Error: 1.2249086672458167


## 4. Gradient Boosting Regression Model

In [8]:
# Fit a 100-stage gradient boosting model; the fixed seed keeps it reproducible
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out test split
y_pred_gb = gb_model.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = math.sqrt(mse_gb)  # square root of the MSE, i.e. in target units
r2_gb = r2_score(y_test, y_pred_gb)

# Report the three metrics, one per line
print(
    f"Mean Squared Error: {mse_gb}",
    f"R-squared: {r2_gb}",
    f"Root Mean Squared Error: {rmse_gb}",
    sep="\n",
)

Mean Squared Error: 1.1964166349100058
R-squared: 0.9997209587742245
Root Mean Squared Error: 1.0938083172613042


## 5. XGBoost Regression Model

In [9]:
# Build the XGBoost regressor (100 boosting rounds, fixed seed) and fit it on the training split
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Score the held-out test split
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = math.sqrt(mse_xgb)  # square root of the MSE, i.e. in target units
r2_xgb = r2_score(y_test, y_pred_xgb)

# Report the three metrics, one per line
print(
    f"Mean Squared Error: {mse_xgb}",
    f"R-squared: {r2_xgb}",
    f"Root Mean Squared Error: {rmse_xgb}",
    sep="\n",
)

Mean Squared Error: 1.6676592578760083
R-squared: 0.9996110504736266
Root Mean Squared Error: 1.2913788204380652


In [10]:
from tabulate import tabulate

# One row per fitted model, in the order the models were trained above.
# Each tuple is (display name, MSE, R-squared, RMSE).
results = [
    {
        "Model": name,
        "Mean Squared Error": mse,
        "R-squared": r2,
        "Root Mean Squared Error": rmse,
    }
    for name, mse, r2, rmse in (
        ("Linear Regression", mse_linear, r2_linear, rmse_linear),
        ("Random Forest Regression", mse_rf, r2_rf, rmse_rf),
        ("Decision Tree Regression", mse_dt, r2_dt, rmse_dt),
        ("Gradient Boosting Regression", mse_gb, r2_gb, rmse_gb),
        ("XGBoost Regression", mse_xgb, r2_xgb, rmse_xgb),
    )
]

# Render the comparison as a grid table; the dict keys become the column headers
print(tabulate(results, headers="keys", tablefmt="grid"))

+------------------------------+----------------------+-------------+---------------------------+
| Model                        |   Mean Squared Error |   R-squared |   Root Mean Squared Error |
+==============================+======================+=============+===========================+
| Linear Regression            |           1363.17    |    0.682066 |                  36.9212  |
+------------------------------+----------------------+-------------+---------------------------+
| Random Forest Regression     |              1.16637 |    0.999728 |                   1.07999 |
+------------------------------+----------------------+-------------+---------------------------+
| Decision Tree Regression     |              1.5004  |    0.99965  |                   1.22491 |
+------------------------------+----------------------+-------------+---------------------------+
| Gradient Boosting Regression |              1.19642 |    0.999721 |                   1.09381 |
+------------------------------+----------------------+-------------+---------------------------+
| XGBoost Regression           |              1.66766 |    0.999611 |                   1.29138 |
+------------------------------+----------------------+-------------+---------------------------+

## Conclusion

The Random Forest Regression model performs best, with the lowest RMSE (≈1.08) and the highest R² (≈0.9997), indicating it has the highest predictive accuracy and best fits the data among the models evaluated. Gradient Boosting is a very close second (RMSE ≈1.09). Decision Tree Regression (RMSE ≈1.22) and XGBoost (RMSE ≈1.29) are slightly less accurate but still perform well. Linear Regression, however, performs poorly compared to the other models (RMSE ≈36.9, R² ≈0.68).

# 3. Analyze the results

In [11]:
# Linear Model Coefficients
# Lift the row display limit so the printed table is never truncated
# (this pandas option stays in effect for the rest of the session).
pd.set_option('display.max_rows', None)

# Pair each feature name with the coefficient the fitted linear model assigned to it
coefficients = pd.DataFrame(
    dict(Feature=X_train.columns, Coefficient=linear_model.coef_)
)

# Largest coefficient first
print(coefficients.sort_values('Coefficient', ascending=False))

                       Feature  Coefficient
0            Unemployment_Rate    12.523903
7               Interest_rates     7.147636
3  30_Year_Fixed_Mortgage_Rate     0.421221
5               Housing_Starts     0.032224
2       Gross_Domestic_Product     0.031409
4      Median_Household_Income     0.002098
6                   Population     0.000250
1                    Inflation    -3.305662


## Analysis

**1. Unemployment Rate (12.523903):**

For each one percentage point increase in the unemployment rate, home prices are expected to increase by 12.52 units, assuming all other factors remain constant. This positive coefficient is counterintuitive, as higher unemployment is generally associated with lower home prices. It suggests there might be other underlying factors or multicollinearity influencing this result.  

**2. Interest Rates (7.147636):**

For each one percentage point increase in interest rates, home prices are expected to increase by 7.15 units, assuming all other factors remain constant. This positive relationship might also seem counterintuitive, as higher interest rates usually lead to higher mortgage costs and lower affordability, typically resulting in lower home prices. Again, this could indicate complex interactions or multicollinearity among the variables.

**3. 30-Year Fixed Mortgage Rate (0.421221):**

For each one percentage point increase in the 30-year fixed mortgage rate, home prices are expected to increase by 0.42 units, assuming all other factors remain constant. This positive coefficient is also unexpected as higher mortgage rates generally reduce housing demand and prices.

**4. Housing Starts (0.032224):**

For each one unit increase in housing starts, home prices are expected to increase by 0.03 units, assuming all other factors remain constant. This suggests a small positive impact, indicating that more new housing construction is slightly associated with higher home prices.

**5. Gross Domestic Product (GDP) (0.031409):**

For each one unit increase in GDP, home prices are expected to increase by 0.03 units, assuming all other factors remain constant. This positive relationship is expected, as a growing economy often leads to higher incomes and increased demand for housing, driving up prices.

**6. Median Household Income (0.002098):**

For each one unit increase in median household income, home prices are expected to increase by 0.002 units, assuming all other factors remain constant. This positive coefficient aligns with expectations, as higher incomes generally increase housing affordability and demand, leading to higher prices.

**7. Population (0.000250):**

For each one unit increase in population, home prices are expected to increase by 0.00025 units, assuming all other factors remain constant. This very small positive impact indicates that population growth is associated with higher home prices, although the effect size is minimal.

**8. Inflation (-3.305662):**

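For each one-point increase in the `Inflation` feature, home prices are expected to decrease by 3.31 units, assuming all other factors remain constant. Note that this feature is the consumer price index level (FRED series `CPIAUCSL`), not an inflation rate, so the unit is one index point rather than one percentage point. This negative relationship can be explained by the fact that higher inflation often leads to higher interest rates, which can reduce housing demand and prices.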


## Summary
- **Positive coefficients:** Unemployment Rate, Interest Rates, 30-Year Fixed Mortgage Rate, Housing Starts, GDP, Median Household Income, and Population. These indicate that as these factors increase, home prices tend to increase as well, though some of these relationships (such as those with the unemployment rate and interest rates) are counterintuitive and may be influenced by underlying factors or multicollinearity.
- **Negative coefficient:** Inflation. This suggests that higher inflation is associated with lower home prices, which may be due to the impact of inflation on interest rates and overall economic conditions.

In [12]:
# Feature Importances from Random Forest
# Pair each feature name with the importance score reported by the fitted forest
feature_importances = pd.DataFrame(
    dict(Feature=X_train.columns, Importance=rf_model.feature_importances_)
)

# Most important feature first
print(feature_importances.sort_values('Importance', ascending=False))

                       Feature  Importance
2       Gross_Domestic_Product    0.707402
1                    Inflation    0.242806
6                   Population    0.042090
4      Median_Household_Income    0.005996
0            Unemployment_Rate    0.001133
7               Interest_rates    0.000382
3  30_Year_Fixed_Mortgage_Rate    0.000141
5               Housing_Starts    0.000050


## Interpretation

**1. Gross Domestic Product (GDP) - 0.707402:**

* **Most Significant Predictor:** GDP is by far the most significant predictor of home prices, with an importance score of 0.707402. This suggests that as the overall economic health (measured by GDP) improves, home prices are likely to increase.

**2. Inflation - 0.242806:**

* **Second Most Significant Predictor:** Inflation also plays a substantial role in predicting home prices. As inflation rises, it can lead to higher home prices due to the increased cost of goods and services, including construction materials and labor.

**3. Population - 0.042090:**

* **Moderately Significant Predictor:** Population growth impacts demand for housing. As population increases, demand for homes typically rises, which can drive up prices.

**4. Median Household Income - 0.005996:**

* **Less Significant Predictor:** Median household income has some impact on home prices, reflecting that higher incomes can support higher home prices, but its influence is relatively small compared to GDP and inflation.

**5. Unemployment Rate - 0.001133:**

* **Minor Predictor:** The unemployment rate has a minimal effect on home prices in this model. While higher unemployment can reduce demand for housing, its impact is not as significant as other factors.

**6. Interest Rates - 0.000382:**

* **Negligible Predictor:** Interest rates have a very small importance score. While interest rates affect mortgage affordability and housing demand, other factors in this model appear to have a much stronger influence.

**7. 30-Year Fixed Mortgage Rate - 0.000141:**

* **Negligible Predictor:** Similar to general interest rates, the specific 30-year fixed mortgage rate has a minimal impact in this model.

**8. Housing Starts - 0.000050:**

* **Least Significant Predictor:** Housing starts, or the number of new residential construction projects, have the least influence on home prices in this model. This might be due to the lag between new constructions and their effect on the overall housing market.

## Conclusion
The feature importance analysis from the Random Forest model highlights that Gross Domestic Product (GDP) is the most significant predictor of home prices, followed by Inflation and Population. These factors have the greatest influence on home prices, whereas features like Interest Rates and Housing Starts have minimal impact in comparison. Understanding these relationships can help in making informed decisions about the housing market and predicting future trends.