In [506]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import datetime as dt
import hvplot.pandas
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
import pprint as pprint
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


In [507]:
# Reading in the provided CSV file into a pandas DataFrame
df = pd.read_csv("C:/Users/perry/OneDrive/Desktop/MyRepos/project4-team1/data_files/Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")

# Display the first few rows of the DataFrame to ensure it's loaded correctly
df.head()


Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,2022-10-31,2022-11-30,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31
0,102001,0,United States,country,,115215.060987,115417.693632,115668.835076,116207.361953,116827.017306,...,342893.791055,342019.27867,340897.172975,340243.409003,340186.438102,341377.341047,342529.876118,344118.510057,346019.054142,348126.180542
1,394913,1,"New York, NY",msa,NY,188937.406428,189741.459747,190551.713646,192192.72184,193888.790935,...,599230.535882,597398.651335,594922.99337,594887.333828,596112.580353,600050.039046,603155.817763,607323.86365,612506.387816,618653.073765
2,753899,2,"Los Angeles, CA",msa,CA,223645.274156,224472.515292,225577.80352,227796.103768,230229.751861,...,901388.388599,897672.54046,893323.330406,884961.557044,874796.651896,867473.497311,865273.506397,868900.838953,876737.86096,889296.983779
3,394463,3,"Chicago, IL",msa,IL,144553.677041,144690.734658,144948.658674,145591.296092,146362.956233,...,295984.602121,295278.942974,294593.348603,295177.6666,296265.50863,298199.210375,299544.045369,301304.103648,303807.012258,306516.933358
4,394514,4,"Dallas, TX",msa,TX,129710.33478,129775.242966,129848.594994,130032.097809,130266.53024,...,380982.32443,379062.994766,376514.69815,373917.253527,371811.542961,371082.877106,370740.713883,371147.729712,372189.451235,373555.542026


In [508]:
# List of cities to filter
cities = ["Austin, TX", "El Paso, TX", "Dallas, TX", "Houston, TX", "San Antonio, TX"]

# Filter for the cities of interest
selected_cities_df = df[df['RegionName'].isin(cities)]

# Convert the dataframe to have dates as rows and cities as columns
melted_df = selected_cities_df.melt(id_vars=["RegionName"], 
                                    value_vars=selected_cities_df.columns[7:], 
                                    var_name="Date", 
                                    value_name="ZHVI")

# Convert the 'Date' column to datetime type and filter for dates from 2018 onwards
melted_df['Date'] = pd.to_datetime(melted_df['Date'])
melted_df = melted_df[melted_df['Date'] >= '2000-01-01']
melted_df_18 = melted_df[melted_df['Date'] >= '2018-01-01']

# Pivot the dataframe to have dates as the index and cities as columns
pivot_df = melted_df.pivot(index='Date', columns='RegionName', values='ZHVI')

pivot_df_18 = melted_df_18.pivot(index='Date', columns='RegionName', values='ZHVI')


# Display the first few rows of the processed dataframe
pivot_df.head()


RegionName,"Austin, TX","Dallas, TX","El Paso, TX","Houston, TX","San Antonio, TX"
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-03-31,171884.569134,129848.594994,93406.463524,117792.538671,108402.254879
2000-04-30,172693.585774,130032.097809,93442.396478,117837.439149,108551.915169
2000-05-31,173303.117323,130266.53024,93487.924704,117880.213666,108064.245052
2000-06-30,173575.729427,130500.079302,93533.806108,118061.437933,107379.976616
2000-07-31,173682.480704,130744.571369,93540.520729,118239.445943,106617.696119


In [509]:
# Create a dictionary to hold dataframes for each unique RegionName
region_dfs = {}

# Split the melted_df based on unique RegionName values
for region in melted_df_18['RegionName'].unique():
    region_dfs[region] = melted_df_18[melted_df_18['RegionName'] == region].reset_index(drop=True)


In [510]:
Austin_df = region_dfs["Austin, TX"]
Austin_df.head()

Unnamed: 0,RegionName,Date,ZHVI
0,"Austin, TX",2018-01-31,305316.204598
1,"Austin, TX",2018-02-28,306334.542529
2,"Austin, TX",2018-03-31,307472.851125
3,"Austin, TX",2018-04-30,308408.106838
4,"Austin, TX",2018-05-31,309389.960284


In [511]:
Dallas_df = region_dfs["Dallas, TX"]
Dallas_df.head()

Unnamed: 0,RegionName,Date,ZHVI
0,"Dallas, TX",2018-01-31,242891.968361
1,"Dallas, TX",2018-02-28,244734.313449
2,"Dallas, TX",2018-03-31,246523.677587
3,"Dallas, TX",2018-04-30,248125.31175
4,"Dallas, TX",2018-05-31,249649.21335


In [512]:
San_Antonio_df = region_dfs["San Antonio, TX"]
San_Antonio_df.head()

Unnamed: 0,RegionName,Date,ZHVI
0,"San Antonio, TX",2018-01-31,190361.404635
1,"San Antonio, TX",2018-02-28,191388.501102
2,"San Antonio, TX",2018-03-31,192302.453686
3,"San Antonio, TX",2018-04-30,192982.207645
4,"San Antonio, TX",2018-05-31,193669.646536


In [513]:
El_Paso_df = region_dfs["El Paso, TX"]
El_Paso_df.head()

Unnamed: 0,RegionName,Date,ZHVI
0,"El Paso, TX",2018-01-31,130852.579595
1,"El Paso, TX",2018-02-28,131095.292025
2,"El Paso, TX",2018-03-31,131245.16359
3,"El Paso, TX",2018-04-30,131331.111332
4,"El Paso, TX",2018-05-31,131560.095513


In [514]:
Houston_df = region_dfs["Houston, TX"]
Houston_df.head()

Unnamed: 0,RegionName,Date,ZHVI
0,"Houston, TX",2018-01-31,203501.279445
1,"Houston, TX",2018-02-28,204056.924777
2,"Houston, TX",2018-03-31,204818.959046
3,"Houston, TX",2018-04-30,205647.961369
4,"Houston, TX",2018-05-31,206682.977759


In [515]:
combined_df = pd.concat([Austin_df, Dallas_df, Houston_df, San_Antonio_df, El_Paso_df], axis=0)
combined_df.reindex()

Unnamed: 0,RegionName,Date,ZHVI
0,"Austin, TX",2018-01-31,305316.204598
1,"Austin, TX",2018-02-28,306334.542529
2,"Austin, TX",2018-03-31,307472.851125
3,"Austin, TX",2018-04-30,308408.106838
4,"Austin, TX",2018-05-31,309389.960284
...,...,...,...
62,"El Paso, TX",2023-03-31,203126.801825
63,"El Paso, TX",2023-04-30,204405.531273
64,"El Paso, TX",2023-05-31,205893.729025
65,"El Paso, TX",2023-06-30,207079.931239


In [516]:
#convert the "Date column to datetime format"
    # Raw date data 
Austin_df['Date'] = pd.to_datetime(Austin_df['Date'])
    # Date  data converted into an ordinal value 
Austin_df['ordinal_date'] = Austin_df['Date'].apply(lambda x: x.toordinal())
    # Index used as X value


# Split the data into training and testing sets

X = Austin_df['ordinal_date'].values.reshape(-1, 1)
y = Austin_df['ZHVI'].values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

In [517]:
# Initialize and fit the data to the chosen model "IE Linear Regression"
model = LinearRegression()
model.fit(X_train,y_train)

In [518]:
# Create the predictions variable using .predict
predictions = model.predict(X_test)

# print both the predictions and y_test as refereence.
# print(X)

In [519]:
# Print Austin filtered data with the dates as the x value, and the ZHVI as the y value
Austin_Scatter = Austin_df.hvplot.scatter(
    x = 'Date',
    y = 'ZHVI')
Austin_Scatter


In [520]:
# Display the slope
print(f"Model's slope: {model.coef_}")

# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's slope: [[115.09351766]]
Model's y-intercept: [-84519604.0544954]
Model's formula: y = [-84519604.0544954] + [115.09351766]X


In [521]:
# Display the formula to predict the salary for a person with 7 years of experience
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * 7")

# Predict the salary for a person with 7 years of experience
y_7 = model.intercept_ + model.coef_[0] * 736725

# Display the prediction
print(f"Predicted salary for a person with 7 years of experience: ${y_7}")

Model's formula: y = [-84519604.0544954] + [115.09351766] * 7
Predicted salary for a person with 7 years of experience: $[272667.74686472]


In [522]:
Austin_df['Predictions'] = model.intercept_ + model.coef_[0] * Austin_df['ordinal_date'].astype(int)
Austin_Linear = Austin_df.hvplot.line(x='Date', y='Predictions', color='Red')
Austin_Linear

In [523]:
(Austin_Scatter*Austin_Linear)

# Linear Regression All cities
##### "Data modeled from last five years"

In [524]:
def save_city_plots(city_df, city_name, slope, intercept):
    plt.figure(figsize=(12, 6))
    
    # Scatter plot of the actual data
    plt.scatter(city_df['Date'], city_df['ZHVI'], color='blue', label='Actual ZHVI')
    
    # Linear regression line
    plt.plot(city_df['Date'], city_df['Predictions'], color='red', linestyle='-', label='Predicted ZHVI')
    
    # Annotate with the regression formula
    formula_text = f"y = {intercept[0]:.2f} + {slope[0][0]:.2f}X"
    plt.text(0.05, 0.95, formula_text, transform=plt.gca().transAxes, fontsize=12, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='red', alpha=0.5))
    
    # Title, labels, and legend
    plt.title(f"{city_name} ZHVI Over Time")
    plt.xlabel('Date')
    plt.ylabel('ZHVI')
    plt.legend(loc='lower right')
    
    # Save the figure
    output_path = os.path.join(output_dir, f"{city_name.replace(' ', '_').replace(',', '')}_plot.png")
    plt.savefig(output_path, format='png')
    plt.close()
    return output_path

# Set the relative directory path
relative_output_dir = "data_files/city_plots/2018_Linear_Regression/"

# Build the full path
output_dir = os.path.join(os.getcwd(), relative_output_dir)

# Ensure the directory exists, and if not, create it
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Integrated loop for model training and visualization
saved_files_combined = []
for city in cities:
    # Preprocess the data
    city_df = region_dfs[city].copy()
    city_df['Date'] = pd.to_datetime(city_df['Date'])
    city_df['ordinal_date'] = city_df['Date'].apply(lambda x: x.toordinal())
    city_df = city_df.dropna(subset=['ZHVI'])

    # Split data and train model
    X = city_df['ordinal_date'].values.reshape(-1, 1)
    y = city_df['ZHVI'].values.reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Add predictions to DataFrame
    city_df['Predictions'] = model.predict(city_df['ordinal_date'].values.reshape(-1, 1))

    # # Display model details
    # print(f"{city} - Model's slope: {model.coef_}")
    # print(f"{city} - Model's y-intercept: {model.intercept_}")
    # print(f"{city} - Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

    # Visualize and save plot
    file_path = save_city_plots(city_df, city, model.coef_, model.intercept_)
    saved_files_combined.append(file_path)

saved_files_combined


['c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/Austin_TX_plot.png',
 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/El_Paso_TX_plot.png',
 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/Dallas_TX_plot.png',
 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/Houston_TX_plot.png',
 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/San_Antonio_TX_plot.png']

## List for future ZHVI Loop Linear Regression

In [525]:
# # Create an empty DataFrame to store future predictions for all cities
# future_df = pd.DataFrame({'Input_Date_Ordinal': dates_monthly_adjusted})
# future_df['Input_Date'] = future_df['Input_Date_Ordinal'].apply(dt.datetime.fromordinal)

# # Iterate over each city to train the model and make predictions
# for city in cities:
#     # Preprocess the data
#     city_df = region_dfs[city].copy()
#     city_df['Date'] = pd.to_datetime(city_df['Date'])
#     city_df['ordinal_date'] = city_df['Date'].apply(lambda x: x.toordinal())
    
#     # Split data and train model
#     X = city_df['ordinal_date'].values.reshape(-1, 1)
#     y = city_df['ZHVI'].values.reshape(-1, 1)
#     model = LinearRegression()
#     model.fit(X, y)
    
#     # Predict future values and store in the DataFrame
#     future_predictions = [float(model.predict([[date]])) for date in dates_monthly_adjusted]
#     future_df[city] = future_predictions

# future_df.head()


# # Starting date ordinal based on the last date in the pivot_df
# current_date_ordinal = pivot_df.index[-1].toordinal()

# # Generate a list of dates incremented by one month for 60 iterations
# dates_monthly_adjusted = [(dt.datetime.fromordinal(current_date_ordinal) + pd.DateOffset(months=i)).toordinal() 
#                           for i in range(60)]

# # Proceeding with the prediction process for each city
# # Create an empty DataFrame to store future predictions for all cities
# future_df = pd.DataFrame({'Input_Date_Ordinal': dates_monthly_adjusted})
# future_df['Input_Date'] = future_df['Input_Date_Ordinal'].apply(dt.datetime.fromordinal)

# # Iterate over each city to train the model and make predictions
# for city in cities:
#     # Preprocess the data
#     city_df = region_dfs[city].copy()
#     city_df['Date'] = pd.to_datetime(city_df['Date'])
#     city_df['ordinal_date'] = city_df['Date'].apply(lambda x: x.toordinal())
    
#     # Split data and train model
#     X = city_df['ordinal_date'].values.reshape(-1, 1)
#     y = city_df['ZHVI'].values.reshape(-1, 1)
#     model = LinearRegression()
#     model.fit(X, y)
    
#     # Predict future values and store in the DataFrame
#     future_predictions = [float(model.predict([[date]])) for date in dates_monthly_adjusted]
#     future_df[city] = future_predictions

# future_df.head()


# # Generate the list of dates again
# dates_monthly_adjusted = [(dt.datetime.fromordinal(current_date_ordinal) + pd.DateOffset(months=i)).toordinal() 
#                           for i in range(60)]

# # Proceeding with the prediction process for each city
# # Create an empty DataFrame to store future predictions for all cities
# future_df = pd.DataFrame({'Input_Date_Ordinal': dates_monthly_adjusted})
# future_df['Input_Date'] = future_df['Input_Date_Ordinal'].apply(dt.datetime.fromordinal)

# # Iterate over each city to train the model and make predictions
# for city in cities:
#     # Preprocess the data
#     city_df = region_dfs[city].copy()
#     city_df['Date'] = pd.to_datetime(city_df['Date'])
#     city_df['ordinal_date'] = city_df['Date'].apply(lambda x: x.toordinal())
    
#     # Split data and train model
#     X = city_df['ordinal_date'].values.reshape(-1, 1)
#     y = city_df['ZHVI'].values.reshape(-1, 1)
#     model = LinearRegression()
#     model.fit(X, y)
    
#     # Predict future values and store in the DataFrame
#     future_predictions = [float(model.predict([[date]])) for date in dates_monthly_adjusted]
#     future_df[city] = future_predictions

# future_df.head()


# # Iterate over each city to train the model and make predictions using the pivot_df directly
# for city in cities:
#     # Preprocess the data
#     city_series = pivot_df[city].dropna()
#     city_dates = city_series.index.to_series().apply(lambda x: x.toordinal()).values.reshape(-1, 1)
#     city_values = city_series.values.reshape(-1, 1)
    
#     # Train model
#     model = LinearRegression()
#     model.fit(city_dates, city_values)
    
#     # Predict future values and store in the DataFrame
#     future_predictions = [float(model.predict([[date]])) for date in dates_monthly_adjusted]
#     future_df[city] = future_predictions

# future_df.head()


In [526]:
# Refining the code using list comprehensions and direct DataFrame operations

# Starting date ordinal
current_date_ordinal = 738732

# Generate a list of dates incremented by one month for 60 iterations
dates_monthly_adjusted = [(dt.datetime.fromordinal(current_date_ordinal) + pd.DateOffset(months=i)).toordinal() 
                          for i in range(60)]

# Adjusting the calculation of predicted values to ensure scalar outputs
outputs_monthly_adjusted_scalar = [float(model.intercept_ + model.coef_[0] * date) for date in dates_monthly_adjusted]

# Create a DataFrame directly from the lists with scalar outputs
df_monthly_refined_scalar = pd.DataFrame({
    'Input_Date_Ordinal': dates_monthly_adjusted,
    'Predicted_Value': outputs_monthly_adjusted_scalar
})

# Convert the ordinal dates to DateTime format for better readability
df_monthly_refined_scalar['Input_Date'] = df_monthly_refined_scalar['Input_Date_Ordinal'].apply(dt.datetime.fromordinal)

df_monthly_refined_scalar.head()  # Display the first few rows of the refined DataFrame with scalar outputs

Unnamed: 0,Input_Date_Ordinal,Predicted_Value,Input_Date
0,738732,279652.221397,2023-07-31
1,738763,281190.486818,2023-08-31
2,738793,282679.130775,2023-09-30
3,738824,284217.396196,2023-10-31
4,738854,285706.040153,2023-11-30


# Lagged features Linear Regression all cities
#### Model trained on all avalible data, visualized on an 80/20 split showing last 5 years

In [494]:
def save_lagged_plots(city_df, city_name, y_test, y_pred, r2, output_dir):
    """
    Visualize and save the lagged feature plots for a given city.
    """
    plt.figure(figsize=(14, 7))
    
    # Plot actual vs predicted values
    plt.plot(city_df.index, y_test, label='Actual', color='blue')
    plt.plot(city_df.index, y_pred, label='Predicted', color='red', linestyle='--')
    
    # Title, labels, and legend
    plt.title(f'Actual vs Predicted ZHVI for {city_name}')
    plt.xlabel('Date')
    plt.ylabel('ZHVI')
    plt.legend(loc='lower right')
    plt.text(0.05, 0.95, f"R-squared = {r2:.2f}", transform=plt.gca().transAxes, fontsize=12, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='red', alpha=0.5))
    plt.grid(True)
    
    # Save the figure
    output_path = os.path.join(output_dir, f"{city_name.replace(' ', '_').replace(',', '')}_lagged_plot.png")
    plt.savefig(output_path, format='png')
    plt.close()
    return output_path

# Set the relative directory path
relative_output_dir = "data_files/city_plots/Lagged_Linear_Regression/"

# Build the full path
full_output_dir = os.path.join(os.getcwd(), relative_output_dir)

# Ensure the directory exists, and if not, create it
if not os.path.exists(full_output_dir):
    os.makedirs(full_output_dir)

# Storing the paths of saved plots
saved_files = []

# Loop through each city to train the models and save the plots
for city in cities:
    # Process the data for the current city
    city_data = pivot_df[[city]].copy()

    # Create lagged features
    city_data["Lag_1"] = city_data[city].shift(1)
    city_data["Lag_2"] = city_data[city].shift(2)
    city_data["Lag_3"] = city_data[city].shift(3)
    city_data = city_data.dropna()

    # Features and target variable
    X = city_data[[city, "Lag_1", "Lag_2", "Lag_3"]]
    y = city_data[city].shift(-1)  # Predicting the next month's value

    # Splitting the data
    X_train, X_test, y_train, y_test = train_test_split(X[:-1], y.dropna(), test_size=0.2, shuffle=False)

    # Model training
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # R-squared calculation
    r2 = r2_score(y_test, y_pred)
    
    # Visualize and save the plots
    file_path = save_lagged_plots(X_test, city, y_test, y_pred, r2, output_dir)
    saved_files.append(file_path)

saved_files


['c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/Austin_TX_lagged_plot.png',
 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/El_Paso_TX_lagged_plot.png',
 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/Dallas_TX_lagged_plot.png',
 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/Houston_TX_lagged_plot.png',
 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/2018_Linear_Regression/San_Antonio_TX_lagged_plot.png']

## List for future ZHVI Loop Lagged Linear Regression

In [502]:
future_predictions = {}
future_dates = pd.date_range(start=pivot_df.index[-1] + pd.DateOffset(months=1), periods=60, freq='M')

for city in cities:
    city_data = pivot_df[[city]].copy()
    
    # Create lagged features
    city_data["Lag_1"] = city_data[city].shift(1)
    city_data["Lag_2"] = city_data[city].shift(2)
    city_data["Lag_3"] = city_data[city].shift(3)
    city_data = city_data.dropna()
    
    # Extract the last values
    future_vals = [city_data[city].iloc[-1], city_data['Lag_1'].iloc[-1], city_data['Lag_2'].iloc[-1]]
    predictions = []
    for _ in range(60):
        pred = models[city].predict([future_vals[-3:]])
        predictions.append(pred[0])
        future_vals.append(pred[0])
    future_predictions[city] = predictions

# Create a DataFrame for the predicted values
future_lagged_linear_regression_df = pd.DataFrame(future_predictions, index=future_dates)
future_lagged_linear_regression_df.head()




Unnamed: 0,"Austin, TX","El Paso, TX","Dallas, TX","Houston, TX","San Antonio, TX"
2023-08-31,477825.651843,208368.871072,375111.086085,305254.749863,293738.398116
2023-09-30,477619.553638,209173.591737,375178.068313,305060.928155,293771.487522
2023-10-31,484566.628596,203152.55262,366106.623346,298960.459028,290904.81093
2023-11-30,480560.878447,205447.244506,371798.667338,303597.328386,292770.373968
2023-12-31,467458.398932,216901.353425,389368.72675,314333.153203,298244.842081


# Polynomial Features Model
### 

In [495]:
# Define the degree of the polynomial function
degree = 2

# List to store R-squared scores for each city
r2_scores = {}

def save_polynomial_plots(X_test, y_test, y_pred, city, r2, output_directory):
    plt.figure(figsize=(14, 7))
    plt.plot(X_test.index, y_test, label='Actual', color='blue')
    plt.plot(X_test.index, y_pred, label='Predicted', color='red', linestyle='--')
    plt.title(f'Actual vs Predicted ZHVI for {city}')
    plt.xlabel('Date')
    plt.ylabel('ZHVI')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.text(0.05, 0.95, f"R-squared = {r2:.2f}", transform=plt.gca().transAxes, fontsize=12, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='red', alpha=0.5))
    
    # Save the figure
    file_name = f"{city.replace(' ', '_').replace(',', '')}_PolyRegression.png"
    output_path = os.path.join(output_directory, file_name)
    plt.savefig(output_path, format='png')
    plt.close()
    return output_path

# Loop through each city and save the plots
saved_files_poly = []

# Loop through each city
for city in cities:
    # Process the data for the current city
    city_data = pivot_df[[city]].copy()

    # Create lagged features
    for lag in range(1, 4):
        city_data[f"Lag_{lag}"] = city_data[city].shift(lag)

    # Drop rows with NaN values (due to lagged features)
    city_data = city_data.dropna()

    # Features and target variable
    X = city_data.drop(columns=[city])
    y = city_data[city].shift(-1)  # We are predicting the next month's value

    # Drop the last row since it has NaN for y
    X = X[:-1]
    y = y.dropna()

    # Split the data into training and testing sets (80-20 split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Create a polynomial regression model
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate the R-squared score and store
    r2 = r2_score(y_test, y_pred)
    r2_scores[city] = r2
    # Save the plot
    file_path = save_polynomial_plots(X_test, y_test, y_pred, city, r2, full_output_dir)
    saved_files_poly.append(file_path)

print(saved_files_poly)

['c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/Lagged_Linear_Regression/Austin_TX_PolyRegression.png', 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/Lagged_Linear_Regression/El_Paso_TX_PolyRegression.png', 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/Lagged_Linear_Regression/Dallas_TX_PolyRegression.png', 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/Lagged_Linear_Regression/Houston_TX_PolyRegression.png', 'c:\\Users\\perry\\OneDrive\\Desktop\\MyRepos\\project4-team1\\data_files/city_plots/Lagged_Linear_Regression/San_Antonio_TX_PolyRegression.png']


# Attempt at RFR using GridSearchCV and TimeSeriesSplit 

In [420]:
def train_best_rfr_for_city(pivot_df, city, max_lags=6):
    best_r2 = float('-inf')
    best_model = None
    best_lags = None
    best_params = {}
    
    param_grid = {
        'n_estimators': [10, 50, 100],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt']
    }
    
    tscv = TimeSeriesSplit(n_splits=5)
    
    for lags in range(1, max_lags + 1):
        city_data = pivot_df[[city]].copy()
        for i in range(1, lags + 1):
            city_data[f"Lag_{i}"] = city_data[city].shift(i)
        
        city_data = city_data.dropna()
        X = city_data.drop(columns=[city])
        y = city_data[city].shift(-1).dropna()
        X = X.iloc[:-1]
        
        train_size = int(len(X) * 0.8)
        X_train, X_test = X[:train_size], X[train_size:]
        y_train, y_test = y[:train_size], y[train_size:]
        
        rf = RandomForestRegressor()
        grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=tscv, n_jobs=-1, verbose=0)
        grid_search.fit(X_train, y_train)
        
        best_rf = grid_search.best_estimator_
        y_pred = best_rf.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        
        if r2 > best_r2:
            best_r2 = r2
            best_model = best_rf
            best_lags = lags
            best_params = best_rf.get_params()
    
    return best_model, best_r2, best_lags, best_params


In [421]:
cities = ["Austin, TX", "El Paso, TX", "Dallas, TX", "Houston, TX", "San Antonio, TX"]
results = {}

for city in cities:
    model, r2, lags, params = train_best_rfr_for_city(pivot_df, city)
    results[city] = {"R-squared": r2, "Lags": lags, "Best Model Parameters": params}


405 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
219 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\perry\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\perry\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\perry\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\perry\AppData\Local\Programs\Python\Python311\L

In [425]:
import pprint

pprint.pprint(results)


{'Austin, TX': {'Best Model Parameters': {'bootstrap': True,
                                          'ccp_alpha': 0.0,
                                          'criterion': 'squared_error',
                                          'max_depth': 10,
                                          'max_features': 'sqrt',
                                          'max_leaf_nodes': None,
                                          'max_samples': None,
                                          'min_impurity_decrease': 0.0,
                                          'min_samples_leaf': 1,
                                          'min_samples_split': 2,
                                          'min_weight_fraction_leaf': 0.0,
                                          'n_estimators': 10,
                                          'n_jobs': None,
                                          'oob_score': False,
                                          'random_state': None,
                             