In [1]:
import pandas as pd
import pickle
from statsmodels.tsa.arima.model import ARIMA
import numpy as np

## Part A: Predicting Area Traffic and Business Activity using Taxi Data

In [2]:
# Reading a dataset
data = pd.read_csv('2020-2021month.csv')

- #### User Input Items

In [3]:
# Define the start time of the forecast and the number of months of the forecast
# The user can enter
location_id = 4
start_time = pd.to_datetime('2023-12-01')
num_months = 3

####  Drop_Off count

In [4]:
with open('model_DO.pkl', 'rb') as f:
    models = pickle.load(f)

# Get the model corresponding to the LocationID
model = models[location_id]

# Get the historical data needed for the model, either training data or other historical data
# In this example, we use the training data as historical data
location_data = data[data['LocationID'] == location_id]
historical_data = location_data['DropOff_count'].values

# Calculate the end of the forecast based on the start of the forecast and the number of months in the forecast
end_time = start_time + pd.DateOffset(months=num_months)

# Use the model to make predictions and get future data
predictions_DO = model.predict(start=len(historical_data), end=len(historical_data) + num_months - 1)

# Print prediction results
print(f'Predictions for LocationID {location_id} from {start_time} to {end_time}:')
for i, prediction in enumerate(predictions_DO):
    prediction_date = start_time + pd.DateOffset(months=i)
    print(f'{prediction_date}: {prediction}')


Predictions for LocationID 4 from 2023-12-01 00:00:00 to 2024-03-01 00:00:00:
2023-12-01 00:00:00: 13271.823352004822
2024-01-01 00:00:00: 13493.18042524825
2024-02-01 00:00:00: 13975.820082990753


#### Pick_UP count

In [5]:
with open('model_PU.pkl', 'rb') as f:
    models = pickle.load(f)

# Get the model corresponding to the LocationID
model = models[location_id]

# Get the historical data needed for the model, either training data or other historical data
# In this example, we use the training data as historical data
location_data = data[data['LocationID'] == location_id]
historical_data = location_data['PickUp_count'].values

# Calculate the end of the forecast based on the start of the forecast and the number of months in the forecast
end_time = start_time + pd.DateOffset(months=num_months)

# Use the model to make predictions and get future data
predictions_PU = model.predict(start=len(historical_data), end=len(historical_data) + num_months - 1)

times = []
# Print prediction results
print(f'Predictions for LocationID {location_id} from {start_time} to {end_time}:')
for i, prediction in enumerate(predictions_PU):
    prediction_date = start_time + pd.DateOffset(months=i)
    times.append(prediction_date)
    print(f'{prediction_date}: {prediction}')


Predictions for LocationID 4 from 2023-12-01 00:00:00 to 2024-03-01 00:00:00:
2023-12-01 00:00:00: 3121.4483169757746
2024-01-01 00:00:00: 2959.856103029276
2024-02-01 00:00:00: 2877.354792466138


#### Use the predicted values above to determine the number of passengers

In [6]:
# Load the trained models from the .pkl file
with open('models_Passenger.pkl', 'rb') as f:
    models_dict = pickle.load(f)

# Get the model for the selected LocationID from the models_dict
model_for_location = models_dict[location_id]

# Prepare the exogenous features for prediction (future values of DropOff_count and PickUp_count)
future_times = times
future_features = pd.DataFrame({'DropOff_count': predictions_DO,  # Replace with the desired future values
                                'PickUp_count': predictions_PU})    # Replace with the desired future values

# Make predictions for the future years using the ARIMA model
predictions = model_for_location.predict(start=len(future_features), end=len(future_features) + len(future_times) - 1,
                                         exog=future_features)

# Print the predictions
print("Predicted 'passenger' values for LocationID", location_id, "for years", future_times, ":", predictions)


Predicted 'passenger' values for LocationID 4 for years [Timestamp('2023-12-01 00:00:00'), Timestamp('2024-01-01 00:00:00'), Timestamp('2024-02-01 00:00:00')] : [13544.00801531 11634.87827421 11165.56697285]


## Part B: Forecasting the Overall Trend of Real Estate Prices using Linear Features of Real Estate Total Value:

- #### User Input Items

In [7]:
year = [2023, 2024, 2025]
location_id = 4

In [8]:


# Load the models from the .pkl file
with open('models_RealEstate.pkl', 'rb') as f:
    models_dict = pickle.load(f)



# Get the model for the specific LocationID
model_for_location = models_dict[location_id]

# Define the future years for which you want to make predictions
future_years = np.array(year).reshape(-1, 1)

# Make predictions for the future years
predictions = model_for_location.predict(future_years)

# Get the most recent 'FULLVAL' value for the given LocationID
most_recent_fullval = model_for_location.predict(future_years[0].reshape(-1, 1))

# Calculate the growth rate as a percentage
growth_rate = ((predictions - most_recent_fullval) / most_recent_fullval) * 100

# Print the predictions and growth rate
for i, year in enumerate(future_years.flatten()):
    print(f"Predicted FULLVAL for LocationID {location_id} in year {year}: {predictions[i]}")
    print(f"Growth Rate for LocationID {location_id} in year {year}: {growth_rate[i]:.2f}%\n")


Predicted FULLVAL for LocationID 4 in year 2023: 3527604623.666687
Growth Rate for LocationID 4 in year 2023: 0.00%

Predicted FULLVAL for LocationID 4 in year 2024: 3699292684.5
Growth Rate for LocationID 4 in year 2024: 4.87%

Predicted FULLVAL for LocationID 4 in year 2025: 3870980745.333313
Growth Rate for LocationID 4 in year 2025: 9.73%



## PartC. Understanding the Impact of Different Variables on Real Estate Prices through OLS Regression Results:

- #### User Input Items

In [9]:
# Define the start time of the forecast and the number of months of the forecast
# The user can enter
location_id = 4
start_time = pd.to_datetime('2024-7-01')
num_months = 12

In [10]:
# Reading the model
with open('Ensembling.pkl', 'rb') as f:
    models_dict = pickle.load(f)

In [11]:
DO_dict = models_dict['DO']
PU_dict = models_dict['PU']
Pa_dict = models_dict['Pa']
fare_dict = models_dict['fare']
DOP_dict = models_dict['DOP']
coef_dict = models_dict['coef']

- Predicted required data.

In [12]:
data = pd.read_csv('2020-2021month.csv')

# DropOff_count

# Get the historical data needed for the model, either training data or other historical data
# In this example, we use the training data as historical data
location_data = data[data['LocationID'] == location_id]
historical_data = location_data['DropOff_count'].values

# Calculate the end of the forecast based on the start of the forecast and the number of months in the forecast
end_time = start_time + pd.DateOffset(months=num_months)

# Use the model to make predictions and get future data
predictions_DO = DO_dict[location_id].predict(start=len(historical_data), end=len(historical_data) + num_months - 1)

# PickUp_count

historical_data = location_data['PickUp_count'].values

# Use the model to make predictions and get future data
predictions_PU = PU_dict[location_id].predict(start=len(historical_data), end=len(historical_data) + num_months - 1)

future_times = []
print(f'Predictions for LocationID {location_id} from {start_time} to {end_time}:')
for i, prediction in enumerate(predictions_DO):
    prediction_date = start_time + pd.DateOffset(months=i)
    future_times.append(prediction_date)
    print(f'{prediction_date}: {prediction}')


Predictions for LocationID 4 from 2024-07-01 00:00:00 to 2025-07-01 00:00:00:
2024-07-01 00:00:00: 13271.823352004822
2024-08-01 00:00:00: 13493.18042524825
2024-09-01 00:00:00: 13975.820082990753
2024-10-01 00:00:00: 14319.421730693837
2024-11-01 00:00:00: 14736.59824990118
2024-12-01 00:00:00: 15114.42973584186
2025-01-01 00:00:00: 15512.888773795103
2025-02-01 00:00:00: 15900.123670812667
2025-03-01 00:00:00: 16293.051179905018
2025-04-01 00:00:00: 16682.68682866883
2025-05-01 00:00:00: 17073.80244676684
2025-06-01 00:00:00: 17463.863806111556


In [13]:
# Passenger

# Prepare the exogenous features for prediction (future values of DropOff_count and PickUp_count)
future_features = pd.DataFrame({'DropOff_count': predictions_DO,  # Replace with the desired future values
                                'PickUp_count': predictions_PU})    # Replace with the desired future values

# Make predictions for the future years using the ARIMA model
prediction_Pa = Pa_dict[location_id].predict(start=0, end=len(future_times)- 1,
                                         exog=future_features)

In [14]:
# fare


# Prepare the exogenous features for prediction (future values of DropOff_count and PickUp_count)
future_features = pd.DataFrame({'passenger': prediction_Pa,  # Replace with the desired future values
                                'DropOff_count': predictions_DO})    # Replace with the desired future values

# Make predictions for the future years using the ARIMA model
prediction_fare = fare_dict[location_id].predict(start=0, end=len(future_times)- 1,
                                         exog=future_features)

In [15]:
# DO_passenger


# Prepare the exogenous features for prediction (future values of DropOff_count and PickUp_count)
future_features = pd.DataFrame({'passenger': prediction_Pa,  # Replace with the desired future values
                                'DropOff_count': predictions_DO})    # Replace with the desired future values

# Make predictions for the future years using the ARIMA model
prediction_DOP = DOP_dict[location_id].predict(start=0, end=len(future_times)- 1,
                                         exog=future_features)

- Read data for the year 2021.

In [16]:
data['Date'] = pd.to_datetime(data['Date'])
filtered_data = data[(data['Date'].dt.year == 2021) & (data['LocationID'] == location_id)]

In [17]:
passenger = filtered_data['PU_passenger'].sum() + filtered_data['DO_passenger'].sum()
fare = filtered_data['PU_fare'].sum() + filtered_data['DO_fare'].sum()
DO_passenger = filtered_data['DO_passenger'].sum()
DropOff_count = filtered_data['DropOff_count'].sum()

- To view the coefficient data.

In [18]:
print(coef_dict[location_id])

const                0.668977
passenger       -11448.001369
fare             -2914.884336
DO_passenger     29514.906651
DropOff_count    34397.668224
dtype: float64


- The impact of taxi data on real estate is positive if the Influence_Factor is positive and negative if the Influence_Factor is negative.

In [20]:
Influence_Factor = ((np.sum(prediction_Pa) - passenger) / passenger) * coef_dict[location_id][1] + \
      ((np.sum(prediction_fare) - fare) / fare) * coef_dict[location_id][2] + \
      ((np.sum(prediction_DOP) - DO_passenger) / DO_passenger) * coef_dict[location_id][3] + \
      ((np.sum(predictions_DO) - DropOff_count) / DropOff_count) * coef_dict[location_id][4] + \
      coef_dict[location_id][0]

In [22]:
print(Influence_Factor)

88464.72141419831
