In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pickle

In [None]:
# Load the two input tables; both paths are relative to the notebook's working directory.
# Later cells read Date, LocationID and the PU_*/DO_* columns from taxi_df,
# and YEAR, LocationID and FULLVAL from real_estate_df.
taxi_df = pd.read_csv('month.csv')
real_estate_df = pd.read_csv('Real_Estate.csv')

In [None]:
# Preview the first rows of the taxi table as a quick check of the load.
taxi_df.head()

In [None]:
# Preview the first rows of the real-estate table as a quick check of the load.
real_estate_df.head()

In [None]:
# Sum FULLVAL within each (YEAR, LocationID) pair.
grouped_real_estate_df = (
    real_estate_df
    .groupby(['YEAR', 'LocationID'])['FULLVAL']
    .sum()
    .reset_index()
)

# Parse the taxi dates and derive the calendar year used as the join key.
taxi_df['Date'] = pd.to_datetime(taxi_df["Date"])
taxi_df["YEAR"] = taxi_df['Date'].dt.year

# Every taxi metric below is summed within each (LocationID, YEAR) pair.
taxi_sum_columns = [
    'PU_passenger',
    'PUTripDistance_count',
    'PickUp_count',
    'PU_fare',
    'DO_passenger',
    'DropOff_count',
    'DO_fare',
    'DOT_fare',
    'DOTripDistance_count',
]
grouped_taxi_df = (
    taxi_df
    .groupby(['LocationID', 'YEAR'])
    .agg(dict.fromkeys(taxi_sum_columns, 'sum'))
    .reset_index()
)

# Restrict both tables to 2017-2019; Series.between is inclusive on both ends.
taxi_data_filtered = grouped_taxi_df[grouped_taxi_df['YEAR'].between(2017, 2019)]
real_estate_filtered = grouped_real_estate_df[grouped_real_estate_df['YEAR'].between(2017, 2019)]

# Inner join (the pd.merge default): only (LocationID, YEAR) pairs present in
# both tables survive.
merged_df = pd.merge(
    taxi_data_filtered,
    real_estate_filtered,
    on=['LocationID', 'YEAR'],
    suffixes=('_taxi', '_real_estate'),
)

In [None]:
# Preview the merged taxi / real-estate table.
merged_df.head()

In [None]:
# Combined drop-off + pick-up totals for each merged row.
merged_df['passenger'] = merged_df['DO_passenger'].add(merged_df['PU_passenger'])
merged_df['fare'] = merged_df['DO_fare'].add(merged_df['PU_fare'])

In [None]:
# Pooled OLS: regress FULLVAL on taxi activity using every row of merged_df.
feature_columns = ['passenger', 'fare', 'DO_passenger', 'DropOff_count']
y = merged_df['FULLVAL']

# add_constant supplies the intercept column that sm.OLS does not add itself.
X = sm.add_constant(merged_df[feature_columns])

model = sm.OLS(y, X)
results = model.fit()

# Full regression report (coefficients, fit statistics, residual diagnostics).
print(results.summary())


1. **Dep. Variable**: The dependent variable in the regression analysis is `FULLVAL`.

2. **Model**: The model is an Ordinary Least Squares (OLS) regression.

3. **Method**: The method used for regression is Least Squares.

4. **Date and Time**: The date and time when the analysis was performed.

5. **No. Observations**: The number of observations or data points used in the regression analysis is 132.

6. **R-squared**: The coefficient of determination (R-squared) measures how well the independent variables explain the variability in the dependent variable. In this case, it's 0.840, which means that approximately 84% of the variability in `FULLVAL` can be explained by the independent variables in the model.

7. **Adj. R-squared**: The adjusted R-squared adjusts the R-squared value based on the number of independent variables in the model. It's 0.835 here.

8. **F-statistic**: The F-statistic tests the overall significance of the model. A larger F-statistic suggests that at least one independent variable is significantly related to the dependent variable. The value here is 167.2, and the associated probability (Prob (F-statistic)) is very close to zero, indicating that the model as a whole is statistically significant.

9. **P-values (P>|t|)**: For each independent variable, the p-value tests the null hypothesis that the coefficient of the variable is zero (i.e., the variable has no effect on the dependent variable). If the p-value is small (typically below 0.05), you can reject the null hypothesis and conclude that the variable is statistically significant. In this case, all four independent variables have very small p-values, indicating their significance.

10. **Coefficient (coef)**: The coefficients represent the estimated change in the dependent variable for a one-unit change in the corresponding independent variable, holding the other variables constant. For example, a one-unit increase in 'passenger' is associated with an estimated decrease of approximately $3497 in 'FULLVAL' when the other variables are held constant. Because 'passenger' is defined as 'PU_passenger' + 'DO_passenger' and 'DO_passenger' is also a regressor, "holding the other variables constant" here means the extra passenger is a pick-up passenger.

11. **Standard Error (std err)**: This measures the variability of the coefficient estimate. Smaller standard errors indicate more precise estimates.

12. **t-statistic**: The t-statistic is the coefficient divided by its standard error. It measures how many standard errors the coefficient estimate lies away from zero. Larger absolute t-values are stronger evidence against the null hypothesis that the coefficient is zero.

13. **Omnibus, Prob(Omnibus), Skew, Kurtosis**: These statistics describe the distribution of the residuals. The Omnibus test checks the residuals for normality, and Prob(Omnibus) is its p-value. Skew measures the asymmetry of the residual distribution (0 for a normal distribution), and Kurtosis measures its "tailedness" (3 for a normal distribution). A low Prob(Omnibus), a skew far from 0, or a kurtosis far from 3 suggests that the residuals are not normally distributed.

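14. **Durbin-Watson**: This statistic helps detect first-order autocorrelation, i.e. dependence between the residuals of consecutive observations. It ranges from 0 to 4: values near 2 indicate no autocorrelation, values toward 0 indicate positive autocorrelation, and values toward 4 indicate negative autocorrelation. The value here is close to 2, which suggests no significant autocorrelation. Note that the statistic depends on the order of the rows, and the rows here are location-year records rather than a single time series, so it should be interpreted with caution.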

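15. **Cond. No. (Condition Number)**: This is a diagnostic (not a formal test) for multicollinearity, which is the presence of high correlation between independent variables. A large condition number indicates that the design matrix is close to singular, which can make the coefficient estimates unstable. Some collinearity is expected in this model because 'passenger' contains 'DO_passenger' by construction, and the regressors are on very different scales.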



In [None]:
# Fit a separate OLS model for every LocationID and keep only its coefficients.
# NOTE(review): merged_df is built upstream from yearly aggregates filtered to
# 2017-2019, so each LocationID appears to have at most three rows for up to
# five parameters - confirm these per-location fits are meaningful.
unique_location_ids = merged_df['LocationID'].unique()
predictor_columns = ['passenger', 'fare', 'DO_passenger', 'DropOff_count']

# LocationID -> pandas Series of fitted parameters (results.params)
coef_dict = {}

for location_id in unique_location_ids:
    # Rows belonging to this location only
    group_data = merged_df.loc[merged_df['LocationID'] == location_id]

    # Target, and predictors with an intercept column added
    y = group_data['FULLVAL']
    X = sm.add_constant(group_data[predictor_columns])

    model = sm.OLS(y, X)
    results = model.fit()

    coef_dict[location_id] = results.params



In [None]:
# Load the 2020 table used by the ARIMA cells below (path relative to the working directory).
# Later cells read Date, LocationID, PU_/DO_ passenger and fare columns, and DropOff_count from it.
df_2020 = pd.read_csv('Month_2020.csv')

In [None]:
# Combined pick-up + drop-off totals for each 2020 row.
df_2020['passenger'] = df_2020['PU_passenger'].add(df_2020['DO_passenger'])
df_2020['fare'] = df_2020['PU_fare'].add(df_2020['DO_fare'])

In [None]:
# Back-test a per-location ARIMA fare model (with exogenous regressors) on a
# chronological 80/20 split, recording and plotting the relative error.

# Parse dates so each location's rows can be ordered in time and plotted on a date axis
df_2020['Date'] = pd.to_datetime(df_2020['Date'])

# Per-location metrics, appended in the order the locations are processed
avg_error_rates = []
min_error_rates = []

# LocationIDs to leave out of the back-test (empty list = evaluate every location)
values_to_exclude = []

# Obtain the unique LocationIDs, then drop the excluded ones with a boolean mask
unique_location_ids = df_2020['LocationID'].unique()
updated_location_ids = unique_location_ids[~np.isin(unique_location_ids, values_to_exclude)]

# ARIMA model parameters: (AR order, difference order, MA order)
order = (1, 1, 2)

# Iterate over the filtered IDs so that values_to_exclude actually takes effect
for loc_id in updated_location_ids:
    # Rows for this location, ordered by date so the positional split below is
    # a true past/future split even if the CSV rows are not already sorted
    loc_data = df_2020[df_2020['LocationID'] == loc_id].sort_values('Date', kind='stable')

    # First 80% of the rows train the model, the remaining rows are held out
    train_size = int(len(loc_data) * 0.8)
    train_data = loc_data.iloc[:train_size]
    test_data = loc_data.iloc[train_size:]

    # Exogenous regressors (passenger, DropOff_count) and the fare target
    train_passenger = train_data['passenger'].values
    train_dropoff_count = train_data['DropOff_count'].values

    train_features = np.column_stack((train_passenger, train_dropoff_count))
    train_target = train_data['fare'].values

    # Train an ARIMA model with exogenous regressors
    model = sm.tsa.ARIMA(train_target, exog=train_features, order=order)
    model_fit = model.fit()

    # Forecast the held-out positions; out-of-sample prediction needs the
    # matching exogenous rows
    test_features = np.column_stack((test_data['passenger'].values, test_data['DropOff_count'].values))
    test_predictions_fare = model_fit.predict(start=train_size, end=len(loc_data) - 1, exog=test_features)

    # Absolute percentage error per held-out row.
    # NOTE: this divides by the actual fare, so a zero fare yields inf/NaN.
    actual_fare = test_data['fare'].values
    error_rate = np.abs(test_predictions_fare - actual_fare) / actual_fare
    avg_error_rate = np.mean(error_rate)

    avg_error_rates.append(avg_error_rate)
    min_error_rates.append(np.min(error_rate))

    # One figure per location: training actuals, held-out actuals, forecast
    fig, ax = plt.subplots()
    ax.plot(train_data['Date'], train_target, label='Train Actual')
    ax.plot(test_data['Date'], actual_fare, label='Test Actual')
    ax.plot(test_data['Date'], test_predictions_fare, label='Test Predictions')
    ax.set_xlabel('Date')
    ax.set_ylabel('fare')
    ax.set_title(f'LocationID {loc_id} - fare Forecast\nAverage Error Rate: {avg_error_rate:.2%}')
    ax.legend()
    plt.show()
    # Release the figure so a long list of locations does not accumulate open figures
    plt.close(fig)

# Print the average and minimum error rates
print("Average Error Rates:", avg_error_rates)
print("Minimum Error Rates:", min_error_rates)



In [None]:
def calculate_mean_without_extremes(arr):
    """Return the mean of `arr` after dropping one smallest and one largest value.

    The values are sorted, the first and last elements of the sorted sequence
    are discarded, and the remaining elements are averaged.

    Returns None when `arr` has two or fewer elements, because nothing would
    be left to average.
    """
    count = len(arr)
    if count > 2:
        ordered = list(arr)
        ordered.sort()
        # Everything between the single minimum and the single maximum
        interior = ordered[1:count - 1]
        return sum(interior) / (count - 2)
    return None

# Summarise the per-location back-test metrics collected in the earlier cell.
# First line: trimmed mean of each location's smallest error rate.
# NOTE(review): the second label says "greatest" but the value printed is the
# minimum of the average error rates - confirm which one is intended.
print("Model best-case average error rate:", calculate_mean_without_extremes(min_error_rates))
print("The greatest model error rate：", min(avg_error_rates))




In [None]:
# Fit one ARIMA model (with exogenous regressors) per LocationID on all of its
# 2020 rows and keep the fitted results in a dictionary. No forecasting or
# plotting happens in this cell.

# LocationIDs to leave out (empty list = fit every location)
values_to_exclude = []

# Unique LocationIDs, minus the excluded ones (boolean mask)
unique_locs = df_2020['LocationID'].unique()
remaining_locs = unique_locs[~np.isin(unique_locs, values_to_exclude)]

# ARIMA model parameters: (AR order, difference order, MA order)
# NOTE(review): the back-test cell above evaluates order (1, 1, 2); confirm
# that fitting the stored models with (2, 1, 2) is intentional.
order = (2, 1, 2)

# LocationID -> fitted ARIMA results object
arima_models = {}

for loc_id in remaining_locs:
    # Rows for this location in chronological order; ARIMA treats the row
    # order as the time order, so the series must be sorted by date
    loc_data = df_2020[df_2020['LocationID'] == loc_id].sort_values('Date', kind='stable')

    # Exogenous regressors: total passengers and drop-off count
    # NOTE(review): 'passenger' is PU_passenger + DO_passenger, so it contains
    # the DO_passenger target - confirm this is intended.
    train_passenger = loc_data['passenger'].values
    train_DOcount = loc_data['DropOff_count'].values

    train_features = np.column_stack((train_passenger, train_DOcount))
    train_target = loc_data['DO_passenger'].values

    # Train the model on the full series (there is no hold-out in this cell)
    model = ARIMA(train_target, exog=train_features, order=order)
    model_fit = model.fit()

    # Add the fitted model to the dictionary
    arima_models[loc_id] = model_fit



In [None]:
# Read the text of 'Task_4.py' and store that string in a pickle file.
# NOTE(review): this pickles the script's source text, not the fitted
# `arima_models` dictionary - confirm that this is the intended artifact.
with open('Task_4.py', 'r') as source_file:
    notebook_code = source_file.read()

# Serialize the source string into 'Task_4.pkl' (opened in binary write mode)
with open('Task_4.pkl', 'wb') as pickle_file:
    pickle.dump(notebook_code, pickle_file)
