<a href="https://colab.research.google.com/github/prithesh07/Prob_Stats_Complete_Project/blob/main/Probstats_FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

3.1.1 Dataset Cleaning

In [None]:
import pandas as pd
import numpy as np

# Load Data
csv1 = pd.read_csv('2022-02.csv')
csv2 = pd.read_csv('2022-03.csv')

csv1['month'] = 'feb'
csv2['month'] = 'march'

combined = pd.concat([csv1, csv2], ignore_index=True)
combined['lpep_pickup_datetime'] = pd.to_datetime(combined['lpep_pickup_datetime'])

combined.head()

Unnamed: 0,lpep_pickup_datetime,passenger_count,trip_distance,trip_duration,fare_amount,tip_amount,congestion_surcharge,month
0,2022-02-01 00:20:21,1.0,1.16,249.0,5.5,1.02,0.0,feb
1,2022-02-01 00:32:26,1.0,0.57,185.0,4.5,0.0,0.0,feb
2,2022-02-01 00:17:27,1.0,0.0,1637.0,42.2,0.0,0.0,feb
3,2022-02-01 00:45:37,1.0,16.62,2499.0,49.0,0.0,0.0,feb
4,2022-02-01 00:06:46,1.0,5.97,1400.0,21.0,4.5,2.75,feb


In [None]:
# 3.1.1 Dataset Cleaning

# Remove rows which are not part of feb or march
combined = combined[(combined['lpep_pickup_datetime'].dt.month == 2) | (combined['lpep_pickup_datetime'].dt.month == 3)]

#Delete the rows which have trip_distance equal to 0
combined = combined[combined['trip_distance'] != 0]

assert combined['trip_distance'].notnull().all(), "There are null values in 'trip_distance'"
assert (combined['trip_distance'] > 0).all(), "'trip_distance' values are not greater than 0"

num_cols = combined.select_dtypes(include=[np.number])

for col in num_cols.columns:
    combined.loc[combined[col] < 0, col] = np.nan

combined.loc[combined['trip_duration'] == 0, 'trip_duration'] = np.nan
combined.loc[combined['fare_amount'] == 0, 'fare_amount'] = np.nan

# Check if the first row originally had any NaN values and replace values if so
first_row_idx = combined.index[0]
if combined.loc[first_row_idx].isnull().any():
    combined.loc[first_row_idx, ['passenger_count', 'trip_distance', 'trip_duration', 'fare_amount', 'tip_amount', 'congestion_surcharge']] = [2, 2.42, 398.2, 14.3, 2.0, 0.0]

# Check if the last row originally had any NaN values and replace values if so
last_row_idx = combined.index[-1]
if combined.loc[last_row_idx].isnull().any():
    combined.loc[last_row_idx, ['passenger_count', 'trip_distance', 'trip_duration', 'fare_amount', 'tip_amount', 'congestion_surcharge']] = [1, 5.77, 821.3, 29.3, 4.0, 0.0]

3.1.2 Outlier Detection

In [None]:
# 3.1.2 Outlier Detection
alpha = 1.5

# Function to identify outliers based on Tukey's rule
def identify_outliers(series, alpha=1.5):
    q1 = custom_quantile(series, 0.25)
    q3 = custom_quantile(series, 0.75)
    iqr = q3 - q1

    lower_bound = q1 - alpha * iqr
    upper_bound = q3 + alpha * iqr

    return (series < lower_bound) | (series > upper_bound)

def custom_quantile(series, q):
    sorted_series = series.dropna().sort_values().reset_index(drop=True)
    n = len(sorted_series)

    position = q * (n - 1)

    if position.is_integer():
        return sorted_series[int(position)]
    else:
        lower_index = int(np.floor(position))
        upper_index = int(np.ceil(position))
        return sorted_series[lower_index] + (position - lower_index) * (sorted_series[upper_index] - sorted_series[lower_index])

for col in ["passenger_count", "trip_duration", "fare_amount"]:
    outliers_mask = identify_outliers(combined[col])
    num_outliers = outliers_mask.sum()
    combined.loc[outliers_mask, col] = np.nan

    print(f"Number of elements identified as outlier in {col}: {num_outliers}")

Number of elements identified as outlier in passenger_count: 17987
Number of elements identified as outlier in trip_duration: 7686
Number of elements identified as outlier in fare_amount: 8782


In [None]:
# Check if the first row originally had any NaN values and replace values if so
first_row_idx = combined.index[0]
if combined.loc[first_row_idx].isnull().any():
    combined.loc[first_row_idx, ['passenger_count', 'trip_distance', 'trip_duration', 'fare_amount', 'tip_amount', 'congestion_surcharge']] = [2, 2.42, 398.2, 14.3, 2.0, 0.0]

# Check if the last row originally had any NaN values and replace values if so
last_row_idx = combined.index[-1]
if combined.loc[last_row_idx].isnull().any():
    combined.loc[last_row_idx, ['passenger_count', 'trip_distance', 'trip_duration', 'fare_amount', 'tip_amount', 'congestion_surcharge']] = [1, 5.77, 821.3, 29.3, 4.0, 0.0]

columns_with_nan = [col for col in combined.columns if combined[col].isnull().any()]
combined = combined.sort_values(by='lpep_pickup_datetime').reset_index(drop=True)


3.1.3 Perform linear interpolation of missing data-points

In [None]:
#3.1.3 Perform linear interpolation of missing data-points

# Function to calculate the slope between two points
def calculate_slope(x1, y1, x2, y2):
    if x2 == x1:
        return 0  # Avoid division by zero
    return (y2 - y1) / (x2 - x1)

# Interpolate missing data points for each column with NaN values
for column in columns_with_nan:
    x1 = None
    y1 = None
    x2 = None
    y2 = None

    for idx in range(len(combined)):
        # If it's a non-null value, update the last known time. Using interpolated values as known values for further iterations ensures data continuity and completeness in sparse datasets, allowing for more accurate and iterative refinement of missing values. It simplifies the implementation process, aligning with common practices in data interpolation. This approach maintains the assumption of linear trends, providing consistent and practical estimates throughout the dataset
        if not pd.isna(combined.loc[idx, column]):
            x1 = combined.loc[idx, 'lpep_pickup_datetime'].timestamp()
            y1 = combined.loc[idx, column] / combined.loc[idx, 'trip_distance']
        else:
            # If it's a null value, find the next known point
            for i in range(idx + 1, len(combined)):
                if not pd.isna(combined.loc[i, column]):
                    x2 = combined.loc[i, 'lpep_pickup_datetime'].timestamp()
                    y2 = combined.loc[i, column] / combined.loc[i, 'trip_distance']

                    if(x2 == x1):
                      break

                    # Calculate the slope and interpolate
                    slope = calculate_slope(x1, y1, x2, y2)

                    # Get the current time and calculate interpolation
                    current_time = combined.loc[idx, 'lpep_pickup_datetime'].timestamp()
                    interpolated_value = y1 + slope * (current_time - x1)

                    # Scale by 'trip_distance'
                    current_distance = combined.loc[idx, 'trip_distance']
                    combined.loc[idx, column] = interpolated_value * current_distance

                    break  # Found the next known point, break the loop

# Verify if there are any points that are still NaN
nan_check = combined.isnull().sum()
print("NaN values remaining in each column:")
print(nan_check)


NaN values remaining in each column:
lpep_pickup_datetime     0
passenger_count          4
trip_distance            0
trip_duration           16
fare_amount             29
tip_amount               0
congestion_surcharge     4
month                    0
dtype: int64


In [None]:
#handling cases where x2 was equal to x1
combined = combined.dropna()

# Verify if there are any points that are still NaN
nan_check = combined.isnull().sum()
print("NaN values remaining in each column:")
print(nan_check)

NaN values remaining in each column:
lpep_pickup_datetime    0
passenger_count         0
trip_distance           0
trip_duration           0
fare_amount             0
tip_amount              0
congestion_surcharge    0
month                   0
dtype: int64


3.2 Add total_amount column

In [None]:
#3.2 Add total_amount Column
pd.options.mode.chained_assignment = None
total_amount = (
    combined['fare_amount'] +
    combined['tip_amount'] +
    combined['congestion_surcharge']
).copy()

# Append the total_amount column to the combined DataFrame
combined.loc[:, 'total_amount'] = total_amount
# Check if any NaN values exist in 'total_amount'
assert not combined['total_amount'].isnull().any(), "Some 'total_amount' values are NaN"

combined.head()

Unnamed: 0,lpep_pickup_datetime,passenger_count,trip_distance,trip_duration,fare_amount,tip_amount,congestion_surcharge,month,total_amount
0,2022-02-01 00:01:11,1.0,1.77,527.0,8.5,1.96,0.0,feb,10.46
1,2022-02-01 00:02:18,1.0,7.11,1369.0,24.0,0.0,2.75,feb,26.75
2,2022-02-01 00:05:12,1.0,1.45,508.0,7.0,2.49,0.0,feb,9.49
3,2022-02-01 00:05:12,1.0,1.06,934.0,10.0,0.0,0.0,feb,10.0
4,2022-02-01 00:05:41,1.0,2.47,697.0,10.5,0.0,0.0,feb,10.5


3.3 Performing Wald’s test, Z-test and t-test

In [None]:
from scipy.stats import norm,t
# Check if any NaN values exist in 'total_amount'
assert not combined['total_amount'].isnull().any(), "Some 'total_amount' values are NaN"
critical_value=norm.ppf(0.975)
print (f"Critical value at 95% cI for 2 sided test : {critical_value}")

Critical value at 95% cI for 2 sided test : 1.959963984540054


for trip distance

In [None]:
#wald
feb_trip_distance_mean=combined[combined['month']=='feb']['trip_distance'].mean()
march_trip_distance_mean= combined[combined['month'] =='march']['trip_distance'].mean()
num_feb=combined[combined['month'] == 'feb']['trip_distance'].count()
num_march=combined[combined['month'] == 'march']['trip_distance'].count()
feb_trip_distance_variance=feb_trip_distance_mean/num_feb
march_trip_distance_variance=march_trip_distance_mean/num_march
wald_statistic_trip= (feb_trip_distance_mean -march_trip_distance_mean)/np.sqrt(feb_trip_distance_variance +march_trip_distance_variance)
print(f"Wald's Test Statistic for Trip Distance: {wald_statistic_trip}")
print(f"Significant Difference between 2 months for trip distance: {abs(wald_statistic_trip)>critical_value}")

Wald's Test Statistic for Trip Distance: 592.1477276449855
Significant Difference between 2 months for trip distance: True


In [None]:
# z test
feb_trip_distance_sample_var =combined[combined['month'] =='feb']['trip_distance'].var()
march_trip_distance_sample_var =combined[combined['month']== 'march']['trip_distance'].var()
z_statistic_trip=(feb_trip_distance_mean-march_trip_distance_mean) /np.sqrt((feb_trip_distance_sample_var/num_feb) +(march_trip_distance_sample_var/num_march))

print(f"Z-Test Statistic for Trip Distance:{z_statistic_trip}")
print(f"Significant Difference between 2 months for trip distance: {abs(z_statistic_trip)>critical_value}")

Z-Test Statistic for Trip Distance:1.7691171508898291
Significant Difference between 2 months for trip distance: False


In [None]:
#t test
t_statistic_trip=(feb_trip_distance_mean-march_trip_distance_mean) /np.sqrt((feb_trip_distance_sample_var/num_feb) +(march_trip_distance_sample_var/num_march))
dof=num_feb+num_march-2
t_critical_value_trip =t.ppf(0.975, dof)
print(f"t-Test Statistic for Trip Distance:{t_statistic_trip}")
print(f"Degrees of Freedom:{dof}")
print(f"Critical Value (t-distribution):{t_critical_value_trip}")
print(f"Significant Difference between 2 months for trip distance: {abs(t_statistic_trip)>t_critical_value_trip}")

t-Test Statistic for Trip Distance:1.7691171508898291
Degrees of Freedom:139741
Critical Value (t-distribution):1.959980960885067
Significant Difference between 2 months for trip distance: False


for fare_amount

In [None]:
#wald
feb_fare_amount_mean=combined[combined['month']=='feb']['fare_amount'].mean()
march_fare_amount_mean= combined[combined['month'] =='march']['fare_amount'].mean()
num_feb=combined[combined['month'] == 'feb']['fare_amount'].count()
num_march=combined[combined['month'] == 'march']['fare_amount'].count()
feb_fare_amount_variance=feb_fare_amount_mean/num_feb
march_fare_amount_variance=march_fare_amount_mean/num_march
wald_statistic_fare= (feb_fare_amount_mean -march_fare_amount_mean)/np.sqrt(feb_fare_amount_variance +march_fare_amount_variance)
print(f"Wald's Test Statistic for fare amount: {wald_statistic_fare}")
print(f"Significant Difference between 2 months for fare amount: {abs(wald_statistic_fare)>critical_value}")

Wald's Test Statistic for fare amount: 847.8777503783994
Significant Difference between 2 months for fare amount: True


In [None]:
# z test
feb_fare_amount_sample_var =combined[combined['month'] =='feb']['fare_amount'].var()
march_fare_amount_sample_var =combined[combined['month']== 'march']['fare_amount'].var()
z_statistic_fare=(feb_fare_amount_mean-march_fare_amount_mean) /np.sqrt((feb_fare_amount_sample_var/num_feb) +(march_fare_amount_sample_var/num_march))
print(f"Z-Test Statistic for fare_amount:{z_statistic_fare}")
print(f"Significant Difference between 2 months for fare_amount: {abs(z_statistic_fare)>critical_value}")

Z-Test Statistic for fare_amount:1.1112076805559827
Significant Difference between 2 months for fare_amount: False


In [None]:
#t test
t_statistic_fare=(feb_fare_amount_mean-march_fare_amount_mean) /np.sqrt((feb_fare_amount_sample_var/num_feb) +(march_fare_amount_sample_var/num_march))
dof=num_feb+num_march-2
t_critical_value_fare =t.ppf(0.975, dof)
print(f"t-Test Statistic for fare_amount:{t_statistic_fare}")
print(f"Degrees of Freedom:{dof}")
print(f"Critical Value (t-distribution):{t_critical_value_fare}")
print(f"Significant Difference between 2 months for fare_amount: {abs(t_statistic_fare)>t_critical_value_fare}")

t-Test Statistic for fare_amount:1.1112076805559827
Degrees of Freedom:139741
Critical Value (t-distribution):1.959980960885067
Significant Difference between 2 months for fare_amount: False


3.4 Performing K-S test and Permutation test

In [None]:
import scipy.stats as stats
def ecdf(d):
    l=len(d)
    x =np.sort(d)
    y= np.arange(1,l+1)/l
    return x,y

def ks_1sample(d,dist,mu=None,n=None,p=None,geo=None):
    x,y_emp=ecdf(d)
    if dist=="poisson":
      y_t=stats.poisson.cdf(x, mu)
    elif dist=="geo":
      y_t=stats.geom.cdf(x, geo)
    elif dist=="binom":
      y_t=stats.binom.cdf(x, n, p)
    return np.max(np.abs(y_emp -y_t))

def ks_2sample(d1, d2):
    x1, y1=ecdf(d1)
    x2,y2=ecdf(d2)
    cat_x=np.unique(np.concatenate((x1,x2)))
    ecdf1 = np.interp(cat_x, x1,y1, left=0,right=1)
    ecdf2 = np.interp(cat_x,x2, y2, left=0,right=1)
    ks_stat = np.max(np.abs(ecdf1- ecdf2))
    return ks_stat

2 sample ks test on trip_duration and fare_amount columns

In [None]:
ks_2 =ks_2sample(combined['trip_duration'],combined['fare_amount'])
print(f"Two-sample KS test statistic: {ks_2}")
print (f"is distribution same for the 2 columns: {ks_2<0.05}")

Two-sample KS test statistic: 0.9745234292746019
is distribution same for the 2 columns: False


Permuatation test on trip_duration and fare_amount columns

In [None]:
def permutation_t(d1,d2, n=1000):
  combined_data=np.concatenate([d1, d2])
  obs_diff=np.abs(np.mean(d1)- np.mean(d2))
  cnt=0
  for _ in range(n):
    np.random.shuffle(combined_data)
    permuted_diff =np.abs(np.mean(combined_data[:len(d1)])- np.mean(combined_data[len(d1):]))
    if permuted_diff>=obs_diff:
      cnt +=1
  p_value = cnt/float(n)
  return p_value

In [None]:
p_val=permutation_t(combined['trip_duration'],combined['fare_amount'])
print (f"p value from permuatation test: {p_val} ")
print (f"are 2 distributions same? - {p_val>=0.05}")

p value from permuatation test: 0.0 
are 2 distributions same? - False


1 sample KS tests

In [None]:
#using poison lambda=mean
lambda_poisson=combined['trip_duration'].mean()
D_one_sample_poisson = ks_1sample(combined['fare_amount'],"poisson",mu=lambda_poisson)
print(f"One-sample KS test statistic against Poisson distribution: {D_one_sample_poisson}")
print (f"are 2 distributions same using 1 sample KS test with poisson dist - {D_one_sample_poisson<0.05}")

One-sample KS test statistic against Poisson distribution: 0.9996636683053892
are 2 distributions same using 1 sample KS test with poisson dist - False


In [None]:
#using geometric p=1/mean
from scipy.stats import geom
trip_mean=combined['trip_duration'].mean()
p_geo=1/trip_mean
D_geom = ks_1sample(combined['fare_amount'], "geo",geo=p_geo)
print(f"One-sample KS test statistic against Geometric distribution: {D_geom}")
print (f"are 2 distributions same using 1 sample KS test with geometric dist - {D_geom<0.05}")

One-sample KS test statistic against Geometric distribution: 0.9766217889402966
are 2 distributions same using 1 sample KS test with geometric dist - False


In [None]:
#using binomial
mean_trip_duration=np.mean(combined['trip_duration'])
var_trip_duration=np.var(combined['trip_duration'])
n_binom = mean_trip_duration**2 /(mean_trip_duration-var_trip_duration)
p_binom = mean_trip_duration/n_binom
n_binom=int(n_binom)
D_binom = ks_1sample(combined['fare_amount'],"binom",n=n_binom,p=p_binom)
print(f"One-sample KS test statistic against Binomial distribution: {D_binom}")
print (f"p_hat and n_hat : {p_binom, n_binom}")
print (f"are 2 distributions same using 1 sample KS test with geometric dist - {D_binom<0.05}")

One-sample KS test statistic against Binomial distribution: nan
p_hat and n_hat : (-35018048.52316385, 0)
are 2 distributions same using 1 sample KS test with geometric dist - False


3.5 Linear Regression

In [None]:
#3.5 Linear Regression
# Define function for Linear Regression
def linear_regression(X, y):
    X = np.c_[np.ones(X.shape[0]), X]  # Add intercept term
    beta = np.linalg.inv(X.T @ X) @ X.T @ y
    return beta[0], beta[1:]

# Define functions for MAPE and MSE
def mape(observed, predicted):
    return np.mean(np.abs((observed - predicted) / observed)) * 100

def sse(observed, predicted):
    return np.sum((observed - predicted)**2)

# Separate regression for total_amount vs trip_duration
X_duration = combined[['trip_duration']]
y_duration = combined['total_amount']
beta0_duration, beta1_duration = linear_regression(X_duration.values, y_duration.values)

# Calculate predicted values
y_pred_duration = beta0_duration + beta1_duration * X_duration.values.flatten()

# Calculate SSE and MAPE
SSE_duration = sse(y_duration.values, y_pred_duration)
MAPE_duration = mape(y_duration.values, y_pred_duration)

print("Part 1: Separate Regression Analysis")
print("Total_Amount vs Trip_Duration:")
print("β0:", beta0_duration)
print("β1:", beta1_duration)
print("SSE:", SSE_duration)
print("MAPE:", MAPE_duration)

# Separate regression for total_amount vs trip_distance
X_distance = combined[['trip_distance']]
y_distance = combined['total_amount']
beta0_distance, beta1_distance = linear_regression(X_distance.values, y_distance.values)

# Calculate predicted values
y_pred_distance = beta0_distance + beta1_distance * X_distance.values.flatten()

# Calculate SSE and MAPE
SSE_distance = sse(y_distance.values, y_pred_distance)
MAPE_distance = mape(y_distance.values, y_pred_distance)

print("\nTotal_Amount vs Trip_Distance:")
print("β0:", beta0_distance)
print("β1:", beta1_distance)
print("SSE:", SSE_distance)
print("MAPE:", MAPE_distance)

# Regression for total_amount vs [trip_duration, trip_distance]
X_combined = combined[['trip_duration', 'trip_distance']]
y_combined = combined['total_amount']
beta0_combined, [beta1_combined, beta2_combined] = linear_regression(X_combined.values, y_combined.values)

# Calculate predicted values
y_pred_combined = beta0_combined + beta1_combined * X_combined['trip_duration'] + beta2_combined * X_combined['trip_distance']

# Calculate SSE and MAPE
SSE_combined = sse(y_combined.values, y_pred_combined)
MAPE_combined = mape(y_combined.values, y_pred_combined)

print("\nPart 2: Regression Analysis with Both Variables")
print("Total_Amount vs [Trip_Duration, Trip_Distance]:")
print("β0:", beta0_combined)
print("β1:", beta1_combined)
print("β2:", beta2_combined)
print("SSE:", SSE_combined)
print("MAPE:", MAPE_combined)

Part 1: Separate Regression Analysis
Total_Amount vs Trip_Duration:
β0: 84.52813262089026
β1: [0.00802702]
SSE: 7975436325832.035
MAPE: 849.5331528579829

Total_Amount vs Trip_Distance:
β0: 16.418578423844878
β1: [1.4156114]
SSE: 7358676853566.975
MAPE: 401.53127167993136

Part 2: Regression Analysis with Both Variables
Total_Amount vs [Trip_Duration, Trip_Distance]:
β0: 10.939284545942996
β1: 0.005132881350845376
β2: 1.0914117502665055
SSE: 6671989348351.843
MAPE: 340.630346856321


Overall, considering both Trip_Duration and Trip_Distance together in the regression analysis leads to a different regression fit with lower intercept, adjusted coefficients, and improved model performance in terms of SSE and MAPE compared to separate analyses. This indicates that the joint effect of both variables provides a more comprehensive understanding of their relationship with Total_Amount.

3.6 Time Series Analysis

In [None]:
combined['lpep_pickup_datetime'] = pd.to_datetime(combined['lpep_pickup_datetime'])
combined.set_index('lpep_pickup_datetime', inplace=True)

# Calculate median trip amount for each day
daily_median = combined['total_amount'].resample('D').median()

# Define training data (from 1st date of the first month until the second last date of the second month)
train_data = daily_median[:-1]

# Define observed data (median trip amount of the last day)
observed_data = daily_median[-1:]



In [None]:
def linear_regression(X, y):
    X = np.c_[np.ones(X.shape[0]), X]  # Add intercept term
    beta = np.linalg.inv(X.T @ X) @ X.T @ y
    return beta[0], beta[1:]

# AR(2) prediction
def ar2_predict(data):
    n = len(data)
    if n < 3:
        return None

    # Prepare training data
    y = data[2:]      # Target values starting from the 3rd value
    y_1 = data[1:-1]  # 2nd to second last value
    y_2 = data[:-2]   # 1st to third last value

    # Prepare feature matrix
    X = np.column_stack((y_1, y_2))

    # Perform linear regression
    beta0, betas = linear_regression(X, y)
    beta1, beta2 = betas

    # Use coefficients to predict next value
    y_1_latest = data[-1]  # Last value in training data
    y_2_latest = data[-2]  # Second last value in training data
    return beta0 + beta1 * y_1_latest + beta2 * y_2_latest

# AR(3) prediction
def ar3_predict(data):
    n = len(data)
    if n < 4:
        return None

    # Prepare training data
    y = data[3:]      # Target values starting from the 4th value
    y_1 = data[2:-1]  # 3rd to second last value
    y_2 = data[1:-2]  # 2nd to third last value
    y_3 = data[:-3]   # 1st to fourth last value

    # Prepare feature matrix
    X = np.column_stack((y_1, y_2, y_3))

    # Perform linear regression
    beta0, betas = linear_regression(X, y)
    beta1, beta2, beta3 = betas

    # Use coefficients to predict next value
    y_1_latest = data[-1]  # Last value in training data
    y_2_latest = data[-2]  # Second last value in training data
    y_3_latest = data[-3]  # Third last value in training data
    return beta0 + beta1 * y_1_latest + beta2 * y_2_latest + beta3 * y_3_latest

# Perform predictions
ar2_prediction = ar2_predict(train_data.values)
ar3_prediction = ar3_predict(train_data.values)

In [None]:
alpha = 0.5
ewma_prediction = [train_data[0]]
for i in range(1, len(train_data)):
    prediction = alpha * train_data[i-1] + (1 - alpha) * ewma_prediction[i-1]
    ewma_prediction.append(prediction)

window = 7
def sma_predict(data, window):
    n = len(data)
    if n < window:
        return None
    return np.mean(data[-window:])
sma_prediction = sma_predict(train_data.values, window=7)

# Calculate MAPE for each prediction
mape_ar2 = mape(observed_data.values, ar2_prediction)
mape_ar3 = mape(observed_data.values, ar3_prediction)
mape_ewma = mape(observed_data.values, ewma_prediction)
mape_sma = mape(observed_data.values, sma_prediction)

# Print predictions and MAPE
print("\nTime Series Analysis")
print("AR(2) Prediction:", ar2_prediction)
print("AR(3) Prediction:", ar3_prediction)
print("Last EWMA prediction:", ewma_prediction[-1])
print("SMA Prediction:", sma_prediction)
print("MAPE for AR(2):", mape_ar2)
print("MAPE for AR(3):", mape_ar3)
print("MAPE for EWMA:", mape_ewma)
print("MAPE for SMA:", mape_sma)

# Print actual value for comparison
print(f"Actual Median Total Amount on Last Day: {observed_data.values[0]:.5f}")


Time Series Analysis
AR(2) Prediction: 12.868650487402292
AR(3) Prediction: 12.97692283243535
Last EWMA prediction: 12.706844450953376
SMA Prediction: 12.83
MAPE for AR(2): 4.036909117059721
MAPE for AR(3): 3.229509079527599
MAPE for EWMA: 3.560051281911784
MAPE for SMA: 4.325130499627145
Actual Median Total Amount on Last Day: 13.41000


3.7 Chisquare

In [None]:
#3.7 Chisquare
# Assuming 'combined' is the DataFrame used in your notebook

# Categorize 'tip_amount' and 'passenger_count' into 'high' and 'low' based on median values
median_tip = combined['tip_amount'].median()
median_passenger = combined['passenger_count'].median()

combined['tip_category'] = np.where(combined['tip_amount'] > median_tip, 'high', 'low')
combined['passenger_category'] = np.where(combined['passenger_count'] > median_passenger, 'high', 'low')

# Create a contingency table
contingency_table = pd.crosstab(combined['tip_category'], combined['passenger_category'])

# Convert contingency table to numpy array for calculations
observed = contingency_table.values

# Calculate the expected frequencies
row_totals = observed.sum(axis=1, keepdims=True)
col_totals = observed.sum(axis=0, keepdims=True)
total = observed.sum()

expected = row_totals @ col_totals / total

# Calculate the Chi-Square statistic
chi2_statistic = ((observed - expected) ** 2 / expected).sum()

# Degrees of freedom
dof = (observed.shape[0] - 1) * (observed.shape[1] - 1)

# Approximate the p-value using a numerical approximation for the Chi-Square CDF
def chi2_cdf(x, k):
    # Using the regularized gamma function approximation
    from math import exp, gamma
    if x < 0 or k <= 0:
        return 0.0
    g_k = gamma(k / 2)
    return 1 - (exp(-x / 2) * (x / 2)**(k / 2 - 1)) / g_k

p_value = 1 - chi2_cdf(chi2_statistic, dof)

# Print results
print(f'Chi-Square Statistic: {chi2_statistic}')
print(f'p-value: {p_value}')
print('Contingency Table:')
print(contingency_table)


Chi-Square Statistic: 2002.6242027826795
p-value: 0.0
Contingency Table:
passenger_category   high    low
tip_category                    
high                13425  56344
low                  7488  62486


3.8: Bayesian Test

In [None]:
# Section 3.8: Bayesian Test

# Filter the data for the first month where 0 < trip_distance <= 10
df_first_month = combined[(combined['trip_distance'] > 0) & (combined['trip_distance'] <= 10) & (combined['month'] == 'feb')]

# Define parameters for the prior distribution
m_prior = 2.25
sd_prior = 1.95
sigma = 2
n_samples = 30

# Ensure the DataFrame has enough rows to sample from
if df_first_month.shape[0] < n_samples:
    raise ValueError("Not enough data points in the first month to sample 30 points per sample.")

# Create 30 samples of 30 points each
samples = [df_first_month['trip_distance'].sample(n=n_samples, random_state=i) for i in range(1, 31)]

# Calculate the posterior mean and standard deviation for each sample
posterior_means = []
posterior_sds = []

for sample in samples:
    X_bar = sample.mean()
    n = len(sample)
    posterior_mean = (sd_prior**2 * X_bar + sigma**2 * m_prior) / (sd_prior**2 + sigma**2 / n)
    posterior_sd = np.sqrt((sd_prior**2 * sigma**2) / (sd_prior**2 + sigma**2 / n))
    posterior_means.append(posterior_mean)
    posterior_sds.append(posterior_sd)

# Comment on the obtained mean and standard deviation as n tends towards infinity
print(f"As n tends towards infinity, the posterior mean approaches the true mean of the population, and the posterior standard deviation decreases.")

# Calculate the overall posterior mean and standard deviation
overall_posterior_mean = np.mean(posterior_means)
overall_posterior_sd = np.mean(posterior_sds)

print(f'Overall Posterior Mean for first month: {overall_posterior_mean}')
print(f'Overall Posterior SD for first month: {overall_posterior_sd}')

# Repeat the same for the second month and combine with the first month posterior
df_second_month = combined[(combined['trip_distance'] > 0) & (combined['trip_distance'] <= 10) & (combined['month'] == 'march')]

# Ensure the DataFrame has enough rows to sample from
if df_second_month.shape[0] < n_samples:
    raise ValueError("Not enough data points in the second month to sample 30 points per sample.")

samples_second_month = [df_second_month['trip_distance'].sample(n=n_samples, random_state=i) for i in range(31, 61)]

posterior_means_second = []
posterior_sds_second = []

for sample in samples_second_month:
    X_bar = sample.mean()
    n = len(sample)
    posterior_mean = (overall_posterior_sd**2 * X_bar + sigma**2 * overall_posterior_mean) / (overall_posterior_sd**2 + sigma**2 / n)
    posterior_sd = np.sqrt((overall_posterior_sd**2 * sigma**2) / (overall_posterior_sd**2 + sigma**2 / n))
    posterior_means_second.append(posterior_mean)
    posterior_sds_second.append(posterior_sd)

# Calculate the overall posterior mean and standard deviation for the second month
overall_posterior_mean_second = np.mean(posterior_means_second)
overall_posterior_sd_second = np.mean(posterior_sds_second)

print(f'Overall Posterior Mean after second month: {overall_posterior_mean_second}')
print(f'Overall Posterior SD after second month: {overall_posterior_sd_second}')

As n tends towards infinity, the posterior mean approaches the true mean of the population, and the posterior standard deviation decreases.
Overall Posterior Mean for first month: 4.819749947067543
Overall Posterior SD for first month: 1.9658313526471707
Overall Posterior Mean after second month: 7.344453327025062
Overall Posterior SD after second month: 1.9663657260577097
