In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Load the NYC Ferry ridership export into the working DataFrame.
df = pd.read_csv('/content/NYC_Ferry_Ridership_20250325.csv')

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Print column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for the columns pandas selects by default (numeric ones, when any exist).
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove every row that has at least one missing value. Rebinding the name
# (instead of inplace=True) keeps the step chainable.
df = df.dropna()

In [None]:
# Count rows that exactly repeat an earlier row (inspection only; no rows are removed here).
df.duplicated().sum()

In [None]:
# Drop the 'Stop' column. errors='ignore' makes the cell safe to re-run:
# once the column is gone, a second run is a no-op instead of a KeyError.
df = df.drop(columns='Stop', errors='ignore')

In [None]:
# List the distinct route codes present in the data.
df['Route'].unique()

In [None]:
# Number of rows recorded for each route code, most frequent first.
df['Route'].value_counts()

In [None]:
# Keep every row whose route code is not 'RR'. The explicit .copy() makes the
# result an independent frame, so the column assignments in later cells do not
# write into a slice of the unfiltered data (SettingWithCopyWarning).
df = df[df['Route'] != 'RR'].copy()

In [None]:
# Route code -> full route name.
Route_mapping = dict(
    SB='South Brooklyn',
    AS='Astoria',
    ER='East River',
    SV='Soundview',
    SG='St. George',
    RW='Rockaway',
    GI='Governors Island',
)

In [None]:
# Swap route codes for their full names. replace() leaves any value without an
# entry in Route_mapping untouched (map() would turn it into NaN), which also
# makes the cell safe to re-run on already-renamed data.
df['Route'] = df['Route'].replace(Route_mapping)

In [None]:
# Direction code -> integer label used in the rest of the notebook.
Direction_mapping = {
    'NB': 0,
    'SB': 1,
}

In [None]:
# Replace the direction codes with the integers from Direction_mapping.
# Values that are not keys of the mapping become NaN, so running this cell a
# second time (on the already-encoded column) would blank the column out.
df['Direction']=df['Direction'].map(Direction_mapping)

In [None]:
# Parse the Date column into pandas datetimes; the .dt accessors used in later cells rely on this.
df['Date']=pd.to_datetime(df['Date'])

In [None]:
# Preview the first five rows after the cleaning and recoding steps.
df.head(n=5)

In [None]:
# Monday-Friday records only (weekday numbers 0 through 4).
df_weekdays = df[df['Date'].dt.weekday < 5]

# Total boardings per route for every calendar date.
df_route_weekday = (
    df_weekdays
    .groupby(['Route', 'Date'])['Boardings']
    .sum()
    .reset_index()
)

# Mean of those daily totals per route, then ordered from busiest to quietest.
avg_daily_boardings_weekday = df_route_weekday.groupby('Route')['Boardings'].mean()
avg_daily_boardings_weekday_sorted = avg_daily_boardings_weekday.sort_values(ascending=False)

# Horizontal bar chart: one bar per route.
plt.figure(figsize=(10, 6))
sns.barplot(
    x=avg_daily_boardings_weekday_sorted.values,
    y=avg_daily_boardings_weekday_sorted.index,
    color='lightcoral',
)

plt.title('Average Daily Boardings for Each Route on Weekdays')
plt.xlabel('Average Daily Boardings')
plt.show()

In [None]:
# Number of weekday records per (date, route) pair.
route_daily_operations = (
    df_weekdays
    .groupby(['Date', 'Route'])
    .size()
    .reset_index(name='Operation Count')
)

# Average that daily count per route, largest first. The column is renamed by
# name (not by position) so the step keeps working if the column order changes.
avg_route_operations = (
    route_daily_operations
    .groupby('Route')['Operation Count']
    .mean()
    .reset_index()
    .sort_values(by='Operation Count', ascending=False)
    .rename(columns={'Operation Count': 'Avg Operations Per Day'})
)

# Horizontal bar chart: one bar per route.
plt.figure(figsize=(10, 5))
sns.barplot(data=avg_route_operations, y='Route', x='Avg Operations Per Day', color='lightblue')

plt.title('Average Daily Operations per Route on Weekdays')
plt.ylabel('Route')
plt.xlabel('Average Operations Per Day')
plt.show()

In [None]:
from matplotlib.ticker import FuncFormatter
def format_func(value, tick_number):
    """Format an axis tick value, abbreviating thousands with a 'K' suffix.

    Written for use with matplotlib's FuncFormatter, which calls it with the
    tick value and the tick position.

    Parameters
    ----------
    value : int or float
        The tick value to format.
    tick_number : int
        Tick position supplied by the formatter; not used.

    Returns
    -------
    str
        ``'<n>K'`` for values of 1000 or more (at most one decimal, trailing
        zeros removed, so 2500 -> '2.5K' and 3000 -> '3K'); otherwise the
        integer part of ``value`` as text.
    """
    if value >= 1000:
        # One decimal keeps ticks such as 2500 from being labelled '2K'.
        thousands = f'{value / 1000:.1f}'.rstrip('0').rstrip('.')
        return f'{thousands}K'
    # Always hand back text: tick labels are strings.
    return str(int(value))

# Install format_func as the y-axis tick formatter on the current axes.
# NOTE(review): this cell creates no plot of its own, so plt.gca() presumably
# acts on a fresh, empty axes here; the hourly-boardings cell sets the
# formatter itself. Confirm this call is still needed.
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_func))

In [None]:
# Total boardings per hour, one column per encoded direction.
# After unstack() the columns are ordered 0, 1, i.e. NB then SB.
hourly_data = df.groupby(['Hour', 'Direction'])['Boardings'].sum().unstack()

# DataFrame.plot opens its own figure, so the size is passed to it directly
# (a preceding plt.figure() call would only leave an empty figure behind).
ax = hourly_data.plot(
    kind='bar',
    stacked=False,
    width=0.8,
    colormap='coolwarm',
    figsize=(12, 6),
)

# Labels and title
ax.set_xlabel("Hour of the Day")
ax.set_ylabel("Total Boardings")
ax.set_title("Hourly Boardings Based on Direction")
ax.tick_params(axis='x', rotation=0)
ax.yaxis.set_major_formatter(FuncFormatter(format_func))

# Legend entries follow the column order; Direction was encoded as NB -> 0, SB -> 1.
direction_labels = {0: "Northbound", 1: "Southbound"}
ax.legend(
    [direction_labels.get(code, str(code)) for code in hourly_data.columns],
    title="Direction",
)
plt.show()

In [None]:
# Mean boardings per record for every (route, hour) pair on weekdays.
avg_hourly_boardings = (
    df_weekdays
    .groupby(['Route', 'Hour'])['Boardings']
    .mean()
    .reset_index()
)
# Combined "<hour> - <route>" text label for each row.
avg_hourly_boardings['Hour-Route'] = (
    avg_hourly_boardings['Hour'].astype(str) + ' - ' + avg_hourly_boardings['Route']
)

# Grouped bars: hours along the x-axis, one coloured bar per route.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=avg_hourly_boardings,
    x='Hour',
    y='Boardings',
    hue='Route',
    dodge=True,
)
plt.title('Average Hourly Boardings per Route on Weekdays')
plt.xlabel('Hour of the Day')
plt.ylabel('Average Boardings')
plt.xticks(rotation=0)  # hour labels stay horizontal
# Legend is anchored just outside the right edge of the axes.
plt.legend(title="Route", bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# Number of weekday records for every (hour, route, date) combination.
operations_per_hour_per_route = (
    df_weekdays
    .groupby(['Hour', 'Route', 'Date'])
    .size()
    .reset_index(name='Operation Count')
)

# Average of those counts across dates, per (hour, route).
avg_operations_per_hour_per_route = (
    operations_per_hour_per_route
    .groupby(['Hour', 'Route'])['Operation Count']
    .mean()
    .reset_index()
)

# Grouped bars: hours along the x-axis, one coloured bar per route.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=avg_operations_per_hour_per_route,
    x='Hour',
    y='Operation Count',
    hue='Route',
    dodge=True,
)

plt.title('Average Number of Operations Per Hour Per Route Per Day (Weekdays)')
plt.xlabel('Hour of the Day')
plt.ylabel('Average Operation Count')
plt.xticks(rotation=0)  # hour labels stay horizontal
# Legend is anchored just outside the right edge of the axes.
plt.legend(title="Route", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()

In [None]:
# Calendar year and month of every record.
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month

# Total boardings for each (year, month). groupby returns its keys in sorted
# order, so the rows are already chronological and need no extra sort.
monthly_boardings = df.groupby(['year', 'month'])['Boardings'].sum().reset_index()

# Timestamp of the first day of each month, used as the x-axis position.
monthly_boardings['date'] = pd.to_datetime(monthly_boardings[['year', 'month']].assign(day=1))

# Line chart with one point per month.
plt.figure(figsize=(12, 6))
plt.plot(monthly_boardings['date'], monthly_boardings['Boardings'], marker='o', linestyle='-')
plt.title('Monthly Total Boardings')
plt.xlabel('Date')
plt.ylabel('Total Boardings')
plt.grid(True)
plt.show()

In [None]:
# Derive calendar features (year, month, day of month) from the parsed Date column.
for feature_name, date_attr in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    df[feature_name] = getattr(df['Date'].dt, date_attr)

In [None]:
# Encode the categorical columns (Route, TypeDay) as integer codes.
# Each column gets its own encoder so both fitted `classes_` arrays remain
# available for decoding; refitting one shared encoder would keep only the
# classes of the last column.
route_encoder = LabelEncoder()
typeday_encoder = LabelEncoder()
df['Route'] = route_encoder.fit_transform(df['Route'])
df['TypeDay'] = typeday_encoder.fit_transform(df['TypeDay'])

# `label_enc` stays available as an alias of the TypeDay encoder for any code
# that still refers to that name.
label_enc = typeday_encoder

# Define features (X) and target variable (y)
X = df[['Year', 'Month', 'Day', 'Hour', 'Route', 'Direction', 'TypeDay']]
y = df['Boardings']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Disabled alternative kept for reference: random forest regressor with 300 trees.
#model = RandomForestRegressor(n_estimators=300, random_state=42)
#model.fit(X_train, y_train)

In [None]:
from xgboost import XGBRegressor

In [None]:
# Gradient-boosted regressor; the hyperparameters are spelled out one per line.
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=6,
    random_state=42,
)
# Fit on the training split.
xgb_model.fit(X_train, y_train)

In [None]:
# Predict boardings for the held-out test features.
y_pred = xgb_model.predict(X_test)

In [None]:
# Error metrics on the test split: mean absolute error, mean squared error
# and its square root.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

for metric_label, metric_value in (("MAE", mae), ("RMSE", rmse)):
    print(f"{metric_label}: {metric_value}")

In [None]:
# Importance score reported by the fitted model for each input feature.
importances = xgb_model.feature_importances_
feature_names = X.columns

# One row per feature, ordered from least to most important.
importances_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=True)
)

# Horizontal bar chart. The palette is tied to an explicit hue variable because
# passing `palette` without `hue` is deprecated in recent seaborn releases;
# dodge=False keeps one full-width bar per feature.
plt.figure(figsize=(8, 5))
ax = sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    dodge=False,
    data=importances_df,
    palette="pastel",
)
# The hue duplicates the y-axis, so drop the legend if one was drawn.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.title("Feature Importance in Predicting Boardings")
plt.show()