In [None]:
# Fetch the "Uber Pickups in New York City" dataset through kagglehub and keep the
# value returned by dataset_download in a module-level variable.
# NOTE(review): presumably the returned value is the local directory holding the
# CSV files — confirm against the kagglehub documentation.
import kagglehub
organizations_fivethirtyeight_uber_pickups_in_new_york_city_path = kagglehub.dataset_download('organizations/fivethirtyeight/uber-pickups-in-new-york-city')

print('Data source import complete.')


This notebook contains a basic analysis through some visualizations of the Uber Pickups in New York City data set.

The analysis is broken up into 3 sections:
- Data Loading and Preparation.
- Exploration and visualization of pickups from April to September 2014.
- Conclusion.

# 1. Data Loading and Preparation

### 1.1 Loading Modules

In [None]:
import pandas as pd
import numpy as np

#Visualization modules
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline
from matplotlib import cm #Colormap

### 1.2 Loading Data

In [None]:
#Load the datasets

# Prefer the directory returned by kagglehub in the first cell so the notebook is
# not tied to the Kaggle mount point; fall back to the Kaggle input directory when
# the monthly files are not found there.
from pathlib import Path

_download_dir = Path(organizations_fivethirtyeight_uber_pickups_in_new_york_city_path)
_kaggle_dir = Path("/kaggle/input/uber-pickups-in-new-york-city")
DATA_DIR = _download_dir if (_download_dir / "uber-raw-data-apr14.csv").is_file() else _kaggle_dir

# One raw pickup file per month, April to September 2014.
df_apr14 = pd.read_csv(DATA_DIR / "uber-raw-data-apr14.csv")
df_may14 = pd.read_csv(DATA_DIR / "uber-raw-data-may14.csv")
df_jun14 = pd.read_csv(DATA_DIR / "uber-raw-data-jun14.csv")
df_jul14 = pd.read_csv(DATA_DIR / "uber-raw-data-jul14.csv")
df_aug14 = pd.read_csv(DATA_DIR / "uber-raw-data-aug14.csv")
df_sep14 = pd.read_csv(DATA_DIR / "uber-raw-data-sep14.csv")

In [None]:
#Merge the dataframes into one
df = pd.concat([df_apr14, df_may14, df_jun14, df_jul14, df_aug14, df_sep14], ignore_index=True)

### 1.3 Data Preparation

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Give the timestamp column an identifier-friendly name, then parse it.
df = df.rename(columns={'Date/Time': 'Date_time'})
df['Date_time'] = pd.to_datetime(df['Date_time'])

# Derive the calendar parts used by the groupings further down
# (same columns, added in the same order).
pickup_time = df['Date_time'].dt
for column_name, column_values in (
    ('Month', pickup_time.month_name()),
    ('Weekday', pickup_time.day_name()),
    ('Day', pickup_time.day),
    ('Hour', pickup_time.hour),
    ('Minute', pickup_time.minute),
):
    df[column_name] = column_values


In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe(include = 'all')

# 2 Exploration and Visualization

Through our exploration we are going to visualize and analyse:
- The number of trips by hour
- The number of trips by month
- The number of trips by weekday
- The number of trips by day


- The number of trips by hour and month
- The number of trips by weekday and hour
- The number of trips by weekday and month



### 2.1 Trips by hour

In [None]:
# Count the non-null entries of every column for each hour of the day.
df_hour_grouped = df.groupby(['Hour']).count()

# Keep the first column's counts as the number of trips, indexed by hour.
df_hour = pd.DataFrame(
    data=df_hour_grouped.values[:, 0],
    index=df_hour_grouped.index,
    columns=['Number_of_trips'],
)

df_hour.head()

#### Plotting the results

In [None]:
df_hour.plot(kind='bar', figsize=(8,6))

plt.ylabel('Number of Trips')
plt.title('Trips by Hour')

plt.show()

#### Analysing the results

In [None]:
#The highest number of trips by hour
# Look the peak up from the data instead of comparing against a hard-coded count,
# which raises IndexError as soon as the data differs from the one it was copied from.
max_Number_of_trips_hour = df_hour['Number_of_trips'].max()
max_hour = df_hour['Number_of_trips'].idxmax()  # first hour reaching the maximum

print('The highest number of trips by hour is {} trip, that corresponds to the peak hour {}:00.'.format(max_Number_of_trips_hour, max_hour))


We observe that the number of trips is highest between 16:00 and 18:00, with a spike at 17:00. This matches the end of a working day in the United States (around 16:30), the time when workers go home.

We can say that the majority of Uber's clients are workers.

### 2.2 Trips by month

In [None]:
#Grouping by Month
df_month_grouped = df.groupby(['Month'], sort=False).count()

#Creating the sub dataframe
df_month = pd.DataFrame({'Number_of_trips':df_month_grouped.values[:,0]}, index = df_month_grouped.index)

df_month

#### Plotting the results

In [None]:
df_month.plot(kind='bar', figsize=(8,6))

plt.ylabel('Number of Trips')
plt.title('Trips by Month')

plt.show()

#### Analysing the results

We observe that the number of trips increases each month, with a peak increase between August and September.

In [None]:
number_of_trips_aug = df_month.loc['August'].values
number_of_trips_sep = df_month.loc['September'].values

ratio_month = (((number_of_trips_sep - number_of_trips_aug) / number_of_trips_aug) * 100)[0]
ratio_month = round(ratio_month)

print('The ratio of the increase from August to September is {} %.'.format(ratio_month))


From our results we can say that from April to September 2014, Uber was in a continuous improvement process.


### 2.3 Trips by weekday

In [None]:
#Grouping by Weekday
df_weekday_grouped = df.groupby(['Weekday'], sort = False).count()

#Creating the grouped DataFrame
df_weekday = pd.DataFrame({'Number_of_trips':df_weekday_grouped.values[:,0]}, index = df_weekday_grouped.index)

df_weekday

#### Plotting the results

In [None]:
df_weekday.plot(kind='bar', figsize=(8,6))

plt.ylabel('Number of Trips')
plt.title('Trips by Weekday')

plt.show()

#### Analysing the results

In [None]:
# Trip counts per weekday, as a Series for convenient lookups
weekday_trips = df_weekday['Number_of_trips']

# Smallest weekday total ...
min_number_of_trips_weekday = weekday_trips.min()

# ... and the (first) weekday on which it occurs
min_weekday = weekday_trips.idxmin()

print('The lowest number of trips by weekday is {} trip, that corresponds to {}.'.format(min_number_of_trips_weekday, min_weekday))


In [None]:
# Work with plain scalars throughout instead of mixing one-element arrays and
# scalars (the original indexed [0] at different stages for the two means).
weekend_days = ['Saturday', 'Sunday']
working_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']

#Getting the mean number of trips in the weekend - Non working day
mean_number_of_trips_weekend = df_weekday.loc[weekend_days, 'Number_of_trips'].mean()

#Getting the mean number of trips for the rest of the week- Working day
mean_number_of_trips_workday = df_weekday.loc[working_days, 'Number_of_trips'].mean()

# Relative difference of the workday mean over the weekend mean, in percent
ratio_weekday = (mean_number_of_trips_workday - mean_number_of_trips_weekend) / mean_number_of_trips_weekend * 100
ratio_weekday = round(ratio_weekday, 1)

print('The mean number of trips during working days is {}% higher than the mean number of trips during weekends.'.format(ratio_weekday))

As the difference between workdays and weekends is only 19.6%, and because of the low number of trips on Monday, it cannot be said that people use Uber on workdays more than on weekends.

We need to investigate further to find out why the number of trips on Mondays is so low.

### 2.4 Trips by day

In [None]:
#Grouping by Day
df_day_grouped = df.groupby(['Day']).count()

#Creating the grouped DataFrame
df_day = pd.DataFrame({'Number_of_trips':df_day_grouped.values[:,0]}, index = df_day_grouped.index)

df_day.head()

#### Plotting the results

In [None]:
df_day.plot(kind='bar', figsize=(10,8))

plt.ylabel('Number of Trips')
plt.title('Trips by Day')

plt.show()

#### Analysing the results

The number of trips for day 31 is much lower than for the other days because April, June and September have only 30 days.

The day with the highest number of trips is the 30th. There's not much variation from day to day.

### 2.5 Trips by hour and month

In [None]:
#Grouping by Hour and Month
df_hour_month_grouped = df.groupby(['Hour','Month']).count()

#Creating the grouped DataFrame
df_hour_month = pd.DataFrame({'Number_of_trips':df_hour_month_grouped.values[:,1]}, index = df_hour_month_grouped.index)

df_hour_month.head(10)

In [None]:
#Reseting the Index
df_hour_month.reset_index(inplace= True)
df_hour_month.head()

In [None]:
#Preparing the Number of trips data
#We create a Numpy array that includes the Number of trips data then reshape it to fit our
data_hour_month = df_hour_month['Number_of_trips'].values.reshape(24,6)
data_hour_month

In [None]:
# The (Hour, Month) groupby sorts its keys, so inside each hour the six counts are
# ordered by month name alphabetically, not chronologically. Label the reshaped
# columns with the order actually present in the grouped frame ...
grouped_month_order = df_hour_month['Month'].unique()
df_hour_month = pd.DataFrame(data = data_hour_month, index = df_hour_month['Hour'].unique(), columns = grouped_month_order)

# ... and only then put the columns into the order in which the months appear in df.
df_hour_month = df_hour_month[list(df['Month'].unique())]
df_hour_month.head()

#### Plotting the results

In [None]:
df_hour_month.plot(kind='bar', figsize=(8,6), stacked=True)

plt.xlabel('Hour')
plt.ylabel('Number of Trips')
plt.title('Trips by Hour and Month')

plt.show()

In [None]:
df_hour_month.plot(kind='bar', figsize=(25,6),width=0.8)

plt.xlabel('Hour')
plt.ylabel('Number of Trips')
plt.title('Trips by Hour and Month')

plt.show()

### 2.6 Trips by weekday and hour

In [None]:
#Grouping by Hour and weekday
df_weekday_hour_grouped = df.groupby(['Weekday','Hour'], sort = False).count()

#Creating the grouped DataFrame
df_weekday_hour = pd.DataFrame({'Number_of_trips':df_weekday_hour_grouped.values[:,1]}, index = df_weekday_hour_grouped.index)

df_weekday_hour

In [None]:
#Resetting the Index
df_weekday_hour.reset_index(inplace= True)

#Preparing the Number of trips data
# Pivot by label instead of reshaping the flat count vector: the groups come from an
# unsorted groupby, so their order is "first appearance in the data" and a plain
# reshape(7, 24) can put counts under the wrong weekday/hour.
weekday_order = df_weekday_hour['Weekday'].unique()
df_weekday_hour = (
    df_weekday_hour
    .pivot(index='Weekday', columns='Hour', values='Number_of_trips')
    .reindex(index=weekday_order)       # keep weekdays in order of first appearance
    .sort_index(axis=1)                 # hours ascending
    .fillna(0)                          # a weekday/hour pair without pickups counts as 0
    .astype(int)
    .rename_axis(index=None, columns=None)
)
data_weekday_hour = df_weekday_hour.to_numpy()
df_weekday_hour.head()

#### Plotting the results

In [None]:
df_weekday_hour.plot(kind='bar', figsize=(20,6), width = 0.7)

plt.xlabel('Weekday')
plt.ylabel('Number of Trips')
plt.title('Trips by Hour and Weekday')

plt.show()

#### Analysing the results

We see that on working days there is a pulse at 7:00 and 8:00, which corresponds to the time when employees go to work. This pulse is not present on weekend days.

At the same time we see that on weekend days the number of trips around midnight, 1:00 and 2:00 is higher than on working days.

### 2.7 Trips by weekday and month


In [None]:
#Grouping by Weekday and Month
df_month_weekday_grouped = df.groupby(['Month','Weekday'], sort=False).count()

#Creating the grouped DataFrame
df_month_weekday = pd.DataFrame({'Number_of_trips':df_month_weekday_grouped.values[:,1]}, index = df_month_weekday_grouped.index)

df_month_weekday.head(10)

In [None]:
#Resetting the Index
df_month_weekday.reset_index(inplace= True)

#Preparing the Number of trips
# Pivot by label instead of reshaping: with an unsorted groupby the weekdays inside
# each month come in order of first appearance, which differs from month to month,
# so reshape(6, 7) followed by a single weekday header mislabels the counts.
month_order = df_month_weekday['Month'].unique()
weekday_order = df['Weekday'].unique()
df_month_weekday = (
    df_month_weekday
    .pivot(index='Month', columns='Weekday', values='Number_of_trips')
    .reindex(index=month_order, columns=weekday_order)
    .fillna(0)                          # a month/weekday pair without pickups counts as 0
    .astype(int)
    .rename_axis(index=None, columns=None)
)
data_month_weekday = df_month_weekday.to_numpy()
df_month_weekday.head()

#### Plotting the results

In [None]:
df_month_weekday.plot(kind='bar', figsize=(8,6), stacked = True)

plt.xlabel('Month')
plt.ylabel('Number of Trips')
plt.title('Trips by Month and Weekday')

plt.show()

In [None]:
df_month_weekday.plot(kind='bar', figsize=(18,6), width = 0.6)

plt.xlabel('Month')
plt.ylabel('Number of Trips')
plt.title('Trips by Month and Weekday')

plt.show()

# 3. Heatmap

Through our exploration we are going to visualize:
- Heatmap by Hour and Day.
- Heatmap by Hour and Weekday.
- Heatmap by Month and Day.
- Heatmap by Month and Weekday.

In [None]:
#Defining a function that counts the number of rows
def count_rows(rows):
    """Return the length of ``rows`` as reported by ``len``."""
    n_rows = len(rows)
    return n_rows

### 3.1 Heatmap by Hour and Day

In [None]:
#Creating the hour and day dataframe
# size() counts the rows of each (Hour, Day) group directly; it replaces a Python-level
# apply(len) per group. unstack() then spreads Day across the columns.
df_hour_day = df.groupby('Hour Day'.split()).size().unstack()
df_hour_day.head()

In [None]:
plt.figure(figsize = (12,8))

#Using the seaborn heatmap function
ax = sns.heatmap(df_hour_day, cmap=cm.YlGnBu, linewidth = .5)
ax.set(title="Trips by Hour and Day");

#### Analysing the results

We see that the number of trips increases throughout the day, with peak demand in the evening between 16:00 and 18:00.

It corresponds to the time where employees finish their work and go home.

### 3.2 Heatmap by Hour and Weekday

In [None]:
# Rows per (Hour, Weekday) group via size() instead of apply(len); Weekday goes to the columns.
df_hour_weekday = df.groupby('Hour Weekday'.split(), sort = False).size().unstack()
df_hour_weekday.head()

In [None]:
plt.figure(figsize = (12,8))

ax = sns.heatmap(df_hour_weekday, cmap=cm.YlGnBu, linewidth = .5)
ax.set(title="Trips by Hour and Weekday");

#### Analysing the results

We can see that on working days (From Monday to Friday) the number of trips is higher from 16:00 to 21:00. It shows even better what we said from the first heatmap.

On Friday the number of trips remains high until 23:00 and continues into early Saturday. This corresponds to the time when people leave work and then go out for dinner or drinks before the weekend.

We can notice the same pattern on Saturday: people tend to go out at night, and the number of trips remains high until early Sunday.

### 3.3 Heatmap by Day and Month

In [None]:
# Rows per (Day, Month) group via size() instead of apply(len); Month goes to the columns.
df_day_month = df.groupby('Day Month'.split(), sort = False).size().unstack()
df_day_month.head()

In [None]:
plt.figure(figsize = (12,8))

ax = sns.heatmap(df_day_month, cmap = cm.YlGnBu, linewidth = .5)
ax.set(title="Trips by Day and Month");

#### Analysing the results

We observe that the number of trips increases each month, we can say that from April to September 2014, Uber was in a continuous improvement process.

We can notice a dark spot in the visualization; it corresponds to 30 April. The number of trips that day was extreme compared to the rest of the month.

Unfortunately we have not been able to find any factual information to explain the pulse. It can be assumed that a successful marketing campaign was in place that day. So as the analysis goes on, we consider that day an outlier.

In [None]:
# Trips per day of month for April (day 31 has no value for this month)
april_trips = df_day_month['April']

#The number of trips the 30th of April
# Read the value by its day label rather than via builtin max(), which is not
# reliable on a Series that contains NaN and does not guarantee it is the 30th.
max_april = april_trips.loc[30]

#The mean number of trips the rest of April
# Drop the 30th by label; mean() skips the missing 31st, so this averages days 1-29
# without depending on the row order of the table.
mean_rest_april = april_trips.drop(index=30).mean()

ratio_april = round(max_april / mean_rest_april)
print('The number of trips the 30th of April is {} times higher than the mean number of trips during the rest of the month'.format(ratio_april))

### 3.4 Heatmap by Month and Weekday

In [None]:
# Rows per (Month, Weekday) group via size() instead of apply(len); Weekday goes to the columns.
df_month_weekday = df.groupby('Month Weekday'.split(), sort = False).size().unstack()
df_month_weekday.head()

In [None]:
plt.figure(figsize = (12,8))

ax = sns.heatmap(df_month_weekday, cmap= cm.YlGnBu, linewidth = .5)
ax.set(title="Trips by Month and Weekday");

# 4. Conclusion

Through our analysis of the 2014 Uber Pickups in New York City data set, we obtained the following insights:
- The peak demand hour 17:00.
- The main customer category appears to be workers.
- An indicator of Uber's improvement from April to September.
- People tend to use Uber to go to work around 7:00 and 8:00 on working days.
- People tend to use Uber late at night (around midnight) during weekends.
- We should investigate why people don't use uber on Mondays as much as they do on other working days.

# **Exploratory Data Analysis**

In [None]:
!pip install calmap

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_squared_error
import calmap
import plotly.express as px
from statsmodels.tsa.seasonal import seasonal_decompose

# Load and prepare data
# ignore_index=True gives the combined frame one continuous index; without it every
# monthly frame contributes its own 0..n labels and the index contains duplicates.
df = pd.concat([df_apr14, df_may14, df_jun14, df_jul14, df_aug14, df_sep14], ignore_index=True)
df['Date_time'] = pd.to_datetime(df['Date/Time'])
df = df.rename(columns={'Base': 'dispatching_base_number'})

# Extract temporal features
df['date'] = df['Date_time'].dt.date            # calendar date (datetime.date objects)
df['day_of_week'] = df['Date_time'].dt.day_name()
df['hour'] = df['Date_time'].dt.hour
df['month'] = df['Date_time'].dt.month_name()

In [None]:
df.head()

In [None]:
plt.figure(figsize=(12, 6))
df.groupby('date').size().plot()
plt.title('Daily Uber Pickups (Apr-Sep 2014)')
plt.ylabel('Total Trips')
plt.xlabel('Date')
plt.grid()
plt.show()

In [None]:
# Group by hour and plot
plt.figure(figsize=(12, 6))
df.groupby('hour').size().plot(kind='bar', color='skyblue')
plt.title('Popular Pickup Hours')
plt.xlabel('Hour of Day')
plt.ylabel('Total Trips')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Group by day of the week
plt.figure(figsize=(10, 5))
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# An ordered categorical makes the bars follow Monday..Sunday instead of alphabetical order.
df['day_of_week'] = pd.Categorical(df['day_of_week'], categories=order, ordered=True)
# observed=False is spelled out: it keeps all seven categories in the result and
# avoids the pandas FutureWarning about the changing default for categorical groupers.
df.groupby('day_of_week', observed=False).size().plot(kind='bar', color='violet')
plt.title('Trips by Day of the Week')
plt.xlabel('Day')
plt.ylabel('Total Trips')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
df['dispatching_base_number'].value_counts().sort_values().plot(kind='barh')
plt.title('Trips by Uber Base')
plt.xlabel('Total Trips')
plt.ylabel('Base Number')
plt.show()

In [None]:
daily_trips = df.groupby('date').size()
result = seasonal_decompose(daily_trips, model='additive', period=7)
result.plot()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
daily_trips.index = pd.to_datetime(daily_trips.index)

In [None]:
# The figure size is passed to calendarplot through fig_kws, so no separate
# plt.figure() call is made here.
# NOTE(review): calendarplot is expected to create its own figure from fig_kws, which
# would make a preceding plt.figure() an empty extra figure — confirm in calmap docs.
calmap.calendarplot(daily_trips, monthticks=1, daylabels='MTWTFSS',
                    cmap='YlGn', fillcolor='lightgrey',
                    linewidth=0.5, fig_kws={'figsize': (16,8)})
plt.title('Daily Uber Trips Calendar View')
plt.show()

In [None]:
# Target: number of pickups per (date, hour, base)
hourly_trips = (
    df.groupby(['date', 'hour', 'dispatching_base_number'])
      .size()
      .reset_index(name='trips')
)

# Calendar features derived from the date column
hourly_trips['day_of_week'] = pd.to_datetime(hourly_trips['date']).dt.day_name()
hourly_trips['month'] = pd.to_datetime(hourly_trips['date']).dt.month_name()
hourly_trips['weekend'] = hourly_trips['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)
# 1 for the 7-9 and 17-19 hour windows, 0 otherwise
hourly_trips['peak_hour'] = hourly_trips['hour'].apply(
    lambda hr: 1 if (7 <= hr <= 9) or (17 <= hr <= 19) else 0
)

# Lagged targets, shifted within each base
# NOTE(review): shift(k) moves k rows inside a base; that equals k hours only if the
# base has a row for every hour — confirm that no (date, hour) combinations are missing.
for lag_column, n_rows in (('lag_24', 24), ('lag_168', 168)):
    hourly_trips[lag_column] = (
        hourly_trips.groupby(['dispatching_base_number'])['trips'].shift(n_rows)
    )
# Rows without a full lag history are discarded
hourly_trips = hourly_trips.dropna()

# One-hot encode the categorical columns
features = pd.get_dummies(hourly_trips, columns=['dispatching_base_number', 'day_of_week', 'month'])

# **Uber Daily trips Demand Prediction**

In [None]:
# Prepare data
X = features.drop(['date', 'trips'], axis=1)
y = features['trips']

# Train-test split (last 2 weeks for testing)
split_date = pd.to_datetime('2014-09-15')
train = features[pd.to_datetime(features['date']) < split_date]
test = features[pd.to_datetime(features['date']) >= split_date]

X_train, y_train = train.drop(['date', 'trips'], axis=1), train['trips']
X_test, y_test = test.drop(['date', 'trips'], axis=1), test['trips']

# Define models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42)
}

# Model evaluation
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results.append({
        'Model': name,
        'R2': r2_score(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred))
    })

results_df = pd.DataFrame(results).sort_values('R2', ascending=False)
results_df

In [None]:
# Generate predictions for all models
for name, model in models.items():
    hourly_trips[f'pred_{name.lower().replace(" ", "_")}'] = model.predict(X)

# Convert date to datetime for proper plotting
hourly_trips['datetime'] = pd.to_datetime(hourly_trips['date']) + pd.to_timedelta(hourly_trips['hour'], unit='h')

In [None]:
# Rows from the split date onwards
test_period = hourly_trips[hourly_trips['datetime'] >= split_date]

plt.figure(figsize=(16, 6))

# Observed trips
plt.plot(test_period['datetime'], test_period['trips'],
         label='Actual', marker='o', markersize=4, linewidth=1.5)

# Predictions: only the first model in `models` is drawn
for model_name in list(models.keys())[:1]:
    prediction_column = f'pred_{model_name.lower().replace(" ", "_")}'
    plt.plot(test_period['datetime'], test_period[prediction_column],
             label=f'{model_name} Predicted', linestyle='--', linewidth=1.5)

plt.title('Test Period: Actual vs Predicted Trips (Last 2 Weeks)')
plt.xlabel('Date')
plt.ylabel('Number of Trips')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Create hourly aggregates
hourly_compare = test_period.groupby('hour').agg({
    'trips': 'mean',
    'pred_lightgbm': 'mean',
    'pred_random_forest': 'mean',
    'pred_xgboost': 'mean'
}).reset_index()

plt.figure(figsize=(12, 6))
plt.plot(hourly_compare['hour'], hourly_compare['trips'],
         label='Actual', marker='o', linewidth=2)

for model_name in models.keys():
    plt.plot(hourly_compare['hour'], hourly_compare[f'pred_{model_name.lower().replace(" ", "_")}'],
             label=f'{model_name}', linestyle='--')

plt.title('Average Hourly Pattern: Actual vs Predictions')
plt.xlabel('Hour of Day')
plt.ylabel('Average Trips')
plt.xticks(range(24))
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import matplotlib.dates as mdates

# Filter to daily aggregates
daily_data = hourly_trips.groupby('date').agg({
    'trips': 'sum',
    'pred_random_forest': 'sum',
    'pred_xgboost': 'sum',
    'pred_lightgbm': 'sum'
}).reset_index()
daily_data['date'] = pd.to_datetime(daily_data['date'])

# Create figure
plt.figure(figsize=(14, 7))

# Plot actual values
plt.plot(daily_data['date'], daily_data['trips'],
         label='Actual Trips', color='#1f77b4', linewidth=2.5, alpha=0.9)

# Plot predictions
plt.plot(daily_data['date'], daily_data['pred_random_forest'],
         label='Random Forest Predicted', color='#ff7f0e', linestyle='--', linewidth=1.8)
plt.plot(daily_data['date'], daily_data['pred_xgboost'],
         label='XGBoost Predicted', color='#2ca02c', linestyle='--', linewidth=1.8)
plt.plot(daily_data['date'], daily_data['pred_lightgbm'],
         label='LightGBM Predicted', color='#d62728', linestyle='--', linewidth=1.8)

# Highlight test period
test_start = pd.to_datetime('2014-09-15')
plt.axvspan(test_start, daily_data['date'].max(),
            color='gray', alpha=0.15, label='Test Period')

# Styling
# The loaded files cover April through September 2014, so the title names that range.
plt.title('Daily Uber Trips: Actual vs Predicted (April-September 2014)',
          fontsize=14, pad=20)
plt.ylabel('Total Daily Trips', fontsize=12)
plt.xlabel('Date', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)

# Format x-axis
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
plt.gcf().autofmt_xdate()

# Legend
plt.legend(fontsize=10, framealpha=1)

# Adjust y-axis limits
plt.ylim(0, 45000)

plt.tight_layout()
plt.show()

In [None]:
# Plot actual vs predicted
plt.figure(figsize=(12, 6))
test_dates = pd.to_datetime(test['date']) + pd.to_timedelta(test['hour'], unit='h')

plt.plot(test_dates, y_test, label='Actual', linewidth=2)

for name, model in models.items():
    y_pred = model.predict(X_test)
    plt.plot(test_dates, y_pred, '--', label=f'{name} Predicted', alpha=0.8)

plt.title('Actual vs Predicted Hourly Uber Pickups')
plt.xlabel('Date')
plt.ylabel('Number of Trips')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Plot actual vs predicted
plt.figure(figsize=(12, 6))
test_dates = pd.to_datetime(test['date']) + pd.to_timedelta(test['hour'], unit='h')

plt.plot(test_dates, y_test, label='Actual', linewidth=2)

for name, model in models.items():
    y_pred = model.predict(X_test)
    plt.plot(test_dates, y_pred, '--', label=f'{name} Predicted', alpha=0.8)
    break

plt.title('Actual vs Predicted Hourly Uber Pickups')
plt.xlabel('Date')
plt.ylabel('Number of Trips')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import seaborn as sns

# Prepare predictions for full period
full_predictions = features[['date']].copy()
full_predictions['Actual'] = features['trips']
full_predictions['date'] = pd.to_datetime(full_predictions['date'])

# Generate predictions for all models
for name, model in models.items():
    full_predictions[f'{name}_Predicted'] = model.predict(X)

# Aggregate to daily level
daily_agg = full_predictions.groupby(full_predictions['date'].dt.date).agg({
    'Actual': 'sum',
    'Random Forest_Predicted': 'sum',
    'XGBoost_Predicted': 'sum',
    'LightGBM_Predicted': 'sum'
}).reset_index()
daily_agg['date'] = pd.to_datetime(daily_agg['date'])

In [None]:
# Create combined visualization
plt.figure(figsize=(16, 8))

# Plot actual values
plt.plot(daily_agg['date'], daily_agg['Actual'],
         label='Actual Trips', color='black', linewidth=2.5)

# Plot model predictions
model_colors = {
    'Random Forest': 'red',
    'XGBoost': 'blue',
    'LightGBM': 'green'
}

for model_name, color in model_colors.items():
    plt.plot(daily_agg['date'], daily_agg[f'{model_name}_Predicted'],
             linestyle='--', alpha=0.8, linewidth=1.5,
             label=f'{model_name} Predicted', color=color)

# Highlight test period
test_start = pd.to_datetime('2014-09-15')
plt.axvspan(test_start, daily_agg['date'].max(),
            color='yellow', alpha=0.1, label='Test Period')

# Formatting
# The loaded files cover April through September 2014, so the title names that range.
plt.title('Daily Uber Trips: Actual vs Predicted (April-September 2014)', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Total Daily Trips', fontsize=12)
# One tick per month, labelled as year-month
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.grid(alpha=0.3)
# Anchor the legend outside the right edge of the axes
plt.legend(fontsize=10, bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()

In [None]:
# Individual model visualizations
def plot_single_model(data, model_name, color, test_period_start=None):
    """Plot actual daily trips against one model's predictions.

    Parameters
    ----------
    data : DataFrame
        Must provide the columns ``'date'``, ``'Actual'`` and
        ``f'{model_name}_Predicted'``.
    model_name : str
        Name used to pick the prediction column and to label the plot.
    color : str
        Matplotlib color for the prediction line.
    test_period_start : optional
        Left edge of the shaded test period. When omitted, the notebook-level
        ``test_start`` variable is used, as before.
    """
    if test_period_start is None:
        # Backward-compatible fallback to the global set earlier in the notebook.
        test_period_start = test_start

    plt.figure(figsize=(14, 6))

    # Plot actual vs predicted
    plt.plot(data['date'], data['Actual'],
             label='Actual', color='black', linewidth=2)
    plt.plot(data['date'], data[f'{model_name}_Predicted'],
             label=f'{model_name} Predicted', linestyle='--',
             color=color, alpha=0.8)

    # Highlight test period (from its start to the last date in the data)
    plt.axvspan(test_period_start, data['date'].max(),
                color='yellow', alpha=0.1, label='Test Period')

    # Formatting
    plt.title(f'Daily Trips: {model_name} Predictions', fontsize=14)
    plt.xlabel('Date')
    plt.ylabel('Trips')
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
    plt.xticks(rotation=45)
    plt.grid(alpha=0.2)
    plt.legend()
    plt.show()

# Generate individual plots
for model_name, color in model_colors.items():
    plot_single_model(daily_agg, model_name, color)

In [None]:
# Performance metrics table
test_period = daily_agg[daily_agg['date'] >= test_start]

metrics = []
for model_name in models.keys():
    metrics.append({
        'Model': model_name,
        'R2': r2_score(test_period['Actual'], test_period[f'{model_name}_Predicted']),
        'MAE': mean_absolute_error(test_period['Actual'], test_period[f'{model_name}_Predicted']),
        'RMSE': np.sqrt(mean_squared_error(test_period['Actual'], test_period[f'{model_name}_Predicted']))
    })

performance_df = pd.DataFrame(metrics).sort_values('R2', ascending=False)
print("Test Period Performance Metrics:")
display(performance_df)

In [None]:
# Feature importance comparison
plt.figure(figsize=(14, 8))

# Build one importance table per model and concatenate them once, instead of
# repeatedly concatenating onto an initially empty DataFrame inside the loop.
importance_frames = [
    pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_,
        'Model': model_name
    })
    for model_name, model in models.items()
]
all_importances = pd.concat(importance_frames)

# Ten features with the largest importance averaged over the models
top_features = all_importances.groupby('Feature')['Importance'].mean().nlargest(10).index
sns.barplot(x='Importance', y='Feature', hue='Model',
            data=all_importances[all_importances['Feature'].isin(top_features)])
plt.title('Top 10 Feature Importances Across Models', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd

# Ensure datetime format
df['date'] = pd.to_datetime(df['date'])

# Create proper time series dataframe
daily_counts = df.groupby('date').size().reset_index(name='trip_count')
daily_counts['day_of_week'] = daily_counts['date'].dt.day_name()
daily_counts['month'] = daily_counts['date'].dt.month_name()
daily_counts['day_of_month'] = daily_counts['date'].dt.day
daily_counts['is_weekend'] = daily_counts['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15, 8))

# Weekly pattern
plt.subplot(2, 2, 1)
sns.boxplot(data=daily_counts, x='day_of_week', y='trip_count',
            order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
plt.title('Weekly Pattern')

# Monthly pattern
plt.subplot(2, 2, 2)
sns.boxplot(data=daily_counts, x='month', y='trip_count')
plt.title('Monthly Pattern')

# Daily trend
plt.subplot(2, 1, 2)
plt.plot(daily_counts['date'], daily_counts['trip_count'])
plt.title('Daily Trip Count Trend')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit
import holidays

# Data Preparation
df = pd.concat([df_apr14, df_may14, df_jun14, df_jul14, df_aug14, df_sep14])
df['Date_time'] = pd.to_datetime(df['Date/Time'])
df = df.rename(columns={'Base': 'dispatching_base_number'})

# Extract temporal features
df['date'] = df['Date_time'].dt.date
df['hour'] = df['Date_time'].dt.hour

# Aggregate to hourly trips
hourly_trips = df.groupby(['date', 'hour', 'dispatching_base_number']).size().reset_index(name='trips')

# Enrich features
hourly_trips['day_of_week'] = pd.to_datetime(hourly_trips['date']).dt.day_name()
hourly_trips['month'] = pd.to_datetime(hourly_trips['date']).dt.month_name()
hourly_trips['weekend'] = hourly_trips['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)
hourly_trips['peak_hour'] = hourly_trips['hour'].apply(lambda x: 1 if (7 <= x <= 9) or (17 <= x <= 19) else 0)

# Lag and rolling features
for lag in [1, 2, 24, 168]:
    hourly_trips[f'lag_{lag}'] = hourly_trips.groupby('dispatching_base_number')['trips'].shift(lag)
hourly_trips['rolling_mean_24'] = hourly_trips.groupby('dispatching_base_number')['trips'].transform(lambda x: x.shift(1).rolling(24).mean())

# Cyclical time features
hourly_trips['hour_sin'] = np.sin(2 * np.pi * hourly_trips['hour'] / 24)
hourly_trips['hour_cos'] = np.cos(2 * np.pi * hourly_trips['hour'] / 24)

# Holiday indicator
# holidays.US() created without years starts out empty and only fills a year when a
# date is looked up, so passing it to Series.isin() matches nothing and the flag is
# always 0. Build the calendar for the years present in the data and test each date
# for membership instead.
holiday_years = [int(year) for year in pd.to_datetime(hourly_trips['date']).dt.year.unique()]
us_holidays = holidays.US(years=holiday_years)
hourly_trips['is_holiday'] = hourly_trips['date'].map(lambda day: day in us_holidays).astype(int)

# Clean
hourly_trips = hourly_trips.dropna()

# One-hot encoding
features = pd.get_dummies(hourly_trips, columns=['dispatching_base_number', 'day_of_week', 'month'])

# Train-test split
split_date = pd.to_datetime('2014-09-15')
features['date'] = pd.to_datetime(features['date'])

train = features[features['date'] < split_date]
test = features[features['date'] >= split_date]

X_train = train.drop(['date', 'trips'], axis=1)
y_train = np.log1p(train['trips'])  # log transform

X_test = test.drop(['date', 'trips'], axis=1)
y_test = test['trips']

# Model Training
model = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    num_leaves=64,
    random_state=42,
    verbose=-1
)
model.fit(X_train, y_train)

# Prediction
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)  # reverse log

# Evaluation
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

# Plotting
plt.figure(figsize=(15, 5))
plt.plot(y_test.values, label="Actual Trips", linewidth=2)
plt.plot(y_pred, label="Predicted Trips (LightGBM)", linestyle="--")
plt.title("Uber Hourly Trips: Actual vs Predicted")
plt.xlabel("Test Time Index")
plt.ylabel("Trips")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Add predictions back to test dataframe
test = test.copy()
test['predicted_trips'] = y_pred
test['actual_trips'] = y_test.values
test['date'] = pd.to_datetime(test['date'])

# Aggregate to daily
daily_results = test.groupby('date')[['predicted_trips', 'actual_trips']].sum().reset_index()

In [None]:
# Predict on train set for full comparison
# The model was fitted on log1p(trips), so predictions are mapped back with expm1.
y_train_pred = np.expm1(model.predict(X_train))

# Merge predictions back into train and test sets
train = train.copy()
train['predicted_trips'] = y_train_pred
# Use the observed counts directly instead of round-tripping them through
# expm1(log1p(...)), which only reintroduces floating-point error.
train['actual_trips'] = train['trips']
train['date'] = pd.to_datetime(train['date'])

test = test.copy()
test['predicted_trips'] = y_pred
test['actual_trips'] = y_test.values
test['date'] = pd.to_datetime(test['date'])

# Combine train + test
combined = pd.concat([train, test])

#Aggregate to daily level
daily_results = combined.groupby('date')[['predicted_trips', 'actual_trips']].sum().reset_index()

# Plot full daily range
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

plt.figure(figsize=(16, 6))

# Actual
plt.plot(daily_results['date'], daily_results['actual_trips'], label='Actual Trips', color='steelblue', linewidth=2)

# LightGBM predicted
plt.plot(daily_results['date'], daily_results['predicted_trips'], label='LightGBM Predicted', linestyle='--', color='orange')

# Highlight test period
test_start = test['date'].min()
test_end = test['date'].max()
plt.axvspan(test_start, test_end, color='gray', alpha=0.2, label='Test Period')

# Format x-axis: one tick per month, horizontal labels.
# The rotation is set once, before tight_layout(), instead of 45 degrees first and
# 360 degrees (i.e. horizontal again) after the layout was already computed.
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.xticks(rotation=0)

# Labels and legend
plt.title('Daily Uber Trips: Actual vs LightGBM Predicted (April–September 2014)', fontsize=14)
plt.xlabel('Date')
plt.ylabel('Total Daily Trips')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()