# Importing Libraries

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

warnings.filterwarnings('ignore')

# Import dataset

In [3]:
# Read the trip records from the CSV in the working directory and preview
# the first five rows.
DATA_PATH = 'tripdata.csv'
df = pd.read_csv(DATA_PATH, engine='python')
df.head(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,578DDD7CE1771FFA,classic_bike,2024-11-07 19:21:58.206,2024-11-07 19:28:57.301,Walsh Park,18067,Leavitt St & North Ave,TA1308000005,41.91461,-87.667968,41.910533,-87.682308,member
1,78B141C50102ABA6,classic_bike,2024-11-22 14:49:00.431,2024-11-22 14:56:15.475,Walsh Park,18067,Leavitt St & Armitage Ave,TA1309000029,41.91461,-87.667968,41.917805,-87.682437,member
2,1E794CF36394E2D7,classic_bike,2024-11-08 09:24:00.238,2024-11-08 09:28:33.480,Walsh Park,18067,Damen Ave & Cortland St,13133,41.91461,-87.667968,41.915983,-87.677335,member
3,E5DD2CAB58D73F98,classic_bike,2024-11-24 17:51:14.144,2024-11-24 18:05:32.574,Clark St & Elm St,TA1307000039,Clark St & Drummond Pl,TA1307000142,41.902973,-87.63128,41.931248,-87.644336,member
4,57F9878BC8C765F1,classic_bike,2024-11-04 14:59:16.032,2024-11-04 15:41:02.456,Clark St & Wellington Ave,TA1307000136,Streeter Dr & Grand Ave,13022,41.936497,-87.647539,41.892278,-87.612043,casual


# Understanding dataset

In [4]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(335075, 13)

In [5]:
# Column dtypes and non-null counts before any type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335075 entries, 0 to 335074
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             335075 non-null  object 
 1   rideable_type       335075 non-null  object 
 2   started_at          335075 non-null  object 
 3   ended_at            335075 non-null  object 
 4   start_station_name  278872 non-null  object 
 5   start_station_id    278872 non-null  object 
 6   end_station_name    277431 non-null  object 
 7   end_station_id      277431 non-null  object 
 8   start_lat           335075 non-null  float64
 9   start_lng           335075 non-null  float64
 10  end_lat             334802 non-null  float64
 11  end_lng             334802 non-null  float64
 12  member_casual       335075 non-null  object 
dtypes: float64(4), object(9)
memory usage: 33.2+ MB


In [6]:
# Correct dtypes
# Both timestamp columns are read as plain strings (object dtype); parse them
# into datetime64 so durations and weekday names can be derived from them.
for timestamp_col in ('ended_at', 'started_at'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

In [7]:
# Re-check dtypes after the datetime conversion of started_at / ended_at.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335075 entries, 0 to 335074
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   ride_id             335075 non-null  object        
 1   rideable_type       335075 non-null  object        
 2   started_at          335075 non-null  datetime64[ns]
 3   ended_at            335075 non-null  datetime64[ns]
 4   start_station_name  278872 non-null  object        
 5   start_station_id    278872 non-null  object        
 6   end_station_name    277431 non-null  object        
 7   end_station_id      277431 non-null  object        
 8   start_lat           335075 non-null  float64       
 9   start_lng           335075 non-null  float64       
 10  end_lat             334802 non-null  float64       
 11  end_lng             334802 non-null  float64       
 12  member_casual       335075 non-null  object        
dtypes: datetime64[ns](2), float64

# Data Cleaning

In [8]:
# Missing-value count for every column.
df.isna().sum()

Unnamed: 0,0
ride_id,0
rideable_type,0
started_at,0
ended_at,0
start_station_name,56203
start_station_id,56203
end_station_name,57644
end_station_id,57644
start_lat,0
start_lng,0


In [9]:
# Columns that contain missing values (rows lacking any of them are dropped in the next cell):
# start_station_name, start_station_id, end_station_name, end_station_id, end_lat, end_lng

In [10]:
# Keep only rides that have complete station information and end coordinates.
required_columns = [
    'start_station_name',
    'start_station_id',
    'end_station_name',
    'end_station_id',
    'end_lat',
    'end_lng',
]
df = df.dropna(subset=required_columns)

In [11]:
# (rows, columns) remaining after dropping rows with missing station or end-coordinate fields.
df.shape

(245971, 13)

# Feature Engineering

In [12]:
# Elapsed time of each ride as a timedelta (end timestamp minus start timestamp).
df['ride_length'] = df['ended_at'].sub(df['started_at'])

In [13]:
# Render each ride's duration as an 'HH:MM:SS' string.
# The duration is recomputed from the two timestamp columns instead of being read back
# from `ride_length`, so this cell can be re-run safely: once `ride_length` holds strings,
# calling `.total_seconds()` on its values would raise AttributeError.
elapsed_seconds = (df['ended_at'] - df['started_at']).dt.total_seconds()
whole_seconds = (elapsed_seconds // 1).astype('int64')  # floor away fractional seconds

# Vectorised integer arithmetic replaces the per-row lambda; only the final
# zero-padding is applied element-wise.
two_digits = '{:02d}'.format
hours_part = (whole_seconds // 3600).map(two_digits)           # total hours, not capped at 24
minutes_part = ((whole_seconds // 60) % 60).map(two_digits)    # minutes within the hour
seconds_part = (whole_seconds % 60).map(two_digits)            # seconds within the minute
df['ride_length'] = hours_part + ':' + minutes_part + ':' + seconds_part

In [14]:
# Weekday name (e.g. 'Thursday') of each ride's start timestamp.
df['day_of_week'] = df['started_at'].dt.day_name()

In [15]:
# Preview the frame including the new ride_length and day_of_week columns.
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length,day_of_week
0,578DDD7CE1771FFA,classic_bike,2024-11-07 19:21:58.206,2024-11-07 19:28:57.301,Walsh Park,18067,Leavitt St & North Ave,TA1308000005,41.91461,-87.667968,41.910533,-87.682308,member,00:06:59,Thursday
1,78B141C50102ABA6,classic_bike,2024-11-22 14:49:00.431,2024-11-22 14:56:15.475,Walsh Park,18067,Leavitt St & Armitage Ave,TA1309000029,41.91461,-87.667968,41.917805,-87.682437,member,00:07:15,Friday
2,1E794CF36394E2D7,classic_bike,2024-11-08 09:24:00.238,2024-11-08 09:28:33.480,Walsh Park,18067,Damen Ave & Cortland St,13133,41.91461,-87.667968,41.915983,-87.677335,member,00:04:33,Friday
3,E5DD2CAB58D73F98,classic_bike,2024-11-24 17:51:14.144,2024-11-24 18:05:32.574,Clark St & Elm St,TA1307000039,Clark St & Drummond Pl,TA1307000142,41.902973,-87.63128,41.931248,-87.644336,member,00:14:18,Sunday
4,57F9878BC8C765F1,classic_bike,2024-11-04 14:59:16.032,2024-11-04 15:41:02.456,Clark St & Wellington Ave,TA1307000136,Streeter Dr & Grand Ave,13022,41.936497,-87.647539,41.892278,-87.612043,casual,00:41:46,Monday


# EDA

## 1. Rides by Day of Week

In [16]:
# Weekday names in calendar order (Monday first); later cells reuse this list.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_dtype = pd.CategoricalDtype(categories=day_order, ordered=True)

# Tally rides per weekday, then arrange the rows Monday -> Sunday by turning the
# weekday column into an ordered categorical and sorting on it.
ride_counts = (
    df['day_of_week']
    .value_counts()
    .rename_axis('day_of_week')
    .reset_index(name='ride_count')
    .astype({'day_of_week': weekday_dtype})
    .sort_values(by='day_of_week')
)

# One bar per weekday, each labelled with its ride count.
fig = px.bar(
    ride_counts,
    x='day_of_week',
    y='ride_count',
    color='day_of_week',
    title='Rides by Day of Week',
)
fig.update_traces(texttemplate='%{value}')
fig.show()

## 2. Rides by Member/Casual and Day of Week

In [17]:
# Number of rides for every (rider type, weekday) combination, with the weekday
# converted to an ordered categorical (using `day_order` from the previous
# section) so the rows can be sorted Monday -> Sunday.
ride_counts = (
    df.groupby(['member_casual', 'day_of_week'])
    .size()
    .reset_index(name='ride_count')
    .assign(
        day_of_week=lambda counts: pd.Categorical(
            counts['day_of_week'], categories=day_order, ordered=True
        )
    )
    .sort_values('day_of_week')
)

# Grouped bars: member and casual side by side for each weekday.
fig = px.bar(
    ride_counts,
    x='day_of_week',
    y='ride_count',
    color='member_casual',
    barmode='group',
    title='Rides by Member/Casual and Day of Week',
)
fig.update_traces(texttemplate='%{value}', textposition='auto')
fig.show()

## 3. Number of Rides by Member/Casual

In [18]:
# Total rides per rider type, as a two-column frame ready for plotting.
ride_counts = (
    df['member_casual']
    .value_counts()
    .rename_axis('member_casual')
    .reset_index(name='ride_count')
)

# Single bar per rider type with explicit axis titles.
axis_titles = {'xaxis_title': 'Member/Casual', 'yaxis_title': 'Number of Rides'}
fig = px.bar(
    ride_counts,
    x='member_casual',
    y='ride_count',
    title='Number of Rides by Member/Casual',
)
fig.update_layout(**axis_titles)
fig.show()

## 4. Ride Duration Distribution by Member/Casual

In [19]:
# Ride duration in minutes, derived from the start and end timestamps.
ride_timedelta = df['ended_at'] - df['started_at']
df['ride_duration'] = ride_timedelta.dt.total_seconds().div(60)

# Overlaid histograms of the duration, one per rider type.
fig = px.histogram(
    df,
    x='ride_duration',
    color='member_casual',
    barmode='overlay',
    title='Ride Duration Distribution by Member/Casual',
)
fig.update_layout(
    xaxis_title='Ride Duration (minutes)',
    yaxis_title='Count',
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

## 5. Ride Frequency Distribution for Casual Riders

In [20]:
# Filter data for casual riders. `.copy()` yields an independent frame, so the
# column added below is not a chained assignment on a slice of `df`.
casual_riders = df[df['member_casual'] == 'casual'].copy()

# Count how many rows share each ride_id.
# NOTE(review): ride_id looks like a per-trip identifier (see the sample rows), so this
# count is presumably 1 for every row and the histogram a single bar. The frame has no
# rider identifier column, so a per-rider frequency cannot be derived here; confirm the
# intended grouping key.
casual_riders['ride_frequency'] = casual_riders.groupby('ride_id')['ride_id'].transform('count')

# Plot histogram of ride frequency
fig = px.histogram(casual_riders, x='ride_frequency', title='Ride Frequency Distribution for Casual Riders')
fig.show()

## 6. Ride Duration vs. Distance for Casual Riders

In [21]:
# Filter data for casual riders. `.copy()` yields an independent frame, so the
# columns added below are not chained assignments on a slice of `df`.
casual_riders = df[df['member_casual'] == 'casual'].copy()

# Ride duration in minutes
casual_riders['ride_duration'] = (casual_riders['ended_at'] - casual_riders['started_at']).dt.total_seconds() / 60

# Great-circle (haversine) distance in kilometres between the start and end coordinates.
# A plain Euclidean norm on raw degrees has no physical unit and over-weights longitude,
# because a degree of longitude spans less ground than a degree of latitude away from
# the equator. This is the straight-line distance between the two points, not the
# length of the route actually ridden.
EARTH_RADIUS_KM = 6371.0088  # mean Earth radius
start_lat_rad = np.radians(casual_riders['start_lat'])
end_lat_rad = np.radians(casual_riders['end_lat'])
delta_lat = end_lat_rad - start_lat_rad
delta_lng = np.radians(casual_riders['end_lng'] - casual_riders['start_lng'])
haversine_term = (
    np.sin(delta_lat / 2) ** 2
    + np.cos(start_lat_rad) * np.cos(end_lat_rad) * np.sin(delta_lng / 2) ** 2
)
# clip guards against rounding pushing the term marginally outside [0, 1]
casual_riders['ride_distance'] = 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(haversine_term.clip(0, 1)))

# Scatter plot with an OLS trend line
fig = px.scatter(casual_riders, x='ride_distance', y='ride_duration', trendline='ols', title='Ride Duration vs. Distance for Casual Riders')
fig.update_layout(xaxis_title='Ride Distance (km)', yaxis_title='Ride Duration (minutes)')
fig.show()

Output hidden; open in https://colab.research.google.com to view.

## 7. Ride Frequency by Time of Day and Day of Week for Casual Riders

In [22]:
# Filter data for casual riders. `.copy()` yields an independent frame, so the
# columns added below are not chained assignments on a slice of `df`.
casual_riders = df[df['member_casual'] == 'casual'].copy()

# Hour of day and weekday name of each ride's start time
casual_riders['hour_of_day'] = casual_riders['started_at'].dt.hour
casual_riders['day_of_week'] = casual_riders['started_at'].dt.day_name()

# One row per (weekday, hour) pair holding the number of rides that started in it;
# the bar chart in the following section reuses this frame.
ride_frequency = casual_riders.groupby(['day_of_week', 'hour_of_day']).size().reset_index(name='ride_count')

# Heatmap of ride counts: hour on the x-axis, weekday on the y-axis
fig = px.density_heatmap(ride_frequency, x='hour_of_day', y='day_of_week', z='ride_count', nbinsx=24, nbinsy=7, title='Ride Frequency by Time of Day and Day of Week for Casual Riders')
fig.update_layout(xaxis_title='Hour of Day', yaxis_title='Day of Week')
fig.show()


## 8. Ride Frequency by Day of Week and Hour of Day for Casual Riders

In [23]:
# Stacked bars of casual-rider ride counts per weekday, coloured by start hour.
# Uses the `ride_frequency` frame built in the preceding heatmap cell.
fig = px.bar(
    ride_frequency,
    x='day_of_week',
    y='ride_count',
    color='hour_of_day',
    title='Ride Frequency by Day of Week and Hour of Day for Casual Riders',
)
axis_titles = {'xaxis_title': 'Day of Week', 'yaxis_title': 'Ride Count'}
fig.update_layout(**axis_titles)
fig.show()

In [24]:
# Filter data for casual riders. `.copy()` yields an independent frame, so the
# columns added below are not chained assignments on a slice of `df`.
casual_riders = df[df['member_casual'] == 'casual'].copy()

# Hour of day and weekday name of each ride's start time
casual_riders['hour_of_day'] = casual_riders['started_at'].dt.hour
casual_riders['day_of_week'] = casual_riders['started_at'].dt.day_name()

# Weekday names in calendar order (Monday first)
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# One row per (weekday, hour) pair holding the number of rides that started in it
ride_frequency = casual_riders.groupby(['day_of_week', 'hour_of_day']).size().reset_index(name='ride_count')

# Make the weekday an ordered categorical and sort on it so rows run Monday -> Sunday
ride_frequency['day_of_week'] = pd.Categorical(ride_frequency['day_of_week'], categories=day_order, ordered=True)
ride_frequency = ride_frequency.sort_values('day_of_week')

# Heatmap of ride counts: hour on the x-axis, weekday on the y-axis
fig = px.density_heatmap(ride_frequency, x='hour_of_day', y='day_of_week', z='ride_count',
                        nbinsx=24, nbinsy=7, title='Ride Frequency Heatmap for Casual Riders',
                        color_continuous_scale='Viridis')  # any Plotly continuous colour scale name works here

fig.update_layout(xaxis_title='Hour of Day', yaxis_title='Day of Week')
fig.show()