# Exploratory Data Analysis - Bicycle Rentals in the Chicago Area

## Setup

In [None]:
#!pip install folium
#!pip install geopandas

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import sweetviz
import folium
from folium import plugins
import geopandas

In [4]:
data = pd.read_csv('../../../Desktop/SamX/Bike_Study_Files/big_raw.csv')
data.shape

(5733451, 13)

In [5]:
# Convert 'started_at' and 'ended_at' to datetime objects
data['started_at'] = pd.to_datetime(data['started_at'])
data['ended_at'] = pd.to_datetime(data['ended_at'])

# Calculate ride duration in minutes
data['ride_duration'] = (data['ended_at'] - data['started_at']).dt.total_seconds() / 60

# Additional columns for temporal analysis
data['start_date'] = data['started_at'].dt.date
data['start_hour'] = data['started_at'].dt.hour
data['day_of_week'] = data['started_at'].dt.day_name()
data['week_of_year'] = data['started_at'].dt.isocalendar().week

# Preview the data with new columns
data[['ride_id', 'rideable_type', 'member_casual', 'started_at', 'ended_at', 'ride_duration', 'start_date', 'start_hour', 'day_of_week', 'week_of_year']].head()

Unnamed: 0,ride_id,rideable_type,member_casual,started_at,ended_at,ride_duration,start_date,start_hour,day_of_week,week_of_year
0,46F8167220E4431F,electric_bike,member,2021-12-07 15:06:07,2021-12-07 15:13:42,7.583333,2021-12-07,15,Tuesday,49
1,73A77762838B32FD,electric_bike,casual,2021-12-11 03:43:29,2021-12-11 04:10:23,26.9,2021-12-11,3,Saturday,49
2,4CF42452054F59C5,electric_bike,member,2021-12-15 23:10:28,2021-12-15 23:23:14,12.766667,2021-12-15,23,Wednesday,50
3,3278BA87BF698339,classic_bike,member,2021-12-26 16:16:10,2021-12-26 16:30:53,14.716667,2021-12-26,16,Sunday,51
4,6FF54232576A3B73,electric_bike,member,2021-12-30 11:31:05,2021-12-30 11:51:21,20.266667,2021-12-30,11,Thursday,52


## Fun with Mapping!!!

#### Map with 1000 Samples

In [7]:
geo_loc = data[['member_casual', 'rideable_type', 'ride_duration', 'start_lat', 'start_lng', 'end_lat', 'end_lng']].sample(n=1000)
geo_loc.shape

(1000, 7)

In [None]:
geo_loc['start_lat']

In [None]:
geo_loc.to_csv('geo_sample.csv')

In [8]:
geo_loc = pd.read_csv('geo_sample.csv')
geo_loc.shape

(1000, 8)

In [9]:
geo_loc


Unnamed: 0.1,Unnamed: 0,member_casual,rideable_type,ride_time,start_lat,start_lng,end_lat,end_lng
0,4234855,member,classic_bike,1503.0,41.940775,-87.639192,41.994780,-87.660285
1,153371,member,classic_bike,104.0,41.875010,-87.673280,41.872950,-87.669130
2,3669318,casual,electric_bike,566.0,41.931905,-87.677856,41.939398,-87.711561
3,3129412,casual,classic_bike,682.0,41.918306,-87.636282,41.932418,-87.652705
4,3957164,casual,electric_bike,1086.0,41.870000,-87.620000,41.886024,-87.624117
...,...,...,...,...,...,...,...,...
995,662311,casual,classic_bike,495.0,41.854184,-87.619154,41.865312,-87.617867
996,5387174,member,electric_bike,231.0,41.929507,-87.643208,41.918306,-87.636282
997,339370,member,classic_bike,1012.0,41.925858,-87.638973,41.900960,-87.623777
998,5062473,member,classic_bike,580.0,41.791478,-87.599861,41.794853,-87.618691


In [10]:
# Create a Folium map centered at a specific location
m = folium.Map(location=[geo_loc['start_lat'][1], geo_loc['start_lng'][1]])

# Add markers for each city
for i, row in geo_loc.iterrows():
    folium.Marker(
        location=[row['start_lat'], row['start_lng']]
    ).add_to(m)

# Display the map
m.save("geographical_sample.html")


In [11]:
m

#### Heatmap with 1,000 Samples

In [12]:
# Create point geometries
geometry = geopandas.points_from_xy(geo_loc['start_lng'], geo_loc['start_lat'])
geo_df = geopandas.GeoDataFrame(
    geo_loc[['member_casual', 'rideable_type', 'ride_time', 'start_lat', 'start_lng', 'end_lat', 'end_lng']], 
    geometry=geometry
)

geo_df.head()

Unnamed: 0,member_casual,rideable_type,ride_time,start_lat,start_lng,end_lat,end_lng,geometry
0,member,classic_bike,1503.0,41.940775,-87.639192,41.99478,-87.660285,POINT (-87.63919 41.94078)
1,member,classic_bike,104.0,41.87501,-87.67328,41.87295,-87.66913,POINT (-87.67328 41.87501)
2,casual,electric_bike,566.0,41.931905,-87.677856,41.939398,-87.711561,POINT (-87.67786 41.93191)
3,casual,classic_bike,682.0,41.918306,-87.636282,41.932418,-87.652705,POINT (-87.63628 41.91831)
4,casual,electric_bike,1086.0,41.87,-87.62,41.886024,-87.624117,POINT (-87.62000 41.87000)


In [13]:
from folium import plugins

map = folium.Map(location=[geo_loc['start_lat'][1], geo_loc['start_lng'][1]])

heat_data = [[point.xy[1][0], point.xy[0][0]] for point in geo_df.geometry]

heat_data
plugins.HeatMap(heat_data).add_to(map)

map

#### Heatmap with 100,000 Samples

In [None]:
geo_loc = df[['member_casual', 'rideable_type', 'ride_time', 'start_lat', 'start_lng', 'end_lat', 'end_lng']].sample(n=100000)
geo_loc.shape

In [None]:
geo_loc.to_csv('100K_Samples.csv', index=False)

In [None]:
medium = pd.read_csv('100K_Samples.csv')
medium.shape

In [None]:
# Create point geometries
geometry = geopandas.points_from_xy(medium['start_lng'], medium['start_lat'])
geo_df_med = geopandas.GeoDataFrame(
    medium[['member_casual', 'rideable_type', 'ride_time', 'start_lat', 'start_lng', 'end_lat', 'end_lng']], 
    geometry=geometry
)

geo_df_med.head()

In [None]:
map2 = folium.Map(location=[medium['start_lat'][1], medium['start_lng'][1]])

heat_data = [[point.xy[1][0], point.xy[0][0]] for point in geo_df_med.geometry]

heat_data
plugins.HeatMap(heat_data).add_to(map2)

map2

#### Heatmap with Station Markers

In [None]:
geo_loc = df[['member_casual', 'rideable_type', 'ride_time', 'start_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng']].sample(n=100000)
geo_loc.shape

geo_loc.to_csv('100K_Samples_2.csv', index=False)

In [2]:
medium2 = pd.read_csv('100K_Samples_2.csv')
medium2.shape

(100000, 8)

In [None]:
medium2 = medium2.sample(n=1000)

In [None]:
# Create point geometries
geometry = geopandas.points_from_xy(medium2['start_lng'], medium2['start_lat'])
geo_df_med2 = geopandas.GeoDataFrame(
    medium2[['member_casual', 'rideable_type', 'ride_time', 'start_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng']], 
    geometry=geometry
)

geo_df_med2.head()

In [None]:
medium2['start_lat'][77005]

In [None]:
map3 = folium.Map(location=[medium2['start_lat'][77005], medium2['start_lng'][77005]])

# Add markers for each station
for i, row in medium2.iterrows():
    folium.Marker(
        location=[row['start_lat'], row['start_lng']],
        tooltip = row['start_station_name']
    ).add_to(map3)

heat_data = [[point.xy[1][0], point.xy[0][0]] for point in geo_df_med2.geometry]

heat_data
plugins.HeatMap(heat_data).add_to(map3)

map3

### Geographical Distribution: Member vs Casual

#### Sample Size 5,000

In [14]:
hundred_K = pd.read_csv('100K_Samples_2.csv')
hundred_K.shape

(100000, 8)

In [15]:
five_K = hundred_K.sample(n=5000)
five_K.shape

(5000, 8)

In [16]:
five_K_groups = five_K.groupby('member_casual')

In [17]:
casual = five_K_groups.get_group('casual')
members = five_K_groups.get_group('member')
print(casual.shape, members.shape)

(2045, 8) (2955, 8)


In [18]:
start_loc = [41.88, -87.62]
label_loc = [41.88, -87.62]

combined_map = plugins.DualMap(location=start_loc, tiles='openstreetmap', zoom_start=14)

# HeatMap for Casual
casual.reset_index(drop=True)
geometry_c = geopandas.points_from_xy(casual['start_lng'], casual['start_lat'])
geo_group_c = geopandas.GeoDataFrame(
        casual[['member_casual', 'rideable_type', 'ride_time', 'start_station_name', 
                'start_lat', 'start_lng', 'end_lat', 'end_lng']], 
        geometry=geometry_c
)

heat_data_c = [[point.xy[1][0], point.xy[0][0]] for point in geo_group_c.geometry]
plugins.HeatMap(heat_data_c).add_to(combined_map.m2)


# HeatMap for Members
members.reset_index(drop=True)
geometry_m = geopandas.points_from_xy(members['start_lng'], members['start_lat'])
geo_group_m = geopandas.GeoDataFrame(
        members[['member_casual', 'rideable_type', 'ride_time', 'start_station_name', 
                'start_lat', 'start_lng', 'end_lat', 'end_lng']], 
        geometry=geometry_m
)

heat_data_m = [[point.xy[1][0], point.xy[0][0]] for point in geo_group_m.geometry]
plugins.HeatMap(heat_data_m).add_to(combined_map.m1)


combined_map

In [19]:
one_K_casual = casual.sample(n=1000)
one_K_members = members.sample(n=1000)

In [20]:
start_loc = [41.88, -87.62]
label_loc = [41.88, -87.62]

combined_map_even = plugins.DualMap(location=start_loc, tiles='openstreetmap', zoom_start=14)

# HeatMap for Casual
one_K_casual.reset_index(drop=True)
geometry_c = geopandas.points_from_xy(one_K_casual['start_lng'], one_K_casual['start_lat'])
geo_group_c = geopandas.GeoDataFrame(
       one_K_casual[['member_casual', 'rideable_type', 'ride_time', 'start_station_name', 
                'start_lat', 'start_lng', 'end_lat', 'end_lng']], 
        geometry=geometry_c
)

heat_data_c = [[point.xy[1][0], point.xy[0][0]] for point in geo_group_c.geometry]
plugins.HeatMap(heat_data_c).add_to(combined_map_even.m2)


# HeatMap for Members
one_K_members.reset_index(drop=True)
geometry_m = geopandas.points_from_xy(one_K_members['start_lng'], one_K_members['start_lat'])
geo_group_m = geopandas.GeoDataFrame(
        one_K_members[['member_casual', 'rideable_type', 'ride_time', 'start_station_name', 
                'start_lat', 'start_lng', 'end_lat', 'end_lng']], 
        geometry=geometry_m
)

heat_data_m = [[point.xy[1][0], point.xy[0][0]] for point in geo_group_m.geometry]
plugins.HeatMap(heat_data_m).add_to(combined_map_even.m1)


combined_map_even