In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sk

bike_daily = pd.read_csv('daily_weather.csv')
bike_master = pd.read_csv('bike_full.csv', dtype={5: str, 7: str})  # Convert both columns to strings
bike_hourly = pd.read_csv('hourly_weather.csv')



In [4]:
bike_master_clean = bike_master.copy()

# Convert datetime columns
bike_master_clean['started_at'] = pd.to_datetime(bike_master_clean['started_at'])
bike_master_clean['ended_at'] = pd.to_datetime(bike_master_clean['ended_at'])

# Convert category columns
category_columns = ['rideable_type', 'start_station_name', 'end_station_name', 'member_casual']
for col in category_columns:
    bike_master_clean[col] = bike_master_clean[col].astype('category')

# Convert string columns (IDs)
string_columns = ['ride_id', 'start_station_id', 'end_station_id']
for col in string_columns:
    bike_master_clean[col] = bike_master_clean[col].astype('string')



In [5]:
bike_master_clean.dtypes
bike_master_clean.head()
bike_master_clean.shape
null_percentages = (bike_master_clean.isnull().sum() / len(bike_master_clean)) * 100
bike_master_clean_no_nulls = bike_master_clean.dropna()
bike_master_clean_no_nulls.isnull().sum()


ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    0
start_station_id      0
end_station_name      0
end_station_id        0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
dtype: int64

In [6]:
# Print shape before removing nulls
print("Shape before removing nulls:", bike_master_clean.shape)

# Remove nulls and store in new DataFrame
bike_master_clean_no_nulls = bike_master_clean.dropna()

# Print shape after removing nulls
print("Shape after removing nulls:", bike_master_clean_no_nulls.shape)

# Calculate and print the difference
rows_removed = bike_master_clean.shape[0] - bike_master_clean_no_nulls.shape[0]
print(f"\nTotal rows removed: {rows_removed}")
print(f"Percentage of rows removed: {(rows_removed/bike_master_clean.shape[0]*100):.2f}%")

Shape before removing nulls: (10693997, 13)
Shape after removing nulls: (9572692, 13)

Total rows removed: 1121305
Percentage of rows removed: 10.49%


In [7]:
bike_new = bike_master_clean_no_nulls


In [8]:
bike_new.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,5CB9DFCECF79AF84,classic_bike,2021-01-01 00:08:33,2021-01-01 00:33:53,Maine Ave & 9th St SW,31646.0,Rosslyn Metro / Wilson Blvd & Ft Myer Dr,31015.0,38.88044,-77.025236,38.8946,-77.072305,member
2,E74069873161EE33,electric_bike,2021-01-01 00:14:32,2021-01-01 00:28:45,17th & Corcoran St NW,31214.0,14th & Belmont St NW,31119.0,38.912138,-77.038568,38.92087,-77.031691,member
3,91F95E512CABC46A,classic_bike,2021-01-01 00:15:45,2021-01-01 00:21:20,Wilson Blvd. & N. Vermont St.,31926.0,Wilson Blvd. & N. Vermont St.,31926.0,38.879477,-77.114563,38.879477,-77.114563,member
4,DA46A05139C0EA2F,classic_bike,2021-01-01 00:17:46,2021-01-01 00:21:00,11th & Park Rd NW,31651.0,14th & Newton St NW,31649.0,38.931322,-77.028247,38.931991,-77.032956,member
6,0535306B61EF92F9,classic_bike,2021-01-01 00:19:06,2021-01-01 00:36:28,Bladensburg Rd & Benning Rd NE,31617.0,Good Hope Rd & MLK Ave SE,31802.0,38.900413,-76.982872,38.867373,-76.988039,casual


In [11]:
bike_new.duplicated().sum()

np.int64(0)

In [13]:
rides_count = bike_new.groupby(['rideable_type','member_casual'])['ride_id'].count().reset_index()
rides_count

  rides_count = bike_new.groupby(['rideable_type','member_casual'])['ride_id'].count().reset_index()


Unnamed: 0,rideable_type,member_casual,ride_id
0,classic_bike,casual,2690702
1,classic_bike,member,4999767
2,docked_bike,casual,534334
3,docked_bike,member,2
4,electric_bike,casual,544069
5,electric_bike,member,803818


In [15]:
bike_hourly.head()

Unnamed: 0,Casual,Date,Hour,Member,Total_rides,relativehumidity_2m (%),temperature_2m (°C),weathercode (wmo code),windspeed_10m (km/h)
0,20.0,2021-01-01,0,26.0,46.0,90,2.4,1,10.2
1,23.0,2021-01-01,1,23.0,46.0,88,1.8,1,10.0
2,20.0,2021-01-01,2,28.0,48.0,87,1.1,2,10.1
3,9.0,2021-01-01,3,7.0,16.0,81,1.6,2,10.6
4,7.0,2021-01-01,4,5.0,12.0,77,1.6,2,9.9


In [16]:
bike_hourly.dtypes

Casual                     float64
Date                        object
Hour                         int64
Member                     float64
Total_rides                float64
relativehumidity_2m (%)      int64
temperature_2m (°C)        float64
weathercode (wmo code)       int64
windspeed_10m (km/h)       float64
dtype: object

In [None]:
bike_hourly_new = bike_hourly.copy() #making a 

In [20]:
bike_hourly_new.head()

Unnamed: 0,Casual,Date,Hour,Member,Total_rides,relativehumidity_2m (%),temperature_2m (°C),weathercode (wmo code),windspeed_10m (km/h)
0,20.0,2021-01-01,0,26.0,46.0,90,2.4,1,10.2
1,23.0,2021-01-01,1,23.0,46.0,88,1.8,1,10.0
2,20.0,2021-01-01,2,28.0,48.0,87,1.1,2,10.1
3,9.0,2021-01-01,3,7.0,16.0,81,1.6,2,10.6
4,7.0,2021-01-01,4,5.0,12.0,77,1.6,2,9.9


In [21]:
bike_hourly_new.describe()

Unnamed: 0,Casual,Hour,Member,Total_rides,relativehumidity_2m (%),temperature_2m (°C),weathercode (wmo code),windspeed_10m (km/h)
count,26269.0,26280.0,26269.0,26269.0,26280.0,26280.0,26280.0,26280.0
mean,159.93007,11.5,247.165632,407.095702,62.967884,14.412782,7.228311,10.171298
std,167.330426,6.922318,225.283539,373.005409,22.72432,10.285751,17.630961,5.261033
min,0.0,0.0,0.0,1.0,5.0,-14.6,0.0,0.0
25%,28.0,5.75,55.0,85.0,46.0,6.1,0.0,6.5
50%,105.0,11.5,203.0,319.0,64.0,14.8,1.0,9.2
75%,237.0,17.25,371.0,616.0,82.0,22.7,2.0,12.9
max,1206.0,23.0,1534.0,2262.0,100.0,41.5,75.0,40.1
