# Imports

In [4]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point


# Initial Data Processing

## Loading and inspecting the data

In [5]:
locations = pd.read_csv('data/Capital_Bikeshare_Locations.csv')
daily_rent = pd.read_parquet('data/daily-rent.parquet')
daily_rent_sampled = pd.read_parquet('data/daily-rent-sampled.parquet')
daily_rent_sampled.to_csv('data/daily-rent-sampled.csv', index=False)
stations = pd.read_csv('data/stations.csv')
weather = pd.read_csv('data/Washington,DC,USA 2024-01-01 to 2024-12-31.csv')
metro_bus_stops = pd.read_csv('data/Metro_Bus_Stops.csv')
shuttle_bus_stops = pd.read_csv('data/Shuttle_Bus_Stops.csv')

cbd = gpd.read_file('data/DDOT_Central_Business_District.geojson')
parking_zones = gpd.read_file('data/Residential_and_Visitor_Parking_Zones.geojson')

In [6]:
print(locations.shape)
print('--------------------------------')

print(locations.head())
print('--------------------------------')

locations.info()
print('--------------------------------')

print(locations.describe())
print('--------------------------------')

print(locations.isnull().sum())
print('--------------------------------')

print(locations.duplicated().sum())
print('--------------------------------')

print(locations.nunique())
print('--------------------------------')

info_df = pd.DataFrame({
    'data_type':          locations.dtypes,
    'missing_val':        locations.isnull().sum(),
    'missing_val_ratio':  (locations.isnull().sum() / locations.shape[0] * 100).round().astype(int),
    'unique_vals':        locations.nunique()
})

# info_df = info_df.reset_index().rename(columns={'index': 'column_name'})

info_df


(794, 29)
--------------------------------
              X             Y                                            NAME  \
0 -8.577104e+06  4.705679e+06                                Lincoln Memorial   
1 -8.609481e+06  4.714716e+06    W&OD Trail/Sunset Hills Rd & Isaac Newton Sq   
2 -8.575867e+06  4.705657e+06                   17th St & Independence Ave SW   
3 -8.574188e+06  4.706622e+06                                   8th & D St NW   
4 -8.567161e+06  4.706864e+06  Anacostia Ave & Benning Rd NE / River Terrace    

  STATION_TYPE                            STATION_ID  STATION_STATUS  \
0      classic  08254284-1f3f-11e7-bf6b-3863bb334450             NaN   
1      classic  08263fbd-1f3f-11e7-bf6b-3863bb334450             NaN   
2      classic  082623bf-1f3f-11e7-bf6b-3863bb334450             NaN   
3      classic  08256ac9-1f3f-11e7-bf6b-3863bb334450             NaN   
4      classic  082518eb-1f3f-11e7-bf6b-3863bb334450             NaN   

            LAST_REPORTED  NUM_DOCKS_

Unnamed: 0,data_type,missing_val,missing_val_ratio,unique_vals
X,float64,0,0,794
Y,float64,0,0,794
NAME,object,0,0,794
STATION_TYPE,object,0,0,1
STATION_ID,object,0,0,794
STATION_STATUS,float64,794,100,0
LAST_REPORTED,object,0,0,125
NUM_DOCKS_AVAILABLE,int64,0,0,33
NUM_DOCKS_DISABLED,int64,0,0,3
NUM_BIKES_AVAILABLE,int64,0,0,31


In [7]:
print(daily_rent.shape)
print('--------------------------------')

print(daily_rent.head())
print('--------------------------------')

daily_rent.info()
print('--------------------------------')

print(daily_rent.describe())
print('--------------------------------')

print(daily_rent.isnull().sum())
print('--------------------------------')

print(daily_rent.duplicated().sum())
print('--------------------------------')

print(daily_rent.nunique())
print('--------------------------------')

info_df = pd.DataFrame({
    'data_type':          daily_rent.dtypes,
    'missing_val':        daily_rent.isnull().sum(),
    'missing_val_ratio':  (daily_rent.isnull().sum() / daily_rent.shape[0] * 100).round().astype(int),
    'unique_vals':        daily_rent.nunique()
})

# info_df = info_df.reset_index().rename(columns={'index': 'column_name'})

info_df


(6114323, 13)
--------------------------------
            ride_id  rideable_type          started_at            ended_at  \
0  748A93D7DE8A41CD   classic_bike 2024-01-25 15:49:59 2024-01-25 15:52:35   
1  75CBFD136F06305B   classic_bike 2024-01-02 16:44:58 2024-01-02 16:53:25   
2  0536C9720F87E04C   classic_bike 2024-01-24 15:40:15 2024-01-24 15:43:55   
3  9E17390C218783B5   classic_bike 2024-01-04 15:35:00 2024-01-04 15:37:35   
4  00727D0E773CDFF7  electric_bike 2024-01-05 12:27:58 2024-01-05 12:35:40   

  start_station_name start_station_id     end_station_name end_station_id  \
0      1st & O St NW            31519        1st & L St NW          31677   
1      1st & O St NW            31519  4th & College St NW          31138   
2      1st & O St NW            31519        1st & L St NW          31677   
3      1st & O St NW            31519        1st & L St NW          31677   
4      1st & O St NW            31519       10th & G St NW          31274   

   start_lat  start_l

Unnamed: 0,data_type,missing_val,missing_val_ratio,unique_vals
ride_id,string[python],0,0,6114182
rideable_type,string[python],0,0,2
started_at,datetime64[ns],0,0,5866403
ended_at,datetime64[ns],0,0,5869571
start_station_name,string[python],1190966,19,818
start_station_id,string[python],1190966,19,803
end_station_name,string[python],1236167,20,818
end_station_id,string[python],1237694,20,803
start_lat,float64,0,0,468450
start_lng,float64,0,0,501510


In [8]:
print(stations.shape)
print('--------------------------------')

print(stations.head())
print('--------------------------------')

stations.info()
print('--------------------------------')

print(stations.describe())
print('--------------------------------')

print(stations.isnull().sum())
print('--------------------------------')

print(stations.duplicated().sum())
print('--------------------------------')

print(stations.nunique())
print('--------------------------------')

info_df = pd.DataFrame({
    'data_type':          stations.dtypes,
    'missing_val':        stations.isnull().sum(),
    'missing_val_ratio':  (stations.isnull().sum() / stations.shape[0] * 100).round().astype(int),
    'unique_vals':        stations.nunique()
})

# info_df = info_df.reset_index().rename(columns={'index': 'column_name'})

info_df


(821, 2)
--------------------------------
      id                          name
0  30200  9th St & Pennsylvania Ave NW
1  30201                 9th & G St NW
2  31000           Eads St & 15th St S
3  31002        Crystal Dr & 20th St S
4  31003        Crystal Dr & 15th St S
--------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 821 entries, 0 to 820
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      821 non-null    int64 
 1   name    821 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.0+ KB
--------------------------------
                 id
count    821.000000
mean   31650.894032
std      447.232558
min    30200.000000
25%    31263.000000
50%    31630.000000
75%    32022.000000
max    33200.000000
--------------------------------
id      0
name    0
dtype: int64
--------------------------------
0
--------------------------------
id      804
name    820
dtype: int64
--------

Unnamed: 0,data_type,missing_val,missing_val_ratio,unique_vals
id,int64,0,0,804
name,object,0,0,820


In [9]:
print(weather.shape)
print('--------------------------------')

print(weather.head())
print('--------------------------------')

weather.info()
print('--------------------------------')

print(weather.describe())
print('--------------------------------')

print(weather.isnull().sum())
print('--------------------------------')

print(weather.duplicated().sum())
print('--------------------------------')

print(weather.nunique())
print('--------------------------------')

info_df = pd.DataFrame({
    'data_type':          weather.dtypes,
    'missing_val':        weather.isnull().sum(),
    'missing_val_ratio':  (weather.isnull().sum() / weather.shape[0] * 100).round().astype(int),
    'unique_vals':        weather.nunique()
})

# info_df = info_df.reset_index().rename(columns={'index': 'column_name'})

info_df


(366, 17)
--------------------------------
                name    datetime  tempmax  tempmin  temp  humidity  windspeed  \
0  Washington,DC,USA  2024-01-01      6.1      3.8   4.9      74.7       21.9   
1  Washington,DC,USA  2024-01-02      7.7      2.0   4.7      57.3       29.2   
2  Washington,DC,USA  2024-01-03      8.8      0.9   4.4      59.5       14.2   
3  Washington,DC,USA  2024-01-04      8.1      0.7   4.7      55.2       35.0   
4  Washington,DC,USA  2024-01-05      4.4     -2.4   0.9      55.0       21.3   

   windspeedmax  windspeedmean  windspeedmin  cloudcover              sunrise  \
0          21.9           11.9           0.0        90.3  2024-01-01T07:26:50   
1          29.2           18.6           6.7        47.5  2024-01-02T07:26:58   
2          14.2            8.9           0.7        68.7  2024-01-03T07:27:04   
3          35.0           17.0           0.0        68.5  2024-01-04T07:27:07   
4          21.3           10.6           5.1        44.5  2024-01

Unnamed: 0,data_type,missing_val,missing_val_ratio,unique_vals
name,object,0,0,1
datetime,object,0,0,366
tempmax,float64,0,0,180
tempmin,float64,0,0,200
temp,float64,0,0,222
humidity,float64,0,0,276
windspeed,float64,0,0,188
windspeedmax,float64,0,0,188
windspeedmean,float64,0,0,153
windspeedmin,float64,0,0,73


In [10]:
print(metro_bus_stops.shape)
print('--------------------------------')

print(metro_bus_stops.head())
print('--------------------------------')

metro_bus_stops.info()
print('--------------------------------')

print(metro_bus_stops.describe())
print('--------------------------------')

print(metro_bus_stops.isnull().sum())
print('--------------------------------')

print(metro_bus_stops.duplicated().sum())
print('--------------------------------')

print(metro_bus_stops.nunique())
print('--------------------------------')

info_df = pd.DataFrame({
    'data_type':          metro_bus_stops.dtypes,
    'missing_val':        metro_bus_stops.isnull().sum(),
    'missing_val_ratio':  (metro_bus_stops.isnull().sum() / metro_bus_stops.shape[0] * 100).round().astype(int),
    'unique_vals':        metro_bus_stops.nunique()
})

# info_df = info_df.reset_index().rename(columns={'index': 'column_name'})

info_df


(10044, 79)
--------------------------------
              X             Y  EGIS_ID   REG_ID  BSTP_GEO_ID BSTP_OPS_TCD  \
0 -8.581259e+06  4.715817e+06  1311739  1003448        15663          PRS   
1 -8.540493e+06  4.714288e+06  1310541  3003035        14833          PRS   
2 -8.591135e+06  4.698290e+06  1302469  5000849         2134          PRS   
3 -8.577431e+06  4.682321e+06  1305286  5001858        11889          PRS   
4 -8.578561e+06  4.680594e+06  1305279  5001885        11919          PRS   

            BSTP_EFF_DATE BSTP_TCD               AT_STR            ON_STR  \
0  2011/08/22 04:00:00+00      UNK           44TH ST NW     JENIFER ST NW   
1  2001/12/30 05:00:00+00      REV          HARBOUR WAY  MITCHELLVILLE RD   
2  1901/01/01 05:00:00+00      REV  SLEEPY HOLLOW MANOR     COLUMBIA PIKE   
3  1974/11/25 05:00:00+00      REV           CUSHMAN PL    WAYNEWOOD BLVD   
4  1901/01/01 05:00:00+00      REV            PRICES LN    VERNON VIEW DR   

   ...  SE_ANNO_CAD_DATA     

Unnamed: 0,data_type,missing_val,missing_val_ratio,unique_vals
X,float64,0,0,10044
Y,float64,0,0,10044
EGIS_ID,int64,0,0,10044
REG_ID,int64,0,0,10044
BSTP_GEO_ID,int64,0,0,9904
...,...,...,...,...
EDITED,object,0,0,1
OBJECTID,int64,0,0,10044
SNOWPRIORITY,object,9497,95,1
BSTP_OPS_FTU_TCD,object,0,0,1


In [11]:
print(shuttle_bus_stops.shape)
print('--------------------------------')

print(shuttle_bus_stops.head())
print('--------------------------------')

shuttle_bus_stops.info()
print('--------------------------------')

print(shuttle_bus_stops.describe())
print('--------------------------------')

print(shuttle_bus_stops.isnull().sum())
print('--------------------------------')

print(shuttle_bus_stops.duplicated().sum())
print('--------------------------------')

print(shuttle_bus_stops.nunique())
print('--------------------------------')

info_df = pd.DataFrame({
    'data_type':          shuttle_bus_stops.dtypes,
    'missing_val':        shuttle_bus_stops.isnull().sum(),
    'missing_val_ratio':  (shuttle_bus_stops.isnull().sum() / shuttle_bus_stops.shape[0] * 100).round().astype(int),
    'unique_vals':        shuttle_bus_stops.nunique()
})

# info_df = info_df.reset_index().rename(columns={'index': 'column_name'})

info_df


(102, 29)
--------------------------------
              X             Y               COMPANY  \
0 -8.581105e+06  4.715777e+06     Reston Limousine    
1 -8.571861e+06  4.714828e+06     Reston Limousine    
2 -8.580446e+06  4.714237e+06  American University    
3 -8.580342e+06  4.714213e+06  American University    
4 -8.580407e+06  4.713930e+06  American University    

                                             ADDRESS  \
0                             4350 JENIFER STREET NW   
1                             550 GALLOWAY STREET NE   
2  TENLEYTOWN - AU METRO STATION ENTRANCE - NORTH...   
3              FORT DRIVE NW AND ALBEMARLE STREET NW   
4            NEBRASKA AVENUE NW AND TENLEY CIRCLE NW   

                      ATTRACTION  \
0                Mazza Gallarie    
1                            NaN   
2              Tenley Town Metro   
3             Metro on Fort road   
4  Tenley campus to Main Campus    

                                          CROSSROADS METERS METRO_BUS_ZO

Unnamed: 0,data_type,missing_val,missing_val_ratio,unique_vals
X,float64,0,0,96
Y,float64,0,0,96
COMPANY,object,0,0,25
ADDRESS,object,0,0,98
ATTRACTION,object,26,25,71
CROSSROADS,object,18,18,80
METERS,object,7,7,3
METRO_BUS_ZONE,object,9,9,3
METRO_STOP_ID,object,79,77,22
OTHER,object,25,25,21


In [12]:
print(cbd.shape)
print('--------------------------------')

print(cbd.head())
print('--------------------------------')


(1, 11)
--------------------------------
   OBJECTID  ID      GIS_ID                                GLOBALID CREATOR  \
0         1   0  DDOT_CBD_1  {88E0BE6C-77A4-4C9B-96DA-39C3D223AA41}    None   

  CREATED EDITOR EDITED  SHAPEAREA  SHAPELEN  \
0    None   None   None          0         0   

                                            geometry  
0  POLYGON ((-77.05028 38.91194, -77.05029 38.911...  
--------------------------------


In [13]:
print(parking_zones.shape)
print('--------------------------------')

print(parking_zones.head())
print('--------------------------------')


(40, 14)
--------------------------------
     NAME  RPP_ZONE ANC_ID                                            WEB_URL  \
0  ANC 4A         4     4A  http://anc.dc.gov/page/advisory-neighborhood-c...   
1  ANC 3G         3     3G  http://anc.dc.gov/page/advisory-neighborhood-c...   
2  ANC 4B         4     4B  http://anc.dc.gov/page/advisory-neighborhood-c...   
3  ANC 3F         3     3F  http://anc.dc.gov/page/advisory-neighborhood-c...   
4  ANC 4C         4     4C  http://anc.dc.gov/page/advisory-neighborhood-c...   

                        GIS_ID    SHAPE_LENG CREATOR CREATED EDITOR EDITED  \
0  Res_Visitor_Parking_Zone_17  19149.358478    None    None   None   None   
1  Res_Visitor_Parking_Zone_16  13274.053932    None    None   None   None   
2  Res_Visitor_Parking_Zone_10  10937.574323    None    None   None   None   
3  Res_Visitor_Parking_Zone_15  11804.927060    None    None   None   None   
4   Res_Visitor_Parking_Zone_7   9774.918003    None    None   None   None   

  

In [14]:
cbd   = cbd.to_crs(epsg=4326)
parking_zones = parking_zones.to_crs(epsg=4326)

## Data Cleanup, Merging, and Imputation.

### Stations and Locations

In [22]:
stations_master = locations[['STATION_ID','NAME','CAPACITY','LATITUDE','LONGITUDE']]
stations_master = stations_master.rename(columns={'STATION_ID':'station_id','NAME':'station_name','CAPACITY':'capacity','LATITUDE':'lat','LONGITUDE':'lon'})
stations['id']=stations['id'].astype(str)
stations_master = stations_master.merge(stations, left_on='station_id', right_on='id', how='left')
stations_master['station_name'] = stations_master['station_name'].fillna(stations_master['name'])
stations_master = stations_master.drop(columns=['id','name'])
stations_master = stations_master.dropna(subset=['lat','lon'])
stations_master.to_parquet("stations_master.parquet", index=False)

In [23]:
print(stations_master.shape)
print('--------------------------------')

print(stations_master.head())
print('--------------------------------')

stations_master.info()
print('--------------------------------')

print(stations_master.describe())
print('--------------------------------')

print(stations_master.isnull().sum())
print('--------------------------------')

print(stations_master.duplicated().sum())
print('--------------------------------')

print(stations_master.nunique())
print('--------------------------------')

info_df = pd.DataFrame({
    'data_type':          stations_master.dtypes,
    'missing_val':        stations_master.isnull().sum(),
    'missing_val_ratio':  (stations_master.isnull().sum() / stations_master.shape[0] * 100).round().astype(int),
    'unique_vals':        stations_master.nunique()
})

# info_df = info_df.reset_index().rename(columns={'index': 'column_name'})

info_df


(794, 5)
--------------------------------
                             station_id  \
0  08254284-1f3f-11e7-bf6b-3863bb334450   
1  08263fbd-1f3f-11e7-bf6b-3863bb334450   
2  082623bf-1f3f-11e7-bf6b-3863bb334450   
3  08256ac9-1f3f-11e7-bf6b-3863bb334450   
4  082518eb-1f3f-11e7-bf6b-3863bb334450   

                                     station_name  capacity        lat  \
0                                Lincoln Memorial        25  38.888255   
1    W&OD Trail/Sunset Hills Rd & Isaac Newton Sq        19  38.951419   
2                   17th St & Independence Ave SW        19  38.888097   
3                                   8th & D St NW        24  38.894851   
4  Anacostia Ave & Benning Rd NE / River Terrace         15  38.896544   

         lon  
0 -77.049437  
1 -77.340281  
2 -77.038325  
3 -77.023240  
4 -76.960120  
--------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 794 entries, 0 to 793
Data columns (total 5 columns):
 #   Column        Non-Null

Unnamed: 0,data_type,missing_val,missing_val_ratio,unique_vals
station_id,object,0,0,794
station_name,object,0,0,794
capacity,int64,0,0,29
lat,float64,0,0,789
lon,float64,0,0,792


### Daily Rent

In [36]:
# 1. Load & copy
daily_rent_clean             = daily_rent.copy()

# 2. Parse timestamps & compute duration
daily_rent_clean.rename(columns={"started_at":"start_time","ended_at":"end_time"}, inplace=True)
daily_rent_clean["start_time"] = pd.to_datetime(daily_rent_clean["start_time"], errors="coerce")
daily_rent_clean["end_time"]   = pd.to_datetime(daily_rent_clean["end_time"],   errors="coerce")
daily_rent_clean["trip_duration_mins"] = (
    daily_rent_clean["end_time"] - daily_rent_clean["start_time"]
).dt.total_seconds() / 60

# 3. Cast IDs & categories to string
daily_rent_clean["ride_id"]          = daily_rent_clean["ride_id"].astype("string")
daily_rent_clean["rideable_type"]    = daily_rent_clean["rideable_type"].astype("category")
daily_rent_clean["member_casual"]    = daily_rent_clean["member_casual"].astype("category")
daily_rent_clean["start_station_id"] = daily_rent_clean["start_station_id"].astype("string")
daily_rent_clean["end_station_id"]   = daily_rent_clean["end_station_id"].astype("string")

# 4. Drop unrecoverable rows & out-of-range trips
daily_rent_clean.dropna(subset=["start_time","end_time"], inplace=True)
daily_rent_clean = daily_rent_clean[
    (daily_rent_clean["start_time"].dt.year == 2024) &
    (daily_rent_clean["end_time"].  dt.year == 2024)
]
daily_rent_clean = daily_rent_clean.query("trip_duration_mins > 0 and trip_duration_mins <= 1440")

# 5. Load stations_master & build GeoDataFrame
stations_master = pd.read_parquet("stations_master.parquet")
stations_master["station_id"] = stations_master["station_id"].astype("string")
stations_gdf = gpd.GeoDataFrame(
    stations_master,
    geometry=gpd.points_from_xy(
        stations_master["lon"], stations_master["lat"]
    ),
    crs="EPSG:4326"
)
stations_m = stations_gdf.to_crs(epsg=6933)

# 6. Impute missing start_station_id where lat/lng available
mask_start = daily_rent_clean["start_station_id"].isna() & daily_rent_clean["start_lat"].notna() & daily_rent_clean["start_lng"].notna()
daily_start = daily_rent_clean.loc[mask_start].copy()
daily_start_gdf = gpd.GeoDataFrame(
    daily_start,
    geometry=gpd.points_from_xy(
        daily_start["start_lng"], daily_start["start_lat"]
    ),
    crs="EPSG:4326"
)
daily_s_m = daily_start_gdf.to_crs(epsg=6933)

nearest = [
    (idx, stations_m.loc[stations_m.geometry.distance(row.geometry).idxmin(), "station_id"])
    for idx, row in daily_s_m.iterrows()
]
idxs, vals = zip(*nearest)
fill_start = pd.Series(vals, index=idxs, dtype="string")
daily_rent_clean.loc[fill_start.index, "start_station_id"] = fill_start

# 7. Impute missing end_station_id where lat/lng available
mask_end = daily_rent_clean["end_station_id"].isna() & daily_rent_clean["end_lat"].notna() & daily_rent_clean["end_lng"].notna()
daily_end = daily_rent_clean.loc[mask_end].copy()
daily_end_gdf = gpd.GeoDataFrame(
    daily_end,
    geometry=gpd.points_from_xy(
        daily_end["end_lng"], daily_end["end_lat"]
    ),
    crs="EPSG:4326"
)
daily_e_m = daily_end_gdf.to_crs(epsg=6933)

nearest_end = [
    (idx, stations_m.loc[stations_m.geometry.distance(row.geometry).idxmin(), "station_id"])
    for idx, row in daily_e_m.iterrows()
]
end_idxs, end_vals = zip(*nearest_end)
fill_end = pd.Series(end_vals, index=end_idxs, dtype="string")
daily_rent_clean.loc[fill_end.index, "end_station_id"] = fill_end

# 8. Drop rides still missing station IDs
daily_rent_clean.dropna(subset=["start_station_id","end_station_id"], inplace=True)

# 9. Drop original station name columns to avoid duplicates
daily_rent_clean.drop(columns=["start_station_name","end_station_name"], errors="ignore", inplace=True)

# 10. Merge station names (start & end)
daily_rent_clean = daily_rent_clean.merge(
    stations_master[["station_id","station_name"]],
    left_on="start_station_id", right_on="station_id", how="left"
).rename(columns={"station_name":"start_station_name"}) \
 .drop(columns=["station_id"])

daily_rent_clean = daily_rent_clean.merge(
    stations_master[["station_id","station_name"]],
    left_on="end_station_id", right_on="station_id", how="left"
).rename(columns={"station_name":"end_station_name"}) \
 .drop(columns=["station_id"])

# 12. Save cleaned dataset
daily_rent_clean.to_parquet("daily_rent_cleaned.parquet", index=False)


In [37]:
print(daily_rent_clean.shape)
print('--------------------------------')

print(daily_rent_clean.head())
print('--------------------------------')


print(daily_rent_clean.describe())
print('--------------------------------')

print(daily_rent_clean.isnull().sum())
print('--------------------------------')

print(daily_rent_clean.duplicated().sum())
print('--------------------------------')

print(daily_rent_clean.nunique())
print('--------------------------------')

info_df = pd.DataFrame({
    'data_type':          daily_rent_clean.dtypes,
    'missing_val':        daily_rent_clean.isnull().sum(),
    'missing_val_ratio':  (daily_rent_clean.isnull().sum() / daily_rent_clean.shape[0] * 100).round().astype(int),
    'unique_vals':        daily_rent_clean.nunique()
})

# info_df = info_df.reset_index().rename(columns={'index': 'column_name'})

info_df


(6108141, 14)
--------------------------------
            ride_id  rideable_type          start_time            end_time  \
0  748A93D7DE8A41CD   classic_bike 2024-01-25 15:49:59 2024-01-25 15:52:35   
1  75CBFD136F06305B   classic_bike 2024-01-02 16:44:58 2024-01-02 16:53:25   
2  0536C9720F87E04C   classic_bike 2024-01-24 15:40:15 2024-01-24 15:43:55   
3  9E17390C218783B5   classic_bike 2024-01-04 15:35:00 2024-01-04 15:37:35   
4  00727D0E773CDFF7  electric_bike 2024-01-05 12:27:58 2024-01-05 12:35:40   

  start_station_id end_station_id  start_lat  start_lng    end_lat    end_lng  \
0            31519          31677  38.908643 -77.012365  38.903819 -77.011987   
1            31519          31138  38.908643 -77.012365  38.921233 -77.018135   
2            31519          31677  38.908643 -77.012365  38.903819 -77.011987   
3            31519          31677  38.908643 -77.012365  38.903819 -77.011987   
4            31519          31274  38.908690 -77.012317  38.898243 -77.026235  

Unnamed: 0,data_type,missing_val,missing_val_ratio,unique_vals
ride_id,string[python],0,0,6108015
rideable_type,category,0,0,2
start_time,datetime64[ns],0,0,5860869
end_time,datetime64[ns],0,0,5864042
start_station_id,string[python],0,0,1211
end_station_id,string[python],0,0,1211
start_lat,float64,0,0,468355
start_lng,float64,0,0,501396
end_lat,float64,0,0,1016
end_lng,float64,0,0,1045


### Weather

In [17]:
weather_clean = weather.rename(columns={'date':'date','TMAX':'TMAX','TMIN':'TMIN','PRCP':'PRCP'})
weather_clean ['date'] = pd.to_datetime(weather_clean['date']).dt.date
weather_clean ['PRCP'] = weather_raw['conditions'].str.contains('Rain|Snow', case=False).astype(float)
weather["PRCP"] = weather["PRCP"].fillna(method="ffill")
weather_clean [['TMAX','TMIN']] = weather_clean [['TMAX','TMIN']].apply(pd.to_numeric, errors='coerce')
weather_clean = weather_clean.sort_values('date').interpolate()
weather_clean = weather_clean[['date','TMAX','TMIN','PRCP']]

KeyError: 'date'