# Citibike Project - Normalizing the Datasets

Normalizing the bike share datsets from `03_normalizing_datasets.ipynb` to prepare for SQL database creation.

## Imports

In [1]:
import pandas as pd
import numpy as np

import polars as pl

import pyarrow as pa
import pyarrow.parquet as pq

import feather

from matplotlib import pyplot as plt

## Files that we'll be working on: 
Two .CSVs, `group1` is ~10Gb and `group2` is ~20Gb

In [2]:
group1_location='/Users/sra/files/projects/citibike_project/combined/group1_combined/group1.csv'
group2_location='/Users/sra/files/projects/citibike_project/combined/group2_combined/group2.csv'

### Let's explore group1 first:

In [12]:
q=(
    pl.scan_csv(group1_location,ignore_errors=True,try_parse_dates=True)
)

group1_pl=q.collect()

In [13]:
group1_pl.head()

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
str,str,datetime[μs],datetime[μs],str,f64,str,f64,f64,f64,f64,f64,str
"""26A3DC47FE0EA3…","""docked_bike""",2021-05-13 12:48:08,2021-05-13 13:07:37,"""Broadway & W 2…",6173.08,"""E 2 St & Avenu…",5515.02,40.742868,-73.989186,40.722174,-73.983688,"""member"""
"""A99F2E1D627B08…","""docked_bike""",2021-05-16 08:30:13,2021-05-16 08:45:47,"""46 Ave & 5 St""",6286.02,"""34th Ave & Ver…",6873.01,40.74731,-73.95451,40.765354,-73.939863,"""member"""
"""43E79A45997B73…","""docked_bike""",2021-05-01 08:38:14,2021-05-01 08:54:27,"""46 Ave & 5 St""",6286.02,"""34th Ave & Ver…",6873.01,40.74731,-73.95451,40.765354,-73.939863,"""member"""
"""8B3CC649F4F588…","""docked_bike""",2021-05-09 08:12:31,2021-05-09 08:27:05,"""46 Ave & 5 St""",6286.02,"""34th Ave & Ver…",6873.01,40.74731,-73.95451,40.765354,-73.939863,"""member"""
"""7101C98F057486…","""docked_bike""",2021-05-27 07:52:27,2021-05-27 08:09:01,"""E 123 St & Lex…",7636.05,"""1 Ave & E 78 S…",7020.09,40.802926,-73.9379,40.771404,-73.953517,"""member"""


In [35]:
group1_pl.groupby('start_station_name').count()\
.sort('count',descending=True)\
.head()

start_station_name,count
str,u32
"""W 21 St & 6 Av…",247908
"""West St & Cham…",219580
"""1 Ave & E 68 S…",208483
"""6 Ave & W 33 S…",197135
"""Broadway & W 2…",194687


There are some junk stations in the dataset:

In [48]:
q=(
    group1_pl.lazy()
    .groupby('start_station_name')
    .agg(pl.count('start_station_name').alias('count'))
    .filter(
        (pl.col('count')>=10)
    )
    .sort('count',descending=True)
    # .limit(5)
)

group1_pl_count=q.collect()
print(group1_pl_count)

shape: (2_027, 2)
┌──────────────────────────┬────────┐
│ start_station_name       ┆ count  │
│ ---                      ┆ ---    │
│ str                      ┆ u32    │
╞══════════════════════════╪════════╡
│ W 21 St & 6 Ave          ┆ 247908 │
│ West St & Chambers St    ┆ 219580 │
│ 1 Ave & E 68 St          ┆ 208483 │
│ 6 Ave & W 33 St          ┆ 197135 │
│ …                        ┆ …      │
│ Prototype Lab            ┆ 15     │
│ Apache                   ┆ 15     │
│ Rogers Pl & E 165 St_old ┆ 11     │
│ 4455.10                  ┆ 11     │
└──────────────────────────┴────────┘


In [53]:
q=(
    group1_pl.lazy()
    .groupby('start_station_id')
    .agg(pl.count('start_station_id').alias('count'))
    # .filter(
        # (pl.col('count')>=1)
    # )
    .sort('count',descending=True)
    # .limit(5)
)

group1_pl_count=q.collect()
print(group1_pl_count)

shape: (1_927, 2)
┌──────────────────┬────────┐
│ start_station_id ┆ count  │
│ ---              ┆ ---    │
│ f64              ┆ u32    │
╞══════════════════╪════════╡
│ 6140.05          ┆ 247908 │
│ 5329.03          ┆ 219580 │
│ 6822.09          ┆ 208483 │
│ 6364.07          ┆ 197135 │
│ …                ┆ …      │
│ 5548.01          ┆ 44     │
│ 3704.01          ┆ 33     │
│ 4014.01          ┆ 30     │
│ 8419.03          ┆ 2      │
└──────────────────┴────────┘


## Normalize group1

Ride table:

In [54]:
group1_pl_ridenorm=group1_pl.select(pl.col('*').exclude('start_station_name',
                                         'end_station_name',
                                         'start_lat',
                                         'end_lat',
                                         'start_lng',
                                         'end_lng'))

print(group1_pl_ridenorm)

shape: (55_800_085, 7)
┌──────────────┬────────────┬──────────────┬──────────────┬────────────┬────────────┬──────────────┐
│ ride_id      ┆ rideable_t ┆ started_at   ┆ ended_at     ┆ start_stat ┆ end_statio ┆ member_casua │
│ ---          ┆ ype        ┆ ---          ┆ ---          ┆ ion_id     ┆ n_id       ┆ l            │
│ str          ┆ ---        ┆ datetime[μs] ┆ datetime[μs] ┆ ---        ┆ ---        ┆ ---          │
│              ┆ str        ┆              ┆              ┆ f64        ┆ f64        ┆ str          │
╞══════════════╪════════════╪══════════════╪══════════════╪════════════╪════════════╪══════════════╡
│ 26A3DC47FE0E ┆ docked_bik ┆ 2021-05-13   ┆ 2021-05-13   ┆ 6173.08    ┆ 5515.02    ┆ member       │
│ A3A3         ┆ e          ┆ 12:48:08     ┆ 13:07:37     ┆            ┆            ┆              │
│ A99F2E1D627B ┆ docked_bik ┆ 2021-05-16   ┆ 2021-05-16   ┆ 6286.02    ┆ 6873.01    ┆ member       │
│ 088F         ┆ e          ┆ 08:30:13     ┆ 08:45:47     ┆         

Station table:

In [56]:
group1_pl_stationnorm=group1_pl.select(pl.col('*').exclude('ride_id',
                                         'rideable_type',
                                         'started_at',
                                         'ended_at',
                                         'member_casual'))

print(group1_pl_stationnorm)

shape: (55_800_085, 8)
┌────────────┬────────────┬────────────┬────────────┬───────────┬─────────┬───────────┬────────────┐
│ start_stat ┆ start_stat ┆ end_statio ┆ end_statio ┆ start_lat ┆ start_l ┆ end_lat   ┆ end_lng    │
│ ion_name   ┆ ion_id     ┆ n_name     ┆ n_id       ┆ ---       ┆ ng      ┆ ---       ┆ ---        │
│ ---        ┆ ---        ┆ ---        ┆ ---        ┆ f64       ┆ ---     ┆ f64       ┆ f64        │
│ str        ┆ f64        ┆ str        ┆ f64        ┆           ┆ f64     ┆           ┆            │
╞════════════╪════════════╪════════════╪════════════╪═══════════╪═════════╪═══════════╪════════════╡
│ Broadway & ┆ 6173.08    ┆ E 2 St &   ┆ 5515.02    ┆ 40.742868 ┆ -73.989 ┆ 40.722174 ┆ -73.983688 │
│ W 25 St    ┆            ┆ Avenue B   ┆            ┆           ┆ 186     ┆           ┆            │
│ 46 Ave & 5 ┆ 6286.02    ┆ 34th Ave & ┆ 6873.01    ┆ 40.74731  ┆ -73.954 ┆ 40.765354 ┆ -73.939863 │
│ St         ┆            ┆ Vernon     ┆            ┆           ┆ 51

## Convert the `polars` tables into a form that will make it easy to convert to `MySQL`

In [58]:
import mysql.connector

# Create a DataFrame (replace with your actual DataFrame)
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie'],
                   'Age': [25, 30, 35],
                   'City': ['New York', 'London', 'Paris']})

ModuleNotFoundError: No module named 'mysql'

In [None]:
# MySQL connection details
host = 'localhost'
user = 'root'
password = 'rootroot'
database = 'citibike'
table = 'rides'

# Establish a connection to the MySQL server
cnx = mysql.connector.connect(host=host, user=user, password=password, database=database)

# Create a cursor object to execute SQL queries
cursor = cnx.cursor()

# Create table query
create_table_query = f"CREATE TABLE {table} (Name VARCHAR(50), Age INT, City VARCHAR(50))"
cursor.execute(create_table_query)

# Insert data into the table
for _, row in df.iterrows():
    insert_query = f"INSERT INTO {table} (Name, Age, City) VALUES (%s, %s, %s)"
    values = (row['Name'], row['Age'], row['City'])
    cursor.execute(insert_query, values)

# Commit the changes and close the connection
cnx.commit()
cursor.close()
cnx.close()

print(f"Data has been inserted into table '{table}' in database '{database}'.")

In [39]:
group1_pd=pd.read_csv(group1_location)
group1_pd.info()

  group1_pd=pd.read_csv(group1_location)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55800085 entries, 0 to 55800084
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 5.4+ GB


In [75]:
group1_pd['start_station_name'].value_counts()

W 21 St & 6 Ave                 247908
West St & Chambers St           219580
1 Ave & E 68 St                 208483
6 Ave & W 33 St                 197135
Broadway & W 25 St              194687
                                 ...  
MTL-ECO51-1                          2
Pier 40 Dock Station                 2
Anthony Ave & E Burnside Ave         2
Park Ave Depot                       1
Pier 40 X2                           1
Name: start_station_name, Length: 2037, dtype: int64

In [78]:
group1_pd['end_station_name'].value_counts()

W 21 St & 6 Ave          248847
West St & Chambers St    221084
1 Ave & E 68 St          208961
6 Ave & W 33 St          195770
Broadway & W 25 St       195068
                          ...  
7 Ave & Bleecker St           1
StuyTown Depot                1
JCBS Depot                    1
NYCBS DEPOT - PITT            1
York St                       1
Name: end_station_name, Length: 2050, dtype: int64

There seems to be some junk in the columns - I assumed it was clean when I downloaded it from Citibike, which I guess was a mistake. 

I'll need to clean the CSV before I convert the database to be SQL-ready in a future notebook.

In [None]:
group1_pd.isna()

Displaying the value counts to disentangle the issue here:

In [None]:
# Calculate value counts
# counts = df['Column'].value_counts()

# Filter based on value counts
# filtered_df = df[df['Column'].isin(counts[counts >= 100].index)]

# counts=group1_pd['start_station_name'].value_counts()

# group1_pd[group1_pd['start_station_name'].isin(counts[counts >=100].index)].plot(kind='bar')

In [77]:
# group1_pl=group1_pd.

q=(
    pl.scan_csv(group1_location)
    # .filter(pl.col('member_casual')=='member')
    # .groupby('start_station_name')
    # .with_columns([pl.col('count').count().alias('count')])
    # .sort(pl.col(''))
)

df=q.collect()

df

ComputeError: Could not parse `Lab - NYC` as dtype `f64` at column 'start_station_id' (column number 6).
The current offset in the file is 2594650959 bytes.

You might want to try:
- increasing `infer_schema_length` (e.g. `infer_schema_length=10000`),
- specifying correct dtype with the `dtypes` argument
- setting `ignore_errors` to `True`,
- adding `Lab - NYC` to the `null_values` list.

Use `parquvalue_counts store the large dataframes. They are currently in `.CSV` format and we will convert them to `parquet` format now.

In [68]:
group1=pd.read_csv(group1_location)

group1_pq=pa.Table.from_pandas(group1)

parquet_file_path='/Users/sra/files/projects/citibike_project/combined/group1_combined'
pq.write_table(group1_pq, parquet_file_path)

  group1=pd.read_csv(group1_location)


ArrowInvalid: ("Could not convert '6572.08' with type str: tried to convert to double", 'Conversion failed for column start_station_id with type object')

In [65]:
# convert .CSV to .parquet

group1=pd.read_csv(group1_location)
group1=group1.to_parquet(group1_location)
group1.head()

  group1=pd.read_csv(group1_location)


ArrowInvalid: ("Could not convert '6572.08' with type str: tried to convert to double", 'Conversion failed for column start_station_id with type object')

Use the `polars` package to manipulate the large dataframes, `group1` and `group2`.

In [52]:
q=(
    pl.scan_csv(group1_location)
    # .filter(pl.col('member_casual')=='member')
    .groupby('start_station_name')
    # .with_columns([pl.col('count').count().alias('count')])
    # .sort(pl.col(''))
)

df=q.collect()

AttributeError: 'LazyGroupBy' object has no attribute 'collect'

In [46]:
df

start_station_name,count
str,u32
"""W 21 St & 6 Av…",247908
"""West St & Cham…",219580
"""1 Ave & E 68 S…",208483
"""6 Ave & W 33 S…",197135
"""Broadway & W 2…",194687
"""Broadway & E 1…",190655
"""University Pl …",186543
"""Cleveland Pl &…",181162
"""E 33 St & 1 Av…",178145
"""E 17 St & Broa…",176587


In [None]:
q=(
    pl.scan_csv('/Users/sra/files/projects/citibike_project/combined/group1_combined/group1.csv')
    .groupby(by='start_station_name').count()
    .sort(pl.col('count'),descending=True)
)

df=q.collect()

In [None]:
df