In [1]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import os
import argparse

In [2]:
dataset_1 = pd.read_csv('2000_0.csv')

In [3]:
dataset_1.shape

(4372, 3)

In [4]:
dataset_1.head

<bound method NDFrame.head of       latitude  longitude             timestamp
0     41.51166  -74.08417  2023-12-11T10:50:09Z
1     41.51176  -74.08371  2023-12-11T10:50:11Z
2     41.51176  -74.08371  2023-12-11T10:50:11Z
3     41.51088  -74.08337  2023-12-11T10:50:17Z
4     41.51077  -74.08338  2023-12-11T10:50:18Z
...        ...        ...                   ...
4367  40.69711  -74.19120  2023-12-11T13:58:38Z
4368  40.69711  -74.19120  2023-12-11T13:58:38Z
4369  40.69689  -74.19133  2023-12-11T13:58:40Z
4370  40.69672  -74.19143  2023-12-11T13:58:41Z
4371  40.69663  -74.19143  2023-12-11T13:58:42Z

[4372 rows x 3 columns]>

In [5]:
type(dataset_1)

pandas.core.frame.DataFrame

In [10]:
dataset_2 = pd.read_csv('2000_1.csv')

In [11]:
dataset_2.shape

(2393, 3)

In [12]:
type(dataset_2)

pandas.core.frame.DataFrame

In [15]:
dataset_1.info

<bound method DataFrame.info of       latitude  longitude             timestamp
0     41.51166  -74.08417  2023-12-11T10:50:09Z
1     41.51176  -74.08371  2023-12-11T10:50:11Z
2     41.51176  -74.08371  2023-12-11T10:50:11Z
3     41.51088  -74.08337  2023-12-11T10:50:17Z
4     41.51077  -74.08338  2023-12-11T10:50:18Z
...        ...        ...                   ...
4367  40.69711  -74.19120  2023-12-11T13:58:38Z
4368  40.69711  -74.19120  2023-12-11T13:58:38Z
4369  40.69689  -74.19133  2023-12-11T13:58:40Z
4370  40.69672  -74.19143  2023-12-11T13:58:41Z
4371  40.69663  -74.19143  2023-12-11T13:58:42Z

[4372 rows x 3 columns]>

In [17]:
pd.isnull (dataset_1).sum()

latitude     0
longitude    0
timestamp    0
dtype: int64

In [18]:
pd.isnull(dataset_2).sum()

latitude     0
longitude    0
timestamp    0
dtype: int64

In [None]:
# Define the command-line arguments
parser = argparse.ArgumentParser(description='Process GPS data from a Parquet file.')
parser.add_argument('--to_process', type=str, required=True, help='Path to the Parquet file to be processed.')
parser.add_argument('--output_dir', type=str, required=True, help='The folder to store the resulting CSV files.')
args = parser.parse_args()

In [None]:
# Read the Parquet file into a pandas dataframe
df = pq.read_table(args.to_process).to_pandas()

In [None]:
# Convert the timestamp column to datetime format
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
# Sort the dataframe by unit and timestamp
df = df.sort_values(by=['unit', 'timestamp'])

In [None]:
# Create a new column to indicate the start of a new trip
df['new_trip'] = (df['timestamp'].diff() > pd.Timedelta(hours=7)) | (df['unit'] != df['unit'].shift())

In [None]:
# Assign a trip number to each row based on the new_trip column
df['trip_number'] = df['new_trip'].cumsum() - 1

In [None]:

# Loop through the unique units and trips in the dataframe
for unit, trip in df.groupby(['unit', 'trip_number']):
    # Create a sub-dataframe for each unit and trip
    sub_df = trip[['latitude', 'longitude', 'timestamp']]
    # Create a file name based on the unit and trip number
    file_name = f'{unit[0]}_{unit[1]}.csv'
    # Save the sub-dataframe as a CSV file in the output directory
    sub_df.to_csv(os.path.join(args.output_dir, file_name), index=False)