## Libraries

In [None]:
import pandas as pd
import os
import json

## Setting up
Load a data sample of 100,000 lines to get started.

### Constants

In [None]:
# All data paths hang off a single root, relative to the notebook's working directory.
_DATA_ROOT = os.path.join('..', 'data')

# Raw CSV that the loading cell reads.
SOURCE_FILE = os.path.join(_DATA_ROOT, 'raw', 'train.csv')
# Directory that receives the per-trip JSON files written by the cleaning step.
OUTPUT_DIR = os.path.join(_DATA_ROOT, 'bronze', 'trips')
# Number of CSV rows pulled into the working sample.
SAMPLE_CHUNK_SIZE = 100_000

### Data Loading

In [None]:
# Open the CSV as a chunked reader and keep only its first chunk as the working sample,
# so the whole file is never loaded into memory here.
# NOTE(review): `reader` is not closed in the cells visible here — confirm it is
# either reused or closed further down.
reader = pd.read_csv(
    SOURCE_FILE,
    chunksize=SAMPLE_CHUNK_SIZE,
)
df_sample = next(reader)

## Step 1: Data Profiling
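The goal here is to understand the structure, data types, and quality of each column in the sample before any cleaning is done.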

### 1. Overview

In [None]:
# Preview the first five rows of the sample.
df_sample.head(n=5)

### 2. Features Analysis

In [None]:
# Print a summary of the sample: column names, non-null counts, dtypes and memory usage.
df_sample.info()

#### 2.1 `TRIP_ID`

(String) It contains a unique identifier for each trip.

In [None]:
# Count the distinct TRIP_ID values present in the sample and report the figure.
nunique_trip_id = df_sample.loc[:, 'TRIP_ID'].nunique()
print(f'Number of unique TRIP_IDs: {nunique_trip_id}')

In [None]:

# Show every row whose TRIP_ID occurs more than once; keep=False flags all
# occurrences rather than only the repeats after the first.
is_repeated_trip_id = df_sample['TRIP_ID'].duplicated(keep=False)
df_sample.loc[is_repeated_trip_id]

#### 2.2 `CALL_TYPE`
(char) It identifies the way used to demand this service. It may contain one of three possible values:
* ‘A’ if this trip was dispatched from the central;
* ‘B’ if this trip was demanded directly to a taxi driver on a specific stand;
* ‘C’ otherwise (i.e. a trip demanded on a random street).

In [None]:
# Percentage share of each CALL_TYPE value, ordered by the value itself.
call_type_share = df_sample['CALL_TYPE'].value_counts(normalize=True)
call_type_share.sort_index().mul(100)

In [None]:
# Absolute number of trips per CALL_TYPE value, ordered by the value itself.
call_type_counts = df_sample['CALL_TYPE'].value_counts()
call_type_counts.sort_index()

#### 2.3 `ORIGIN_CALL`

(integer) It contains a unique identifier for each phone number which was used to demand, at least, one service. It identifies the trip’s customer if CALL_TYPE=’A’. Otherwise, it assumes a NULL value.

In [None]:
# For each CALL_TYPE: how many ORIGIN_CALL values are non-null and how many are distinct.
# The author marked this check as ok.
df_sample.groupby('CALL_TYPE')['ORIGIN_CALL'].agg(['count', 'nunique'])

In [None]:
# Number of trips with CALL_TYPE 'A'. The author noted this is ok: it equals the
# number of non-null ORIGIN_CALL values.
is_call_type_a = df_sample['CALL_TYPE'] == 'A'
df_sample.loc[is_call_type_a, 'CALL_TYPE'].count()

#### 2.4 `ORIGIN_STAND`

(integer): It contains a unique identifier for the taxi stand. It identifies the starting point of the trip if CALL_TYPE=’B’. Otherwise, it assumes a NULL value.

In [None]:
# Count the rows that actually carry an ORIGIN_STAND value (nulls are not counted).
origin_stands_number = df_sample['ORIGIN_STAND'].notna().sum()
print(f'Number of non-null ORIGIN_STAND values: {origin_stands_number}')

In [None]:
# For each CALL_TYPE: how many ORIGIN_STAND values are non-null and how many are distinct.
# The author marked this check as ok.
df_sample.groupby('CALL_TYPE')['ORIGIN_STAND'].agg(['count', 'nunique'])

In [None]:
# Number of trips with CALL_TYPE 'B', selected in a single .loc step.
is_call_type_b = df_sample['CALL_TYPE'] == 'B'
b_call_type_number = df_sample.loc[is_call_type_b, 'CALL_TYPE'].count()
print(f'Number of CALL_TYPE B values: {b_call_type_number}')

In [None]:
# Compare the CALL_TYPE 'B' trip count with the non-null ORIGIN_STAND count
# (both computed in earlier cells) and report the gap.
diff_origin_stands = b_call_type_number - origin_stands_number
print(
    f'Difference between CALL_TYPE B count and non-null ORIGIN_STAND count: {diff_origin_stands}',
    'This difference should be zero if every CALL_TYPE B has a corresponding ORIGIN_STAND value.',
    sep='\n',
)


#### 2.5 `TAXI_ID`

(integer): It contains a unique identifier for the taxi driver that performed each trip.

In [None]:
# Number of distinct TAXI_ID values in the sample (nulls excluded).
df_sample['TAXI_ID'].nunique(dropna=True)

#### 2.6 `TIMESTAMP`

(integer) Unix Timestamp (in seconds). It identifies the trip’s start.

In [None]:
# Number of distinct TIMESTAMP values in the sample (nulls excluded).
df_sample['TIMESTAMP'].nunique(dropna=True)

#### 2.7 `DAY_TYPE`

(char) It identifies the daytype of the trip’s start. It assumes one of three possible values:
* ‘B’ if this trip started on a holiday or any other special day (i.e. extending holidays, floating holidays, etc.);
* ‘C’ if the trip started on a day before a type-B day;
* ‘A’ otherwise (i.e. a normal day, workday or weekend).

In [None]:
# Number of trips per DAY_TYPE value, ordered by the value itself.
day_type_counts = df_sample['DAY_TYPE'].value_counts()
day_type_counts.sort_index()

#### 2.8 `MISSING_DATA`

(Boolean) It is FALSE when the GPS data stream is complete and TRUE whenever one (or more) locations are missing.

In [None]:
# How many trips fall under each MISSING_DATA value.
missing_data_flags = df_sample['MISSING_DATA']
missing_data_flags.value_counts()

#### 2.9 `POLYLINE`

(String): It contains a list of GPS coordinates (i.e. WGS84 format) mapped as a string. The beginning and the end of the string are identified with brackets (i.e. [ and ], respectively). Each pair of coordinates is also identified by the same brackets as [LONGITUDE, LATITUDE]. This list contains one pair of coordinates for each 15 seconds of trip. The last list item corresponds to the trip’s destination while the first one represents its start.

In [None]:
# Inspect the raw POLYLINE value of the first row in the sample.
# .iloc[0] selects by position; the previous `[0]` was a label lookup and only
# worked as long as the sample's index happened to contain the label 0.
df_sample['POLYLINE'].iloc[0]

In [None]:
# Check which Python type a raw POLYLINE cell has before any parsing.
# .iloc[0] selects the first row by position, so the check does not depend on
# the sample's index containing the label 0.
type(df_sample['POLYLINE'].iloc[0])

### 3. Results

## Step 2: Data Cleaning

### 1. `POLYLINE`

In [None]:
# Parse each trip's POLYLINE text into a real list and write the trip as one
# JSON file per TRIP_ID under OUTPUT_DIR. Rows whose POLYLINE cannot be parsed
# are reported and skipped; no file is written for them.

# Make sure the target directory exists; open(..., 'w') does not create it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for index, row in df_sample.iterrows():
    trip_dict = row.to_dict()
    polyline_text = trip_dict['POLYLINE']

    try:
        trip_dict['POLYLINE'] = json.loads(polyline_text)
    except (json.JSONDecodeError, TypeError) as e:
        # JSONDecodeError: the text is not valid JSON.
        # TypeError: the cell is not text at all (e.g. NaN for a missing value).
        print(f"Error decoding JSON for row {index}: {e}")
        continue

    trip_id = trip_dict['TRIP_ID']
    filename = os.path.join(OUTPUT_DIR, f"{trip_id}.json")
    # NOTE(review): any NaN cells left in trip_dict (presumably ORIGIN_CALL /
    # ORIGIN_STAND when they do not apply) are written as bare NaN tokens, which
    # is json.dump's default but not strict JSON — confirm downstream readers accept it.
    with open(filename, 'w') as f:
        json.dump(trip_dict, f)