# Import libraries

In [None]:
import ast
from datetime import datetime, timezone

import folium
import numpy as np
import pandas as pd
import seaborn as sns
from folium import plugins
from tqdm import tqdm

# Enable tqdm's pandas integration, which provides the `progress_apply`
# method used later in this notebook.
tqdm.pandas()
# %load_ext nb_black

# Load data

In [None]:
# Read the zipped, comma-separated training file into one DataFrame.
# low_memory=False makes pandas parse the file in a single pass instead of
# in chunks, so each column gets one consistent dtype.
taxi = pd.read_csv(
    filepath_or_buffer="../input/taxi-trajectory-data-extended/train_extended.csv.zip",
    compression="zip",
    sep=",",
    low_memory=False,
)

In [None]:
# Preview the first five rows of the loaded data.
taxi.head(n=5)

In [None]:
# Summarise the loaded frame: columns, dtypes, non-null counts and memory usage.
taxi.info()

# Descriptive analysis 

## Call type

CALL_TYPE: (char) It identifies the way used to demand this service.

It may contain one of three possible values:

* ‘A’ if this trip was dispatched from the central;
* ‘B’ if this trip was demanded directly to a taxi driver on a specific stand;
* ‘C’ otherwise (i.e. a trip demanded on a random street).

In [None]:
# Summary statistics (count, unique, top, freq) for the call-type codes.
taxi["CALL_TYPE"].describe()

In [None]:
# Count trips per CALL_TYPE code and give each code a readable label.
# The labels are attached by code rather than by position, so a missing or
# unexpected code cannot raise a length mismatch or shift a label onto the
# wrong count; codes without a mapping keep their original value.
call_type_labels = {"A": "CENTRAL", "B": "STAND", "C": "OTHER"}
call_type_count = (
    taxi.CALL_TYPE.value_counts(sort=False)
    .sort_index()
    .rename(index=call_type_labels)
    .rename_axis(None)  # keep the index unnamed, as a plain label list would be
)
print(call_type_count)

In [None]:
# Set a wide default figure size for the seaborn plots, then chart the number
# of trips per call type.
sns.set(rc={"figure.figsize": (16, 6)})
ax = sns.barplot(
    y=call_type_count.to_numpy(),
    x=call_type_count.index,
)

## Origin call

ORIGIN_CALL: (integer) It contains a unique identifier for each phone number which was used to demand, at least, one service.

It identifies the trip’s customer if CALL_TYPE=’A’. Otherwise, it assumes a NULL value;

In [None]:
# Turn the customer identifiers into strings without a trailing ".0":
# missing values are temporarily filled with -1 so the column can be cast to
# int64, then the "-1" placeholder is turned back into NaN.
taxi["ORIGIN_CALL"] = (
    taxi["ORIGIN_CALL"]
    .fillna(-1)
    .astype("int64")
    .astype(str)
    .replace("-1", np.nan)
)

# Treat the identifiers as categories and summarise them.
origin_call_cat = taxi["ORIGIN_CALL"].astype("category")
origin_call_cat.describe()

In [None]:
# Number of trips per ORIGIN_CALL identifier, then how many identifiers fall
# into each trips-per-identifier bucket.
origin_call_count = origin_call_cat.value_counts()
origin_call_bins = [0, 1, 2, 3, 4, 6, 10, 100, 10000]
pd.cut(origin_call_count, bins=origin_call_bins).value_counts(sort=False)

## Origin stand

ORIGIN_STAND: (integer): It contains a unique identifier for the taxi stand.

It identifies the starting point of the trip if CALL_TYPE=’B’. Otherwise, it assumes a NULL value;

In [None]:
# Turn the stand identifiers into strings without a trailing ".0":
# missing values are temporarily filled with -1 so the column can be cast to
# int64, then the "-1" placeholder is turned back into NaN.
taxi["ORIGIN_STAND"] = (
    taxi["ORIGIN_STAND"]
    .fillna(-1)
    .astype("int64")
    .astype(str)
    .replace("-1", np.nan)
)

# Treat the identifiers as categories and summarise them.
origin_stand_cat = taxi["ORIGIN_STAND"].astype("category")
origin_stand_cat.describe()

In [None]:
# Trips per taxi stand, most used first (value_counts sorts descending by
# default); the bars are drawn in that same order.
origin_stand_count = origin_stand_cat.value_counts()
ax = sns.barplot(
    order=origin_stand_count.index,
    x=origin_stand_count.index,
    y=origin_stand_count.to_numpy(),
)

## Taxi ID

TAXI_ID: (integer): It contains a unique identifier for the taxi driver that performed each trip;

In [None]:
# Treat the taxi identifiers as categories and summarise them.
taxi_id_cat = taxi["TAXI_ID"].astype("category")
taxi_id_cat.describe()

In [None]:
# Distribution of the number of trips per taxi; cut=0 stops the violin at the
# observed minimum and maximum.
taxi_id_count = taxi_id_cat.value_counts()
ax = sns.violinplot(cut=0, y=taxi_id_count.to_numpy())

## Timestamp

TIMESTAMP: (integer) Unix Timestamp (in seconds). It identifies the trip’s start; 

In [None]:
# Number of non-null trip start timestamps.
taxi["TIMESTAMP"].count()

In [None]:
# Earliest trip start, shown as a UTC date and time.
first_trip_start = datetime.fromtimestamp(taxi["TIMESTAMP"].min(), tz=timezone.utc)
first_trip_start.strftime("%Y-%m-%d %H:%M:%S")

In [None]:
# Latest trip start, shown as a UTC date and time.
last_trip_start = datetime.fromtimestamp(taxi["TIMESTAMP"].max(), tz=timezone.utc)
last_trip_start.strftime("%Y-%m-%d %H:%M:%S")

## Day type

DAY_TYPE: (char) It identifies the day type of the trip’s start.

It assumes one of three possible values:

* ‘B’ if this trip started on a holiday or any other special day (i.e. extending holidays, floating holidays, etc.);
* ‘C’ if the trip started on a day before a type-B day;
* ‘A’ otherwise (i.e. a normal day, workday or weekend).

In [None]:
# Summary statistics for the day-type codes.
taxi["DAY_TYPE"].describe()

## Missing data

MISSING_DATA: (Boolean) It is FALSE when the GPS data stream is complete and TRUE whenever one (or more) locations are missing.

In [None]:
# Summary statistics for the missing-data flag.
taxi["MISSING_DATA"].describe()

## Polyline

POLYLINE: (String): It contains a list of GPS coordinates (i.e. WGS84 format) mapped as a string.

The beginning and the end of the string are identified with brackets (i.e. [ and ], respectively).

Each pair of coordinates is also identified by the same brackets as [LONGITUDE, LATITUDE].

This list contains one pair of coordinates for each 15 seconds of trip. 

The last list item corresponds to the trip’s destination while the first one represents its start;

In [None]:
# Summary statistics for the raw polyline strings.
taxi["POLYLINE"].describe()

## Trip distance

TRIP_DISTANCE: (float): It contains the total geodesic distance calculated from all consecutive pairs of coordinates.

In [None]:
# Keep trips strictly below the 99th percentile of distance so extreme
# outliers do not squash the plot, and name the series for the axis label.
trip_distance_cleaned = (
    taxi["TRIP_DISTANCE"]
    .loc[lambda distance: distance < distance.quantile(0.99)]
    .rename("Trip distance")
)
ax = sns.violinplot(cut=0, y=trip_distance_cleaned)

## Trip time

TRIP_TIME: (float): It contains the total time calculated from the number of pairs of coordinates, knowing that there are 15 seconds of difference between each pair of coordinates.

In [None]:
# Keep trips strictly below the 99th percentile of duration so extreme
# outliers do not squash the plot, and name the series for the axis label.
trip_time_cleaned = (
    taxi["TRIP_TIME"]
    .loc[lambda duration: duration < duration.quantile(0.99)]
    .rename("Trip time")
)
ax = sns.violinplot(cut=0, y=trip_time_cleaned)

## Average speed

AVERAGE_SPEED: (float): It contains the average speed calculated from trip distance and trip time.

In [None]:
# Keep trips strictly below the 99th percentile of average speed so extreme
# outliers do not squash the plot, and name the series for the axis label.
average_speed_cleaned = (
    taxi["AVERAGE_SPEED"]
    .loc[lambda speed: speed < speed.quantile(0.99)]
    .rename("Average speed")
)
ax = sns.violinplot(cut=0, y=average_speed_cleaned)

## Top speed

TOP_SPEED: (float): It contains the top speed calculated from distances and time of all pairs of coordinates.

In [None]:
# Keep trips strictly below the 99th percentile of top speed so extreme
# outliers do not squash the plot, and name the series for the axis label.
top_speed_cleaned = (
    taxi["TOP_SPEED"]
    .loc[lambda speed: speed < speed.quantile(0.99)]
    .rename("Top speed")
)
ax = sns.violinplot(cut=0, y=top_speed_cleaned)

Impossible speeds are very frequent.

Most probably, not all consecutive pairs of coordinates are 15 seconds apart.

The causes could be:

* Holes exist, so the separation between two consecutive pairs of coordinates could be a multiple of 15 seconds, and the computed speed would be a multiple of the real speed for that section.
* The last pair of coordinates is saved when the trip finishes, so its time difference could be anywhere between 0 and 15 seconds, and the computed speed could be anywhere between the real speed and infinity.

There's a main peak at 64 km/h and further peaks at 132 km/h (2), 205 km/h (3.2) and 245 km/h (3.8), where the number in parentheses is the approximate ratio to the main peak. If they are caused by holes, the other peaks are expected to be repetitions of the main peak at multiples of its speed.

The average speed is not especially affected, as it is calculated from the total distance and total time, and not as the average of all intermediate speeds.

    TODO: Do some research about the cause of incongruent speeds.

In [None]:
# Same plot restricted to top speeds strictly below 120, instead of using the
# 99th-percentile cut-off.
top_speed_cleaned2 = (
    taxi["TOP_SPEED"]
    .loc[lambda speed: speed < 120]
    .rename("Top speed")
)
ax = sns.violinplot(cut=0, y=top_speed_cleaned2)

## Relation between trip distance and trip time

In [None]:
# Joint density of trip distance and trip time, restricted to trips with
# TRIP_DISTANCE < 10 and TRIP_TIME < 20 so the bulk of the data is visible.
# Rows and columns are selected in a single .loc call, which avoids filtering
# a slice of a slice.
trip_distance_time = taxi.loc[
    (taxi["TRIP_DISTANCE"] < 10) & (taxi["TRIP_TIME"] < 20),
    ["TRIP_DISTANCE", "TRIP_TIME"],
]

# jointplot returns a JointGrid; the name `ax` is kept for consistency with
# the other plotting cells.
ax = sns.jointplot(
    x="TRIP_DISTANCE", y="TRIP_TIME", data=trip_distance_time, kind="kde"
)
# Set the axis labels on the grid itself. Renaming the column Series in place
# does not rename the DataFrame columns, so it cannot be relied on to change
# the labels seaborn draws.
ax.set_axis_labels("Trip distance [km]", "Trip time [min]");

## Trip start
TRIP_START is stored as a string, and the folium heatmap expects [latitude, longitude]. Each value therefore has to be parsed and then reversed.

In [None]:
# Parse each TRIP_START string into a Python list and reverse it so the
# coordinates are in the order the heatmap expects.
# ast.literal_eval only accepts Python literals, so text read from the CSV
# can never be executed as code (unlike eval).
# NOTE(review): presumably each value is a "[longitude, latitude]" pair, as
# in POLYLINE - confirm against the data.
taxi_start = taxi.TRIP_START.progress_apply(
    lambda point: ast.literal_eval(point)[::-1]
)

In [None]:
# Base map centred on fixed [latitude, longitude] coordinates, with a heatmap
# layer of the trip start points on top.
map_center = [41.1579605, -8.629241]
trip_start_map = folium.Map(location=map_center, zoom_start=12)

start_heatmap = plugins.HeatMap(taxi_start, radius=10)
start_heatmap.add_to(trip_start_map)

# Last expression of the cell: renders the interactive map.
trip_start_map