# Exploratory Data Analysis for Taxi GPS data

### Load the data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import random
import time



In [9]:
# Read into a pandas dataframe
df = pd.read_csv("./data/train.csv", nrows=10000)

In [4]:
converted_df = df.copy()

In [5]:
converted_df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], unit='s')

In [13]:
converted_df['STARTING TIME'] = converted_df['TIMESTAMP'].dt.time

In [15]:
converted_df['TRAVEL_TIME'] = converted_df['POLYLINE'].apply(
            lambda polyline: (len(polyline) - 1) * 15
        )

In [17]:
converted_df['END_TIME'] = converted_df['TIMESTAMP'] + pd.to_timedelta(converted_df['TRAVEL_TIME'], unit='s')

In [None]:
converted_df

In [10]:
import ast
df_converted = df.copy()
df_converted["POLYLINE"] = df_converted['POLYLINE'].apply(ast.literal_eval)
df_converted["START"] = [
            poly[0] if isinstance(poly, list) and len(poly) > 0 else None
            for poly in df["POLYLINE"]
        ]

In [15]:
exploded_df = df_converted.explode("POLYLINE").reset_index(drop=True)

In [16]:
exploded_df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,START
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[-8.618643, 41.141412]",
1,1372636858620000589,C,,,20000589,1372636858,A,False,"[-8.618499, 41.141376]",
2,1372636858620000589,C,,,20000589,1372636858,A,False,"[-8.620326, 41.14251]",
3,1372636858620000589,C,,,20000589,1372636858,A,False,"[-8.622153, 41.143815]",
4,1372636858620000589,C,,,20000589,1372636858,A,False,"[-8.623953, 41.144373]",


In [None]:
import matplotlib.pyplot as plt

# Drop rows with missing coordinates to avoid plotting issues
df_plot = df.dropna(subset=['First_Longitude', 'First_Latitude', 'Last_Longitude', 'Last_Latitude'])

# Plotting
plt.figure(figsize=(10, 6))

# Plot first coordinates
plt.scatter(df_plot['First_Longitude'], df_plot['First_Latitude'], color='green', label='Start')

# Plot last coordinates
plt.scatter(df_plot['Last_Longitude'], df_plot['Last_Latitude'], color='red', label='End')

# Connect start and end points
for _, row in df_plot.iterrows():
    plt.plot([row['First_Longitude'], row['Last_Longitude']], 
             [row['First_Latitude'], row['Last_Latitude']], 
             color='blue', linestyle='--', linewidth=1)

plt.title('First and Last Coordinates of Polylines')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()
plt.grid(True)
plt.show()