# Connections Data Processing
The purpose of this notebook is to process and clean connections data to prepare it for the Neo4j database. 

It will also be used as a reference set for the other datasets in that:
* We only want Station data that has corresponding connection data.
* We only want Interchange data that has corresponding Station data.

In [2]:
import pandas as pd
import numpy as np
from common_processing import clean_station

In [3]:
# Load the raw connections export (path is relative to this notebook) and preview it
raw_connections_path = r"../data/raw/connections_raw.csv"
df_connections_raw = pd.read_csv(raw_connections_path)
df_connections_raw.head(10)

Unnamed: 0,Line,Direction,Station from (A),Station to (B),Distance (Kms),Un-impeded Running Time (Mins),AM peak (0700-1000) Running Time (Mins),Inter peak (1000 - 1600) Running time (mins)
0,Bakerloo,Southbound,HARROW & WEALDSTONE,KENTON,1.74,2.23,2.5,2.5
1,Bakerloo,Southbound,KENTON,SOUTH KENTON,1.4,1.88,2.0,2.0
2,Bakerloo,Southbound,SOUTH KENTON,NORTH WEMBLEY,0.9,1.5,1.5,1.5
3,Bakerloo,Southbound,NORTH WEMBLEY,WEMBLEY CENTRAL,1.27,1.92,2.06,2.06
4,Bakerloo,Southbound,WEMBLEY CENTRAL,STONEBRIDGE PARK,1.71,2.23,3.13,3.13
5,Bakerloo,Southbound,STONEBRIDGE PARK,HARLESDEN,1.53,2.13,2.4,2.4
6,Bakerloo,Southbound,HARLESDEN,WILLESDEN JUNCTION,1.05,1.65,2.23,2.23
7,Bakerloo,Southbound,WILLESDEN JUNCTION,KENSAL GREEN,1.5,2.47,2.5,2.5
8,Bakerloo,Southbound,KENSAL GREEN,QUEENS PARK,1.32,2.65,4.72,4.72
9,Bakerloo,Southbound,QUEENS PARK,KILBURN PARK,0.79,1.42,1.5,1.5


In [4]:
# Average of the AM-peak and inter-peak running times, rounded to 2 decimal places
am_peak_column = "AM peak (0700-1000) Running Time (Mins)"
inter_peak_column = "Inter peak (1000 - 1600) Running time (mins)"

peak_times_sum = df_connections_raw[am_peak_column] + df_connections_raw[inter_peak_column]
running_time_av = (peak_times_sum / 2).round(2)

# Stored on the raw frame so the later column selection can pick it up
df_connections_raw["running_time_av"] = running_time_av

df_connections_raw.head()

Unnamed: 0,Line,Direction,Station from (A),Station to (B),Distance (Kms),Un-impeded Running Time (Mins),AM peak (0700-1000) Running Time (Mins),Inter peak (1000 - 1600) Running time (mins),running_time_av
0,Bakerloo,Southbound,HARROW & WEALDSTONE,KENTON,1.74,2.23,2.5,2.5,2.5
1,Bakerloo,Southbound,KENTON,SOUTH KENTON,1.4,1.88,2.0,2.0,2.0
2,Bakerloo,Southbound,SOUTH KENTON,NORTH WEMBLEY,0.9,1.5,1.5,1.5,1.5
3,Bakerloo,Southbound,NORTH WEMBLEY,WEMBLEY CENTRAL,1.27,1.92,2.06,2.06,2.06
4,Bakerloo,Southbound,WEMBLEY CENTRAL,STONEBRIDGE PARK,1.71,2.23,3.13,3.13,3.13


In [5]:
# Columns carried forward into the cleaned dataset
connection_columns = [
    "Line",
    "Direction",
    "Station from (A)",
    "Station to (B)",
    "Distance (Kms)",
    "Un-impeded Running Time (Mins)",
    "running_time_av",
]

# Vectorised string methods are used instead of row-wise apply(lambda ...);
# unlike the lambda, they pass missing Line values through rather than raising.
df_connections_clean = (
    df_connections_raw[connection_columns]
    .assign(
        Line=lambda frame: (
            frame["Line"]
            # Removing surrounding whitespace from Line column (see csv)
            .str.strip()
            # Correcting Hammersmith & City line
            .replace("H & C", "Hammersmith & City")
        )
    )
    # Removing "East London" i.e. Overground entries as we won't be including these
    .loc[lambda frame: frame["Line"] != "East London"]
)

df_connections_clean.head()

Unnamed: 0,Line,Direction,Station from (A),Station to (B),Distance (Kms),Un-impeded Running Time (Mins),running_time_av
0,Bakerloo,Southbound,HARROW & WEALDSTONE,KENTON,1.74,2.23,2.5
1,Bakerloo,Southbound,KENTON,SOUTH KENTON,1.4,1.88,2.0
2,Bakerloo,Southbound,SOUTH KENTON,NORTH WEMBLEY,0.9,1.5,1.5
3,Bakerloo,Southbound,NORTH WEMBLEY,WEMBLEY CENTRAL,1.27,1.92,2.06
4,Bakerloo,Southbound,WEMBLEY CENTRAL,STONEBRIDGE PARK,1.71,2.23,3.13


In [6]:
# Common Station processing: run the shared clean_station helper over both endpoint columns
for station_column in ("Station from (A)", "Station to (B)"):
    df_connections_clean[station_column] = df_connections_clean[station_column].apply(clean_station)

# Converting Station names to remove text in brackets and to swap a few known names
def clean_connection_station(x):
    """Normalise a station name taken from the connections data.

    Everything from the first " (" onwards is dropped, then a small set of
    names is swapped for a replacement (presumably so they match the names
    used in the other datasets -- confirm).

    Parameters
    ----------
    x : str
        Station name to normalise.

    Returns
    -------
    str
        The normalised station name.
    """
    renamed_stations = {
        "Walthamstow": "Walthamstow Central",
        "Highbury": "Highbury & Islington",
        "Heathrow 123": "Heathrow Terminals 1 2 3",
        "Heathrow Terminal Four": "Heathrow Terminal 4",
    }
    # partition() hands back the whole string when " (" is absent
    base_name = x.partition(" (")[0]
    return renamed_stations.get(base_name, base_name)

# Apply the connection-specific station clean-up to both endpoint columns
for station_column in ("Station from (A)", "Station to (B)"):
    df_connections_clean[station_column] = df_connections_clean[station_column].apply(clean_connection_station)

In [7]:
df_connections_clean.head()

Unnamed: 0,Line,Direction,Station from (A),Station to (B),Distance (Kms),Un-impeded Running Time (Mins),running_time_av
0,Bakerloo,Southbound,Harrow & Wealdstone,Kenton,1.74,2.23,2.5
1,Bakerloo,Southbound,Kenton,South Kenton,1.4,1.88,2.0
2,Bakerloo,Southbound,South Kenton,North Wembley,0.9,1.5,1.5
3,Bakerloo,Southbound,North Wembley,Wembley Central,1.27,1.92,2.06
4,Bakerloo,Southbound,Wembley Central,Stonebridge Park,1.71,2.23,3.13


In [8]:
# Replace the raw csv headers with short lower-case names; running_time_av is left as is
column_renames = {
    "Line": "line",
    "Direction": "direction",
    "Station from (A)": "station_from",
    "Station to (B)": "station_to",
    "Distance (Kms)": "distance_kms",
    "Un-impeded Running Time (Mins)": "un-impeded_running_time",
}
df_connections_clean = df_connections_clean.rename(columns=column_renames)
df_connections_clean.head()

Unnamed: 0,line,direction,station_from,station_to,distance_kms,un-impeded_running_time,running_time_av
0,Bakerloo,Southbound,Harrow & Wealdstone,Kenton,1.74,2.23,2.5
1,Bakerloo,Southbound,Kenton,South Kenton,1.4,1.88,2.0
2,Bakerloo,Southbound,South Kenton,North Wembley,0.9,1.5,1.5
3,Bakerloo,Southbound,North Wembley,Wembley Central,1.27,1.92,2.06
4,Bakerloo,Southbound,Wembley Central,Stonebridge Park,1.71,2.23,3.13


In [9]:
# Put the station pair first, followed by the line details and the measurements
ordered_columns = [
    "station_from",
    "station_to",
    "line",
    "direction",
    "distance_kms",
    "un-impeded_running_time",
    "running_time_av",
]
df_connections_clean = df_connections_clean[ordered_columns]

# Row counts before and after de-duplication; equal counts mean no duplicate rows
total_rows = len(df_connections_clean)
unique_rows = len(df_connections_clean.drop_duplicates())
print(total_rows)
print(unique_rows)

727
727


In [10]:
# Saving the cleaned dataset
# NOTE(review): to_csv also writes the index as an unnamed first column by default;
# confirm downstream loaders expect that extra column.
clean_connections_path = r"../data/processed/connections_clean.csv"
df_connections_clean.to_csv(clean_connections_path)