# Connections Data Processing
The purpose of this notebook is to process and clean connections data to prepare it for the Neo4j database. 

It will also be used as a reference set for the other datasets in that:
* We only want Station data that has corresponding connection data.
* We only want Interchange data that has corresponding Station data.

In [1]:
import pandas as pd
import numpy as np
from common_processing import clean_station

In [2]:
# Load the raw connections extract and preview the first ten rows
raw_connections_path = r"../data/raw/connections_raw.csv"
df_raw = pd.read_csv(raw_connections_path)
df_raw.head(10)

Unnamed: 0,Line,Direction,Station from (A),Station to (B),Distance (Kms),Un-impeded Running Time (Mins),AM peak (0700-1000) Running Time (Mins),Inter peak (1000 - 1600) Running time (mins)
0,Bakerloo,Southbound,HARROW & WEALDSTONE,KENTON,1.74,2.23,2.5,2.5
1,Bakerloo,Southbound,KENTON,SOUTH KENTON,1.4,1.88,2.0,2.0
2,Bakerloo,Southbound,SOUTH KENTON,NORTH WEMBLEY,0.9,1.5,1.5,1.5
3,Bakerloo,Southbound,NORTH WEMBLEY,WEMBLEY CENTRAL,1.27,1.92,2.06,2.06
4,Bakerloo,Southbound,WEMBLEY CENTRAL,STONEBRIDGE PARK,1.71,2.23,3.13,3.13
5,Bakerloo,Southbound,STONEBRIDGE PARK,HARLESDEN,1.53,2.13,2.4,2.4
6,Bakerloo,Southbound,HARLESDEN,WILLESDEN JUNCTION,1.05,1.65,2.23,2.23
7,Bakerloo,Southbound,WILLESDEN JUNCTION,KENSAL GREEN,1.5,2.47,2.5,2.5
8,Bakerloo,Southbound,KENSAL GREEN,QUEENS PARK,1.32,2.65,4.72,4.72
9,Bakerloo,Southbound,QUEENS PARK,KILBURN PARK,0.79,1.42,1.5,1.5


In [3]:
# Mean of the AM-peak and inter-peak running times (mins), rounded to 2 d.p.
am_peak_col = "AM peak (0700-1000) Running Time (Mins)"
inter_peak_col = "Inter peak (1000 - 1600) Running time (mins)"

running_time_av = df_raw[am_peak_col].add(df_raw[inter_peak_col]).div(2).round(2)

# Stored on df_raw because the next cell selects this column from it
df_raw["running_time_av"] = running_time_av

df_raw.head()

Unnamed: 0,Line,Direction,Station from (A),Station to (B),Distance (Kms),Un-impeded Running Time (Mins),AM peak (0700-1000) Running Time (Mins),Inter peak (1000 - 1600) Running time (mins),running_time_av
0,Bakerloo,Southbound,HARROW & WEALDSTONE,KENTON,1.74,2.23,2.5,2.5,2.5
1,Bakerloo,Southbound,KENTON,SOUTH KENTON,1.4,1.88,2.0,2.0,2.0
2,Bakerloo,Southbound,SOUTH KENTON,NORTH WEMBLEY,0.9,1.5,1.5,1.5,1.5
3,Bakerloo,Southbound,NORTH WEMBLEY,WEMBLEY CENTRAL,1.27,1.92,2.06,2.06,2.06
4,Bakerloo,Southbound,WEMBLEY CENTRAL,STONEBRIDGE PARK,1.71,2.23,3.13,3.13,3.13


In [4]:
# Keep only the columns needed downstream, then normalise the Line names:
#   * strip stray whitespace around the line name (see csv)
#   * expand the "H & C" abbreviation to "Hammersmith & City"
#   * drop "East London" (i.e. Overground) entries as we won't be including these
#
# The vectorised .str / .replace calls pass missing values through rather than
# raising, and the trailing .copy() makes df_clean an independent frame so the
# column assignments in later cells don't trigger SettingWithCopyWarning.
df_clean = (
    df_raw[
        [
            "Line",
            "Direction",
            "Station from (A)",
            "Station to (B)",
            "Distance (Kms)",
            "Un-impeded Running Time (Mins)",
            "running_time_av",
        ]
    ]
    .assign(
        Line=lambda frame: frame["Line"]
        .str.strip()
        .replace({"H & C": "Hammersmith & City"})
    )
    .loc[lambda frame: frame["Line"] != "East London"]
    .copy()
)

df_clean.head()

Unnamed: 0,Line,Direction,Station from (A),Station to (B),Distance (Kms),Un-impeded Running Time (Mins),running_time_av
0,Bakerloo,Southbound,HARROW & WEALDSTONE,KENTON,1.74,2.23,2.5
1,Bakerloo,Southbound,KENTON,SOUTH KENTON,1.4,1.88,2.0
2,Bakerloo,Southbound,SOUTH KENTON,NORTH WEMBLEY,0.9,1.5,1.5
3,Bakerloo,Southbound,NORTH WEMBLEY,WEMBLEY CENTRAL,1.27,1.92,2.06
4,Bakerloo,Southbound,WEMBLEY CENTRAL,STONEBRIDGE PARK,1.71,2.23,3.13


In [5]:
# Apply the shared station-name cleaning to both endpoint columns
for station_col in ("Station from (A)", "Station to (B)"):
    df_clean[station_col] = df_clean[station_col].apply(clean_station)

# Converting Station names to remove text in brackets and expand short names
def clean_connection_station(x):
    """Return a station name without its bracketed suffix, with short names expanded.

    Everything from the first " (" onwards is dropped; the remaining name is
    then looked up in a small table of known short forms and replaced by the
    full station name when it matches exactly.

    Parameters
    ----------
    x : str
        Station name as it appears in the connections data.

    Returns
    -------
    str
        The cleaned station name.
    """
    full_names = {
        "Walthamstow": "Walthamstow Central",
        "Highbury": "Highbury & Islington",
        "Heathrow 123": "Heathrow Terminals 1 2 3",
        "Heathrow Terminal Four": "Heathrow Terminal 4",
    }
    # partition() yields the whole string when " (" is absent
    base_name = x.partition(" (")[0]
    return full_names.get(base_name, base_name)

# Run the connection-specific clean-up over both endpoint columns
for endpoint_col in ("Station from (A)", "Station to (B)"):
    df_clean[endpoint_col] = df_clean[endpoint_col].apply(clean_connection_station)

In [6]:
# Preview the first rows of df_clean to check the station name columns
df_clean.head()

Unnamed: 0,Line,Direction,Station from (A),Station to (B),Distance (Kms),Un-impeded Running Time (Mins),running_time_av
0,Bakerloo,Southbound,Harrow & Wealdstone,Kenton,1.74,2.23,2.5
1,Bakerloo,Southbound,Kenton,South Kenton,1.4,1.88,2.0
2,Bakerloo,Southbound,South Kenton,North Wembley,0.9,1.5,1.5
3,Bakerloo,Southbound,North Wembley,Wembley Central,1.27,1.92,2.06
4,Bakerloo,Southbound,Wembley Central,Stonebridge Park,1.71,2.23,3.13


Adding the line name to the station names (this has to be done after the stations_processing notebook)

In [8]:
from common_processing import add_line_to_station

# Build the line-qualified station names, one per row, for each endpoint
qualified_from = df_clean.apply(
    lambda record: add_line_to_station(record["Station from (A)"], record["Line"]),
    axis=1,
)
qualified_to = df_clean.apply(
    lambda record: add_line_to_station(record["Station to (B)"], record["Line"]),
    axis=1,
)

# Original column -> output column; names not listed here are kept as they are
output_names = {
    "Line": "line",
    "Direction": "direction",
    "Distance (Kms)": "distance_km",
    "Un-impeded Running Time (Mins)": "running_time_unimpeded",
}
output_order = [
    "station_from",
    "station_to",
    "Line",
    "Direction",
    "Distance (Kms)",
    "Un-impeded Running Time (Mins)",
    "running_time_av",
]

df_station_formatted = df_clean.assign(
    station_from=qualified_from,
    station_to=qualified_to,
)
df_station_formatted = df_station_formatted[output_order].rename(columns=output_names)
df_station_formatted.head()

Unnamed: 0,station_from,station_to,line,direction,distance_km,running_time_unimpeded,running_time_av
0,Harrow & Wealdstone (Bakerloo),Kenton (Bakerloo),Bakerloo,Southbound,1.74,2.23,2.5
1,Kenton (Bakerloo),South Kenton (Bakerloo),Bakerloo,Southbound,1.4,1.88,2.0
2,South Kenton (Bakerloo),North Wembley (Bakerloo),Bakerloo,Southbound,0.9,1.5,1.5
3,North Wembley (Bakerloo),Wembley Central (Bakerloo),Bakerloo,Southbound,1.27,1.92,2.06
4,Wembley Central (Bakerloo),Stonebridge Park (Bakerloo),Bakerloo,Southbound,1.71,2.23,3.13


In [9]:
# Saving the cleaned connections (station names already carry the line name)
# to the processed data folder.
# NOTE: to_csv writes the DataFrame index as an unnamed first column by default.
df_station_formatted.to_csv(r"../data/processed/connections_clean.csv")