# Station Data Processing
The purpose of this notebook is to clean the Station data set ready for use in the Neo4j database and to ensure it aligns with the data in the Connections dataset.

In [13]:
import pandas as pd
from common_processing import clean_station

In [14]:
df_raw = pd.read_csv(r"../data/raw/stations_raw.csv")
df_raw.head(10)

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode
0,Abbey Road,539081,183352,51.531952,0.003723,3,E15 3NB
1,Abbey Wood,547297,179002,51.490784,0.120272,4,SE2 9RH
2,Acton Central,520613,180299,51.508757,-0.26343,2,W3 6BH
3,Acton Main Line,520296,181196,51.516886,-0.26769,3,W3 9EH
4,Acton Town,519457,179639,51.503071,-0.280303,3,W3 8HN
5,Addington Village,537082,163744,51.356239,-0.032665,3456,CR0 5AR
6,Addiscombe,534190,166290,51.379808,-0.073213,3456,CR0 7AA
7,Albany Park,547903,172902,51.435816,0.126445,5,DA5 3HP
8,Aldgate,533629,181246,51.514342,-0.075627,1,EC3N 1AH
9,Aldgate East,533809,181333,51.515082,-0.073001,1,E1 7PT


In [15]:
# Drop duplicate stations duplicate station names that contain the line they are on
stations_to_drop = {
    "Edgware Road (Bakerloo)",
    "Hammersmith (Met)"
}

mask = ~(df_raw["Station"].isin(stations_to_drop))
df_stations = df_raw[mask]

# Common station processing
df_stations.loc[:, "Station"] = df_stations["Station"].apply(clean_station)

# This needs to be renamed
df_stations.loc[df_stations["Station"] == "St Jamess Park", "Station"] = "St James Park"

# Saving the above for review
df_stations.to_csv(r"../data/raw/station_raw_v2.csv")

At this point we now need to remove any stations that aren't present in the Connections dataset

In [16]:
df_connections = pd.read_csv(r"../data/processed/connections_clean.csv", index_col=0)
df_connections.head()

Unnamed: 0,station_from,station_to,line,direction,distance_kms,un-impeded_running_time,running_time_av
0,Harrow & Wealdstone,Kenton,Bakerloo,Southbound,1.74,2.23,2.5
1,Kenton,South Kenton,Bakerloo,Southbound,1.4,1.88,2.0
2,South Kenton,North Wembley,Bakerloo,Southbound,0.9,1.5,1.5
3,North Wembley,Wembley Central,Bakerloo,Southbound,1.27,1.92,2.06
4,Wembley Central,Stonebridge Park,Bakerloo,Southbound,1.71,2.23,3.13


In [18]:
# Checking if that all stations are in one column of the Connections dataset (see the associated notebook for its processing)
df_station_from_a = set(df_connections["station_from"])
df_station_from_b = set(df_connections["station_to"])
print(df_station_from_a.difference(df_station_from_b))
print(df_station_from_b.difference(df_station_from_a))

set()
set()


In [19]:
# Removing stations from the Stations dataset that aren't in the connections dataset
connection_stations = set(df_connections["Station from (A)"])
mask = df_stations['Station'].isin(connection_stations)
df_valid_stations = df_stations[mask].reset_index(drop=True)

KeyError: 'Station from (A)'

In [20]:
# Checking which stations are in Connections but not the cleaned
connection_stations.difference(df_valid_stations["Station"])

set()

In [21]:
df_valid_stations.head()

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode
0,Acton Town,519457,179639,51.503071,-0.280303,3,W3 8HN
1,Aldgate,533629,181246,51.514342,-0.075627,1,EC3N 1AH
2,Aldgate East,533809,181333,51.515082,-0.073001,1,E1 7PT
3,Alperton,518025,183849,51.541209,-0.299516,4,HA0 4LL
4,Amersham,496454,198181,51.674128,-0.606514,9,HP6 5AZ


In [22]:
len(df_valid_stations)

265

In [28]:
# We now need to add the line that each station is on. There should be duplicate station names where that station is on multiple lines.
# We will need to join to the connections dataset to get this information.
df_stations_with_lines = pd.merge(
    left=df_valid_stations,
    right=df_connections,
    left_on="Station",
    right_on="station_from",
    how="inner"
)
df_stations_with_lines = df_stations_with_lines[["Station", "line", "OS X", "OS Y", "Latitude", "Longitude", "Zone", "Postcode"]].drop_duplicates().reset_index(drop=True)
df_stations_with_lines.head()

Unnamed: 0,Station,line,OS X,OS Y,Latitude,Longitude,Zone,Postcode
0,Acton Town,Piccadilly,519457,179639,51.503071,-0.280303,3,W3 8HN
1,Acton Town,District,519457,179639,51.503071,-0.280303,3,W3 8HN
2,Aldgate,Metropolitan,533629,181246,51.514342,-0.075627,1,EC3N 1AH
3,Aldgate,Circle,533629,181246,51.514342,-0.075627,1,EC3N 1AH
4,Aldgate East,Hammersmith & City,533809,181333,51.515082,-0.073001,1,E1 7PT


In [29]:
len(df_stations_with_lines)

369

In [30]:
# Renaming columns and saving
df_stations_with_lines.columns = ["name", "line", "os_x", "os_y", "latitude", "longitude", "zone", "postcode"]
df_stations_with_lines.to_csv(r"../data/processed/stations_clean.csv")