# Station Data Processing
The purpose of this notebook is to clean the Stations dataset so that it is ready for use in the Neo4j database, and to ensure it aligns with the data in the Connections dataset.

In [18]:
import numpy as np
import pandas as pd

from common_processing import add_line_to_station, clean_station

In [19]:
# Load the raw station export and preview the first ten rows
raw_stations_csv = r"../data/raw/stations_raw.csv"
df_raw = pd.read_csv(raw_stations_csv)
df_raw.head(n=10)

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode
0,Abbey Road,539081,183352,51.531952,0.003723,3,E15 3NB
1,Abbey Wood,547297,179002,51.490784,0.120272,4,SE2 9RH
2,Acton Central,520613,180299,51.508757,-0.26343,2,W3 6BH
3,Acton Main Line,520296,181196,51.516886,-0.26769,3,W3 9EH
4,Acton Town,519457,179639,51.503071,-0.280303,3,W3 8HN
5,Addington Village,537082,163744,51.356239,-0.032665,3456,CR0 5AR
6,Addiscombe,534190,166290,51.379808,-0.073213,3456,CR0 7AA
7,Albany Park,547903,172902,51.435816,0.126445,5,DA5 3HP
8,Aldgate,533629,181246,51.514342,-0.075627,1,EC3N 1AH
9,Aldgate East,533809,181333,51.515082,-0.073001,1,E1 7PT


In [20]:
# Drop duplicate stations (duplicates are on name not line)
stations_to_drop = {
    "Edgware Road (Bakerloo)",
    "Hammersmith (Met)"
}

# Take an explicit copy of the filtered rows: the assignments below then act on
# an independent frame, leaving df_raw untouched and avoiding pandas'
# SettingWithCopyWarning.
mask = ~(df_raw["Station"].isin(stations_to_drop))
df_stations = df_raw.loc[mask].copy()

# Common station processing
df_stations["Station"] = df_stations["Station"].apply(clean_station)

# Additional processing: one-off rename of a single row, addressed by its
# index label in the raw file.
st_james_park_label = 519
if st_james_park_label not in df_stations.index:
    # Assigning through .loc with a missing label would silently append a new,
    # mostly-NaN row instead of renaming an existing station.
    raise KeyError(
        f"Row label {st_james_park_label} not found in the stations data; "
        "the raw file may have changed"
    )
df_stations.loc[st_james_park_label, "Station"] = "St James Park"

# Saving the above for review
df_stations.to_csv(r"../data/raw/station_raw_v2.csv")

At this point we need to remove any stations that aren't present in the Connections dataset.

In [21]:
# Check that both endpoint columns of the Connections dataset contain the same
# set of stations; each line printed below is the set of stations that appear
# in one column but not in the other.
df_connections = pd.read_csv(r"../data/raw/connections_raw_v2.csv")
stations_from_a = set(df_connections["Station from (A)"])
stations_to_b = set(df_connections["Station to (B)"])
print(stations_from_a - stations_to_b)
print(stations_to_b - stations_from_a)

set()
set()


In [22]:
# Keep only the stations that also appear in the Connections dataset, then
# renumber the rows from zero.
connection_stations = set(df_connections["Station from (A)"])
is_connected = df_stations["Station"].isin(connection_stations)
df_valid_stations = df_stations.loc[is_connected].reset_index(drop=True)

In [23]:
# Reverse check: which stations are in Connections but missing from the
# filtered Stations data? An empty set means every one was matched.
connection_stations - set(df_valid_stations["Station"])

set()

In [24]:
# Number of stations remaining after the filter
df_valid_stations.shape[0]

265

In [25]:
# Attach the line names from the Connections dataset to the station rows,
# giving one row per distinct (station, line) pair.
# NOTE(review): only "Station from (A)" feeds this lookup, so a line on which a
# station appears solely as "Station to (B)" would be missed -- confirm against
# the Connections data.
station_lines = df_connections[["Line", "Station from (A)"]].drop_duplicates()
df_stations_with_lines = pd.merge(
    df_valid_stations,
    station_lines,
    how="inner",
    left_on="Station",
    right_on="Station from (A)",
)
len(df_stations_with_lines)

369

In [26]:
# Preview the merged frame: stations on several lines appear once per line
df_stations_with_lines.head(n=20)

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode,Line,Station from (A)
0,Acton Town,519457,179639,51.503071,-0.280303,3,W3 8HN,Piccadilly,Acton Town
1,Acton Town,519457,179639,51.503071,-0.280303,3,W3 8HN,District,Acton Town
2,Aldgate,533629,181246,51.514342,-0.075627,1,EC3N 1AH,Metropolitan,Aldgate
3,Aldgate,533629,181246,51.514342,-0.075627,1,EC3N 1AH,Circle,Aldgate
4,Aldgate East,533809,181333,51.515082,-0.073001,1,E1 7PT,Hammersmith & City,Aldgate East
5,Aldgate East,533809,181333,51.515082,-0.073001,1,E1 7PT,District,Aldgate East
6,Alperton,518025,183849,51.541209,-0.299516,4,HA0 4LL,Piccadilly,Alperton
7,Amersham,496454,198181,51.674128,-0.606514,9,HP6 5AZ,Metropolitan,Amersham
8,Angel,531497,183263,51.532968,-0.105581,1,N1 8XB,Northern,Angel
9,Archway,529356,186827,51.56549,-0.135122,23,N19 5RQ,Northern,Archway


In [29]:
# Build the combined station/line name for every row (e.g. "Acton Town
# (Piccadilly)" in the preview below). Pairing the two columns with zip avoids
# a row-wise DataFrame.apply, which is slower and returns a DataFrame rather
# than a Series when the frame is empty.
station_line_names = [
    add_line_to_station(station, line)
    for station, line in zip(
        df_stations_with_lines["Station"], df_stations_with_lines["Line"]
    )
]

# Replace the plain station name with the combined one, then drop the helper
# columns that the merge brought in.
df_station_and_line = (
    df_stations_with_lines
    .assign(Station=station_line_names)
    .drop(columns=["Line", "Station from (A)"])
)
df_station_and_line.head()

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode
0,Acton Town (Piccadilly),519457,179639,51.503071,-0.280303,3,W3 8HN
1,Acton Town (District),519457,179639,51.503071,-0.280303,3,W3 8HN
2,Aldgate (Metropolitan),533629,181246,51.514342,-0.075627,1,EC3N 1AH
3,Aldgate (Circle),533629,181246,51.514342,-0.075627,1,EC3N 1AH
4,Aldgate East (Hammersmith & City),533809,181333,51.515082,-0.073001,1,E1 7PT


In [30]:
# Only a few minor changes remain at this point, so the frame is saved here and
# those changes are made by hand on the saved file.
# NOTE: re-running this cell rewrites the file and would discard any manual
# edits already made to it.
clean_stations_csv = r"../data/processed/stations_clean.csv"
df_station_and_line.to_csv(clean_stations_csv)