# Station Data Processing
The purpose of this notebook is to clean the Station dataset so that it is ready for use in the Neo4j database, and to ensure that it aligns with the data in the Connections dataset.

In [11]:
import pandas as pd
import numpy as np
from common_processing import clean_station

In [12]:
# Load the raw station extract from disk.
df_raw = pd.read_csv(filepath_or_buffer=r"../data/raw/station_data_raw.csv")

# Preview the first ten rows to sanity-check the columns that were read.
df_raw.head(n=10)

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode
0,Abbey Road,539081,183352,51.531952,0.003723,3,E15 3NB
1,Abbey Wood,547297,179002,51.490784,0.120272,4,SE2 9RH
2,Acton Central,520613,180299,51.508757,-0.26343,2,W3 6BH
3,Acton Main Line,520296,181196,51.516886,-0.26769,3,W3 9EH
4,Acton Town,519457,179639,51.503071,-0.280303,3,W3 8HN
5,Addington Village,537082,163744,51.356239,-0.032665,3456,CR0 5AR
6,Addiscombe,534190,166290,51.379808,-0.073213,3456,CR0 7AA
7,Albany Park,547903,172902,51.435816,0.126445,5,DA5 3HP
8,Aldgate,533629,181246,51.514342,-0.075627,1,EC3N 1AH
9,Aldgate East,533809,181333,51.515082,-0.073001,1,E1 7PT


In [13]:
# Drop duplicate stations (the duplicates are on station name, not on line)
stations_to_drop = {
    "Edgware Road (Bakerloo)",
    "Hammersmith (Met)"
}

mask = ~(df_raw["Station"].isin(stations_to_drop))
# Take an explicit copy so that the assignments below modify an independent
# frame rather than a filtered slice of df_raw (this avoids pandas'
# SettingWithCopyWarning and its ambiguous view-vs-copy semantics).
df_stations = df_raw[mask].copy()

# Common station processing
df_stations["Station"] = df_stations["Station"].apply(clean_station)

# Additional processing: manual rename of the row labelled 519.
# Guard first: assigning through .loc with a label that does not exist would
# silently append a new, mostly-NaN row instead of failing.
# NOTE(review): 519 is a hard-coded row label from the raw file; selecting the
# row by station name would be more robust -- confirm the cleaned name first.
assert 519 in df_stations.index, "row label 519 not found in df_stations"
df_stations.loc[519, "Station"] = "St James Park"

# Saving the above for review
df_stations.to_csv(r"../data/raw/station_data_raw_v2.csv")

In [14]:
# Load the cleaned Connections data; stations that never appear in it are
# filtered out of the Station data further down.
df_connections = pd.read_csv(filepath_or_buffer=r"../data/processed/connections_clean.csv")

# Every station name that occurs at either end of a connection.
connection_stations = (
    set(df_connections["Station from (A)"])
    | set(df_connections["Station to (B)"])
)

In [15]:
# Number of distinct station names in connection_stations (the set built from
# both endpoint columns of the Connections data).
len(connection_stations)

265

In [16]:
# Keep only the stations whose name also appears in the Connections data,
# then renumber the rows from zero.
mask = df_stations["Station"].isin(connection_stations)
df_valid_stations = df_stations.loc[mask].reset_index(drop=True)

In [17]:
# Only a handful of minor changes remain at this point, so the frame is
# written out here and those last changes are made by hand.
# NOTE(review): if the manual edits are made to this same file, re-running
# this cell will overwrite them -- confirm the intended workflow.
df_valid_stations.to_csv(path_or_buf=r"../data/processed/stations_clean.csv")

In [18]:
# Stations that appear in the Connections data but are missing from the
# cleaned Station data (an empty set means every one is covered).
connection_stations - set(df_valid_stations["Station"])

set()