# Interchange Data Processing
The purpose of this notebook is to process the interchange data into a format that can be uploaded to the Neo4j database.

In [36]:
import pandas as pd
import numpy as np

In [6]:
# Load the raw interchange export. The file is decoded as Windows-1252 and its
# first row is used as the column header.
df_raw = pd.read_csv(
    r"../data/raw/interchange_data_raw.csv",
    header=0,
    encoding="windows-1252",
)
df_raw.head(10)

Unnamed: 0,STATION NAME:,"DETAILS OF MAXIMUM INTERCHANGE VALUES BETWEEN LINES (where appropriate) – all values apply in both directions. However, in some cases there may be lower values according to station layout."
0,Abbey Road,
1,Acton Central,
2,Acton Town,District <> Piccadilly line: 2 minutes
3,Aldgate,Connections between Circle and Metropolitan li...
4,Aldgate East,Connections between District and Hammersmith &...
5,All Saints,
6,Alperton,
7,Amersham,Connections between Metropolitan and Chiltern ...
8,Anerley,Connections between London Overground and Sout...
9,,


In [7]:
# Replace the long headers from the source file with short column names.
# NOTE: this assigns onto df_raw in place, so the frame must have exactly
# two columns or the assignment raises.
new_col_names = ['station', 'interchange']
df_raw.columns = new_col_names
df_raw.head(5)

Unnamed: 0,station,interchange
0,Abbey Road,
1,Acton Central,
2,Acton Town,District <> Piccadilly line: 2 minutes
3,Aldgate,Connections between Circle and Metropolitan li...
4,Aldgate East,Connections between District and Hammersmith &...


In [19]:
# Drop every row that has a missing value in any column, then renumber the
# remaining rows from zero so the index has no gaps.
df_no_nan_rows = (
    df_raw
    .dropna(how='any', axis=0)
    .reset_index(drop=True)
)
df_no_nan_rows.head(10)

Unnamed: 0,station,interchange
0,Acton Town,District <> Piccadilly line: 2 minutes
1,Aldgate,Connections between Circle and Metropolitan li...
2,Aldgate East,Connections between District and Hammersmith &...
3,Amersham,Connections between Metropolitan and Chiltern ...
4,Anerley,Connections between London Overground and Sout...
5,Baker Street,Bakerloo to Circle: 8 minutes
6,Baker Street,Bakerloo to Hammersmith & City: 8 minutes
7,Baker Street,Bakerloo to Jubilee: 2 minutes
8,Baker Street,Bakerloo to Metropolitan: 3 minutes
9,Baker Street,Circle to Hammersmith & City: 2 minutes


In [16]:
# Save the NaN-free rows to disk so they can be reviewed by hand.
df_no_nan_rows.to_csv(
    r"../data/raw/interchange_data_raw_v2.csv",
    encoding="UTF-8",
)

In [21]:
# Work on an independent copy (DataFrame.copy is deep by default) so the
# columns added below do not alter df_no_nan_rows.
df_interchange_data = df_no_nan_rows.copy()
df_interchange_data

Unnamed: 0,station,interchange
0,Acton Town,District <> Piccadilly line: 2 minutes
1,Aldgate,Connections between Circle and Metropolitan li...
2,Aldgate East,Connections between District and Hammersmith &...
3,Amersham,Connections between Metropolitan and Chiltern ...
4,Anerley,Connections between London Overground and Sout...
...,...,...
338,Wimbledon,District to South West Trains: 5 minutes
339,Wimbledon,District to Tramlink: 4 minutes
340,Wimbledon,South West Trains to Tramlink: 5 minutes
341,Woodford,Connections between different branches: 3 minutes


In [38]:
def retrieve_line_from(x):
    """Extract the origin line from an interchange description.

    The origin is everything that precedes the first occurrence of " to "
    in ``x`` (for example "Bakerloo" in "Bakerloo to Circle: 8 minutes").

    Parameters
    ----------
    x : str
        Interchange description text.

    Returns
    -------
    str or float
        The text before the first " to ", or ``np.nan`` when ``x`` does not
        contain " to " at all.
    """
    origin, separator, _ = x.partition(" to ")
    # partition() yields an empty separator when " to " never occurs.
    if not separator:
        return np.nan
    return origin
    
# Add the origin line for every row; descriptions without " to " get NaN.
df_interchange_data['line_from'] = df_interchange_data['interchange'].apply(retrieve_line_from)
df_interchange_data.head(10)

Unnamed: 0,station,interchange,line_from
0,Acton Town,District <> Piccadilly line: 2 minutes,
1,Aldgate,Connections between Circle and Metropolitan li...,
2,Aldgate East,Connections between District and Hammersmith &...,
3,Amersham,Connections between Metropolitan and Chiltern ...,
4,Anerley,Connections between London Overground and Sout...,
5,Baker Street,Bakerloo to Circle: 8 minutes,Bakerloo
6,Baker Street,Bakerloo to Hammersmith & City: 8 minutes,Bakerloo
7,Baker Street,Bakerloo to Jubilee: 2 minutes,Bakerloo
8,Baker Street,Bakerloo to Metropolitan: 3 minutes,Bakerloo
9,Baker Street,Circle to Hammersmith & City: 2 minutes,Circle


In [42]:
def retrieve_line_to(x):
    """Extract the destination line from an interchange description.

    The destination is the text between the first " to " and the first colon
    that follows it (for example "Circle" in "Bakerloo to Circle: 8 minutes").

    Parameters
    ----------
    x : str
        Interchange description text.

    Returns
    -------
    str or None
        The destination text, or ``None`` when ``x`` has no " to " or has no
        colon after the " to ".
    """
    marker = " to "
    idx_to = x.find(marker)
    if idx_to == -1:
        return None

    start = idx_to + len(marker)
    # Only a colon *after* the marker can terminate the destination name.
    # Searching from the start of the string would pick up an earlier colon
    # and produce an empty slice instead of a real value or None.
    idx_colon = x.find(":", start)
    if idx_colon == -1:
        return None
    return x[start:idx_colon]
    
# Add the destination line for every row; rows that cannot be parsed get None.
df_interchange_data['line_to'] = df_interchange_data['interchange'].apply(retrieve_line_to)

In [43]:
# Preview the first ten rows to check the line_from / line_to columns.
df_interchange_data.head(10)

Unnamed: 0,station,interchange,line_from,line_to
0,Acton Town,District <> Piccadilly line: 2 minutes,,
1,Aldgate,Connections between Circle and Metropolitan li...,,
2,Aldgate East,Connections between District and Hammersmith &...,,
3,Amersham,Connections between Metropolitan and Chiltern ...,,
4,Anerley,Connections between London Overground and Sout...,,
5,Baker Street,Bakerloo to Circle: 8 minutes,Bakerloo,Circle
6,Baker Street,Bakerloo to Hammersmith & City: 8 minutes,Bakerloo,Hammersmith & City
7,Baker Street,Bakerloo to Jubilee: 2 minutes,Bakerloo,Jubilee
8,Baker Street,Bakerloo to Metropolitan: 3 minutes,Bakerloo,Metropolitan
9,Baker Street,Circle to Hammersmith & City: 2 minutes,Circle,Hammersmith & City


In [53]:
def retrieve_duration(x):
    """Extract the interchange duration from an interchange description.

    The duration is the text between the last colon that precedes the first
    " minute" marker and that marker, with surrounding whitespace removed
    (for example "8" in "Bakerloo to Circle: 8 minutes"). Both " minute" and
    " minutes" are recognised.

    Parameters
    ----------
    x : str
        Interchange description text.

    Returns
    -------
    str or None
        The duration as text (it is not converted to a number), or ``None``
        when ``x`` has no " minute" marker or no colon before that marker.
    """
    # " minute" is a prefix of " minutes", so this matches singular and plural.
    idx_mins = x.find(" minute")
    if idx_mins == -1:
        return None

    # Use the colon closest to the duration so that an earlier colon in the
    # description does not drag extra text into the result.
    idx_colon = x.rfind(":", 0, idx_mins)
    if idx_colon == -1:
        return None

    # The source text has a space after the colon; strip it so values are
    # "8" rather than " 8".
    return x[idx_colon + 1:idx_mins].strip()
    
# Add the interchange duration for every row; the values are stored as text.
df_interchange_data['duration_mins'] = df_interchange_data['interchange'].apply(retrieve_duration)

In [54]:
# Preview the first ten rows to check the duration_mins column.
df_interchange_data.head(10)

Unnamed: 0,station,interchange,line_from,line_to,duration_mins
0,Acton Town,District <> Piccadilly line: 2 minutes,,,2
1,Aldgate,Connections between Circle and Metropolitan li...,,,4
2,Aldgate East,Connections between District and Hammersmith &...,,,2
3,Amersham,Connections between Metropolitan and Chiltern ...,,,3
4,Anerley,Connections between London Overground and Sout...,,,4
5,Baker Street,Bakerloo to Circle: 8 minutes,Bakerloo,Circle,8
6,Baker Street,Bakerloo to Hammersmith & City: 8 minutes,Bakerloo,Hammersmith & City,8
7,Baker Street,Bakerloo to Jubilee: 2 minutes,Bakerloo,Jubilee,2
8,Baker Street,Bakerloo to Metropolitan: 3 minutes,Bakerloo,Metropolitan,3
9,Baker Street,Circle to Hammersmith & City: 2 minutes,Circle,Hammersmith & City,2
