<img src="https://imgur.com/VXlJ6iZ.jpg">

In [None]:
!pip install -q gmaps

In [None]:
import numpy as np
import pandas as pd
import gmaps
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import seaborn as sns
from math import radians, sin, cos, asin, sqrt
from private_data import gmap_api_key
import warnings
warnings.filterwarnings('ignore')

<div style="font-size:25pt; color:royalblue; font-variant:small-caps; font-weight:700">Importing data</div>

In [None]:
# Load the raw station table from the Kaggle dataset mounted under /kaggle/input.
data = pd.read_csv(
    "/kaggle/input/shinkansen-stations-in-japan/Shinkansen_stations_inJapan.csv"
)

<div style="font-size:25pt; color:royalblue; font-variant:small-caps; font-weight:700">Functions retrieving location</div>

In [None]:
def get_location(address):
    """
    Geocode a free-text address with the Nominatim geocoder.

    Returns ``[address, latitude, longitude]`` read from the geocoder's
    result, or ``None`` when the lookup yields nothing.
    """
    match = Nominatim(user_agent="null").geocode(address)
    if not match:
        return None
    return [match.address, match.latitude, match.longitude]

<div style="font-size:14pt; color:black; font-variant:small-caps; font-weight:100">The next function is tweaked so that it retrieves the correct address each time.</div>

In [None]:
def add_coordinates(df):
    """
    Geocode every station of ``df`` and store the result in three new columns.

    Each station is first looked up as
    "<Station Name> station, <Prefecture>, japan"; only when that lookup
    returns nothing is the shorter query "<Station Name> station" tried.
    Every query is sent at most once per station.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Station Name" and "Prefecture".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (it is modified in place) with the added
        columns "Location retrieved at", "Latitude" and "Longitude".
        Coordinates are stored as numbers. A station for which both
        lookups fail gets ``None`` / ``NaN`` in these columns, so it can be
        found afterwards with ``df[df["Latitude"].isna()]``.
    """
    addresses, latitudes, longitudes = [], [], []
    for name, prefecture in zip(df["Station Name"], df["Prefecture"]):
        # Keep the first answer instead of repeating the same request.
        found = get_location(name + " station, " + prefecture + ", japan")
        if found is None:
            found = get_location(name + " station")
        if found is None:
            # Unresolved station: keep the row aligned with explicit gaps.
            found = [None, np.nan, np.nan]
        addresses.append(found[0])
        latitudes.append(found[1])
        longitudes.append(found[2])

    # Plain lists keep each column's own type (text vs. float), whereas a
    # single mixed numpy array would coerce everything to strings.
    df["Location retrieved at"] = addresses
    df["Latitude"] = latitudes
    df["Longitude"] = longitudes
    return df

<div style="font-size:25pt; color:royalblue; font-variant:small-caps; font-weight:700">Data cleaning</div><br><div style="font-size:14pt; color:black; font-variant:small-caps; font-weight:100">Again, we need to manually change several locations so that we get the correct coordinates for each address</div>

In [None]:
# Strip the line-name suffixes. "-Shinknsen" (sic) is matched verbatim --
# presumably a misspelling present in the raw CSV; verify against the data.
data["Shinkansen_Line"] = data["Shinkansen_Line"].apply(
    lambda label: label.replace("_Shinkansen", "").replace("-Shinknsen", "")
)
data["Company"] = data["Company"].apply(lambda company: company.replace("_", " "))

# Switch to human-readable column headers (done in place on `data`).
data.rename(
    columns={
        "Station_Name": "Station Name",
        "Shinkansen_Line": "Shinkansen Line",
        "Distance from Tokyo st": "Distance from Tokyo station [km]",
    },
    inplace=True,
)

# Manual station-name overrides applied before geocoding.
data["Station Name"] = data["Station Name"].replace("Kurobe-Unazukionsen", "Shin-kurobe")
# NOTE(review): the row label 87 is hard-coded, so this relies on the CSV row
# order -- confirm which station this row holds before changing the input.
data.loc[87, "Station Name"] = '飯山駅'

<div style="font-size:25pt; color:royalblue; font-variant:small-caps; font-weight:700">Adding coordinates</div>

In [None]:
# Geocode every station; the returned frame carries the added
# "Location retrieved at", "Latitude" and "Longitude" columns.
data = add_coordinates(df=data)

<div style="font-size:14pt; color:royalblue; font-variant:small-caps; font-weight:700">There are stations that belong to two different Shinkansen lines</div><br><div style="font-size:14pt; color:black; font-variant:small-caps; font-weight:100">Here we separate them</div>

In [None]:
# Stations served by a single line are kept untouched.
data_A = data[~data["Shinkansen Line"].str.contains(",")].copy()

# Stations whose "Shinkansen Line" entry holds two comma-separated lines.
data_doubles = data[data["Shinkansen Line"].str.contains(",")].reset_index(drop=True)

# Cut every entry at its first comma: series_1 holds the part after the
# comma, series_2 the part before it.
double_lines_series_1 = pd.Series(
    [entry.partition(",")[2] for entry in data_doubles["Shinkansen Line"]]
)
double_lines_series_2 = pd.Series(
    [entry.partition(",")[0] for entry in data_doubles["Shinkansen Line"]]
)

# One copy of the shared stations per line they belong to.
data_doubles1 = data_doubles.assign(**{"Shinkansen Line": double_lines_series_1})
data_doubles2 = data_doubles.assign(**{"Shinkansen Line": double_lines_series_2})

data_B = pd.concat([data_doubles1, data_doubles2], axis=0, ignore_index=True)

# Final table: one row per (station, line) pair.
data = pd.concat([data_A, data_B], axis=0, ignore_index=True)

<div style="font-size:14pt; color:black; font-variant:small-caps; font-weight:100">The following function defines the segments corresponding to the path between two consecutive train stations</div>

In [None]:
def gmaps_segments(df, color):
    """
    Build the ``gmaps.Line`` features joining consecutive rows of ``df``.

    Row ``i`` is connected to row ``i + 1`` using the "Latitude" and
    "Longitude" columns (values are converted with ``float``). All lines
    share the stroke colour ``color``. A frame with fewer than two rows
    yields an empty list.
    """
    stops = list(zip(df["Latitude"], df["Longitude"]))
    lines = []
    # Pair every stop with the one that follows it.
    for (lat_from, lon_from), (lat_to, lon_to) in zip(stops, stops[1:]):
        lines.append(
            gmaps.Line(
                start=(float(lat_from), float(lon_from)),
                end=(float(lat_to), float(lon_to)),
                stroke_weight=3.0,
                stroke_color=color,
                stroke_opacity=1,
            )
        )
    return lines

<div style="font-size:14pt; color:black; font-variant:small-caps; font-weight:100">Here we define the Shinkansen line colors</div>

In [None]:
# Container for the map segments of each line, keyed by line name.
shinkansen_lines = {}

# Hex colour used to draw each Shinkansen line, keyed by line name.
line_colors = dict(
    Tokaido='#0068B6',
    Sanyo='#0B416A',
    Tohoku='#059646',
    Joetsu='#38614A',
    Yamagata='#66B68A',
    Akita='#99CFB1',
    Hokuriku='#BC5127',
    Kyushu='#FF1E23',
    Hokkaido='#8DC21F',
)

<div style="font-size:14pt; color:black; font-variant:small-caps; font-weight:100">Here we create the segments to be plotted on the map</div>

In [None]:
# Build the drawable segments of every line. Stations are ordered by their
# distance from Tokyo station so that consecutive rows get joined.
shinkansen_lines = {
    line_name: gmaps_segments(
        data.loc[data["Shinkansen Line"] == line_name].sort_values(
            by="Distance from Tokyo station [km]"
        ),
        line_colors[line_name],
    )
    for line_name in data["Shinkansen Line"].unique()
}

<div style="font-size:14pt; color:black; font-variant:small-caps; font-weight:100">This function will add segments to the map</div>

In [None]:
def add_gmaps_layer(figure, segments):
    """
    Add one drawing layer to ``figure`` per entry of ``segments``.

    ``segments`` is a mapping whose values are the feature lists handed to
    ``gmaps.drawing_layer``; its keys are not used. Returns ``None``.
    """
    for features in segments.values():
        layer = gmaps.drawing_layer(features=features)
        figure.add_layer(layer)

<div style="font-size:25pt; color:royalblue; font-variant:small-caps; font-weight:700">The Shinkansen lines path on the map</div><br><div style="font-size:14pt; color:black; font-variant:small-caps; font-weight:100">Sadly, Google Maps does not render in a Kaggle notebook (or I did not find a way to make it work), so what is shown here is a simple image. The map will work fine if you download the notebook and run it in Jupyter Notebook (not JupyterLab)</div>

In [None]:
# Register the API key (imported from private_data) before building the map.
gmaps.configure(api_key=gmap_api_key)

# Widget layout of the rendered map.
layout = dict(
    width='900px',
    height='900px',
    padding='3px',
    margin="0px 0px 0px 50px",
    border='0px solid black',
)

fig = gmaps.figure(
    layout=layout,
    map_type="TERRAIN",
    center=(39, 138),
    zoom_level=6,
    display_toolbar=False,
)
# Draw every Shinkansen line on top of the base map.
add_gmaps_layer(fig, shinkansen_lines)
fig

In [None]:
def haversine_distance(lon1, lat1, lon2, lat2, radius_km=6371):
    """
    Great-circle distance between two points using the haversine formula.

    Note the argument order: longitude comes first, then latitude.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in degrees.
    radius_km : float, optional
        Radius of the sphere. The default, 6371, is the mean Earth radius
        in kilometres, so the result is in kilometres.

    Returns
    -------
    float
        Distance along the sphere, in the unit of ``radius_km``.
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make asin raise a domain error; clamp it to the valid range.
    a = min(1.0, a)
    return 2 * radius_km * asin(sqrt(a))

In [None]:
# Approximate length of every line: the sum of the great-circle distances
# between consecutive stations, ordered by their distance from Tokyo station.
lines_sizes = {}

for line in data["Shinkansen Line"].unique():
    data_t = (
        data[data["Shinkansen Line"] == line]
        .sort_values(by="Distance from Tokyo station [km]")
        .reset_index()
    )
    stops = list(zip(data_t["Longitude"], data_t["Latitude"]))
    dists = 0
    # Walk the line stop by stop, adding each hop to the running total.
    for (lon_from, lat_from), (lon_to, lat_to) in zip(stops, stops[1:]):
        dists += haversine_distance(
            float(lon_from), float(lat_from), float(lon_to), float(lat_to)
        )
    lines_sizes[line] = dists

In [None]:
# Turn the {line: length} mapping into a two-column table, shortest line first.
line_km = pd.DataFrame(
    list(lines_sizes.items()), columns=["Shinkansen Line", "Size [km]"]
).sort_values(by="Size [km]")
# Attach each line's colour so the chart can reuse it.
line_km["Line color"] = line_km["Shinkansen Line"].map(line_colors)

In [None]:
# Set the axis-label size BEFORE the axes are created: the rc value is read
# when the axes are built, so setting it afterwards would only affect later
# figures (i.e. the labels would change size on a second run of this cell).
plt.rc('axes', labelsize=20)

fig, ax = plt.subplots(figsize=(12, 6))
# Horizontal bars, one per line, coloured with the line's own colour.
sns.barplot(
    y="Shinkansen Line",
    x="Size [km]",
    data=line_km,
    palette=list(line_km["Line color"]),
    orient="h",
    ax=ax,
)
ax.tick_params(axis='both', which='major', labelsize=15, labelrotation=12.)
ax.set_xlim(0, 750)
ax.grid()
ax.set_title("Shinkansen lines length [km]", fontdict={"fontsize": 20}, pad=30)
# Keep the grid behind the bars.
ax.set_axisbelow(True)