# Cultural Scorecard in NYC based on the MTA

In [1]:
# start a Spark session
import pyspark

session = pyspark.sql.SparkSession
# use all available memory for the driver
spark = session.builder.config('spark.driver.memory','10g').getOrCreate()
spark

In [2]:
# Set dataset paths
path = "Final_Data/final_joined_data.csv"

# Function to read in CSV files
def read_data(path):
    df = spark.read\
              .option("header", True)\
              .csv(path)
    df.printSchema()
    return df

In [3]:
final_df = read_data(path)

root
 |-- station_complex: string (nullable = true)
 |-- station: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- GTFS Stop ID: string (nullable = true)
 |-- Division: string (nullable = true)
 |-- CBD: string (nullable = true)
 |-- total_ridership: string (nullable = true)
 |-- transit_date: string (nullable = true)
 |-- str_Coordinates: string (nullable = true)
 |-- cultural_landmark: string (nullable = true)
 |-- historical_place: string (nullable = true)
 |-- monument: string (nullable = true)
 |-- museum: string (nullable = true)
 |-- performing_arts_theater: string (nullable = true)
 |-- sculpture: string (nullable = true)
 |-- art_gallery: string (nullable = true)
 |-- art_studio: string (nullable = true)
 |-- auditorium: string (nullable = true)



In [4]:
final_df.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,cultural_landmark,historical_place,monument,museum,performing_arts_theater,sculpture,art_gallery,art_studio,auditorium
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0


In [5]:
from pyspark.sql.types import DoubleType, IntegerType
## Convert columns to Double
final_df = final_df.withColumn("total_ridership", final_df["total_ridership"].cast(DoubleType()))\
                   .withColumn("art_gallery", final_df["art_gallery"].cast(DoubleType()))\
                   .withColumn("art_studio", final_df["art_studio"].cast(DoubleType()))\
                   .withColumn("auditorium", final_df["auditorium"].cast(DoubleType()))\
                   .withColumn("cultural_landmark", final_df["cultural_landmark"].cast(DoubleType()))\
                   .withColumn("historical_place", final_df["historical_place"].cast(DoubleType()))\
                   .withColumn("monument", final_df["monument"].cast(DoubleType()))\
                   .withColumn("museum", final_df["museum"].cast(DoubleType()))\
                   .withColumn("performing_arts_theater", final_df["performing_arts_theater"].cast(DoubleType()))\
                   .withColumn("sculpture", final_df["sculpture"].cast(DoubleType()))

## Basic Visualizations

In [6]:
import ipywidgets

unique_complex_day = final_df.dropDuplicates(["station_complex", "transit_date"])
unique_complex = final_df.dropDuplicates(["station_complex"])

In [7]:
from pyspark import pandas as ps
unique_complex_day_pd = unique_complex_day.toPandas()



In [8]:
import plotly.io as pio
import plotly.express as px
pio.renderers.default = 'iframe'

In [9]:
fig = px.histogram(unique_complex_day_pd, x="total_ridership", 
                   title="Histogram of Total Ridership per Station Complex (2025)", 
                   labels={
                     "total_ridership": "Total Daily Ridership"
                     })
fig.show()

In [10]:
!pip install --upgrade kaleido



In [11]:
columns = ["art_gallery", "art_studio", "auditorium", "cultural_landmark", "historical_place", 
           "monument", "museum", "performing_arts_theater", "sculpture"]

fig2 = px.box(unique_complex[columns], title="Box Plots of Distribution of Nearby Places by Type", 
                   labels={
                     "variable": "Types of Places"
                     })

fig2.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = columns,
        ticktext = ["Art Gallery", "Art Studio", "Auditorium", "Cultural Landmark", "Historical Place",
                    "Monument", "Museum", "Performing Arts Theater", "Sculpture"]
    )
)
fig2.show()

## Add Crowdedness Score

In [12]:
# Take log of ridership to see if it normalizes
from pyspark.sql.functions import log
final_df = final_df.withColumn("log(ridership)", log(final_df.total_ridership))

In [13]:
final_df.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,cultural_landmark,historical_place,monument,museum,performing_arts_theater,sculpture,art_gallery,art_studio,auditorium,log(ridership)
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0,7.745003
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0,7.745003
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0,7.956477
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0,7.956477
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0,8.140316


In [14]:
fig3 = px.histogram(final_df, x="log(ridership)", 
                   title="Histogram of Log(Total Ridership per Station Complex) (2025)", 
                   labels={
                     "log(ridership)": "Log(Total Daily Ridership)"
                     })
fig3.show()

In [15]:
# Source: https://stackoverflow.com/questions/68496993/probnorm-function-equivalent-in-pyspark
import pandas as pd
from scipy.stats import norm
from pyspark.sql.functions import udf, col, lit, min, max

# unique_complex_day = final_df.dropDuplicates(["station_complex", "transit_date"])
# unique_complex = final_df.dropDuplicates(["station_complex"])

drop_empty = final_df.filter(col("log(ridership)") >= 6.0) ## Drop extreme values

In [16]:
unique_complex_day.count()

8064

In [17]:
drop_empty.count()

13298

In [18]:
# Calculate Min and Max for Log(ridership)
rides_min = drop_empty.select(min(col("log(ridership)"))).collect()[0][0]
rides_max = drop_empty.select(max(col("log(ridership)"))).collect()[0][0]

# Perform Normalization
drop_empty = drop_empty.withColumn("pnorm(ridership)", (col("log(ridership)") - rides_min) / (rides_max - rides_min))

In [19]:
fig3 = px.histogram(drop_empty, x="pnorm(ridership)", 
                   title="Histogram of Probability Normalized (Total Ridership per Station Complex) (2025)", 
                   labels={
                     "pnorm(ridership)": "PNorm(Total Daily Ridership)"
                     })
fig3.show()

In [20]:
# Add crowdedness
score_df = drop_empty.withColumn("crowd_score", 10*(1 - col("pnorm(ridership)")))

In [21]:
fig3 = px.histogram(score_df, x="crowd_score", 
                   title="Histogram of Crowdedness Score (2025)", 
                   labels={
                     "crowd_score": "Crowdedness Score (0-10)"
                     })
fig3.show()

In [22]:
score_df.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,...,monument,museum,performing_arts_theater,sculpture,art_gallery,art_studio,auditorium,log(ridership),pnorm(ridership),crowd_score
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,1.0,2.0,5.0,3.0,5.0,2.0,0.0,7.745003,0.288724,7.112764
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,1.0,2.0,5.0,3.0,5.0,2.0,0.0,7.745003,0.288724,7.112764
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,1.0,2.0,5.0,3.0,5.0,2.0,0.0,7.956477,0.323792,6.762083
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,1.0,2.0,5.0,3.0,5.0,2.0,0.0,7.956477,0.323792,6.762083
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",...,1.0,2.0,5.0,3.0,5.0,2.0,0.0,8.140316,0.354277,6.457229


## Add Quantity Score

In [23]:
#                           Weight
# art_gallery	            0.2
# art_studio	            0.1
# auditorium	            0.025
# cultural_landmark	        0.15
# historical_place	        0.05
# monument	                0.1
# museum	                0.3
# performing_arts_theater	0.05
# sculpture	                0.025

quant_df = score_df.withColumn("quant_score", 
                               (col("art_gallery") * 0.2 + col("art_studio") * 0.1 + 
                                col("auditorium") * 0.025 + col("cultural_landmark") * 0.15 + 
                                col("historical_place") * 0.05 + col("monument") * 0.1 + 
                                col("museum") * 0.3 + col("performing_arts_theater") * 0.05 + 
                                col("sculpture") * 0.025))

In [24]:
quant_df.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,...,museum,performing_arts_theater,sculpture,art_gallery,art_studio,auditorium,log(ridership),pnorm(ridership),crowd_score,quant_score
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,2.0,5.0,3.0,5.0,2.0,0.0,7.745003,0.288724,7.112764,2.625
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,2.0,5.0,3.0,5.0,2.0,0.0,7.745003,0.288724,7.112764,2.625
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,2.0,5.0,3.0,5.0,2.0,0.0,7.956477,0.323792,6.762083,2.625
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,2.0,5.0,3.0,5.0,2.0,0.0,7.956477,0.323792,6.762083,2.625
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",...,2.0,5.0,3.0,5.0,2.0,0.0,8.140316,0.354277,6.457229,2.625


In [25]:
# Calculate Min and Max for Quantity
quant_min = quant_df.select(min(col("quant_score"))).collect()[0][0]
quant_max = quant_df.select(max(col("quant_score"))).collect()[0][0]

# Perform Normalization
quant_df = quant_df.withColumn("quant_score", 65 * (col("quant_score") - quant_min) / (quant_max - quant_min))

In [26]:
fig4 = px.histogram(quant_df, x="quant_score", 
                   title="Histogram of Quantity Score (2025)", 
                   labels={
                     "quant_score": "Normalized Quantity Score"
                     })
fig4.show()

In [27]:
# Source: https://stackoverflow.com/questions/45863360/curried-udf-pyspark
def udf_mult(c):
    def udf_inner(col):
        return udf(
            lambda a, b: a if (b != 0) else 0.0, DoubleType())(c, col)

    return  udf_inner

var_df = quant_df.withColumn("variety_score", 25 * (udf_mult(lit(0.2))(col("art_gallery")) + 
                                            udf_mult(lit(0.1))(col("art_studio")) + 
                                            udf_mult(lit(0.025))(col("auditorium")) + 
                                            udf_mult(lit(0.15))(col("cultural_landmark")) + 
                                            udf_mult(lit(0.05))(col("historical_place")) + 
                                            udf_mult(lit(0.1))(col("monument")) +
                                            udf_mult(lit(0.3))(col("museum")) + 
                                            udf_mult(lit(0.05))(col("performing_arts_theater")) + 
                                            udf_mult(lit(0.025))(col("sculpture"))))

In [28]:
var_df.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,...,performing_arts_theater,sculpture,art_gallery,art_studio,auditorium,log(ridership),pnorm(ridership),crowd_score,quant_score,variety_score
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,5.0,3.0,5.0,2.0,0.0,7.745003,0.288724,7.112764,9.32377,24.375
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,5.0,3.0,5.0,2.0,0.0,7.745003,0.288724,7.112764,9.32377,24.375
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,5.0,3.0,5.0,2.0,0.0,7.956477,0.323792,6.762083,9.32377,24.375
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,5.0,3.0,5.0,2.0,0.0,7.956477,0.323792,6.762083,9.32377,24.375
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",...,5.0,3.0,5.0,2.0,0.0,8.140316,0.354277,6.457229,9.32377,24.375


In [29]:
fig5 = px.histogram(var_df, x="variety_score", 
                   title="Histogram of Variety Score (2025)", 
                   labels={
                     "variety_score": "Normalized Variety Score"
                     })
fig5.show()

In [30]:
scorecard = var_df.withColumn("avail_score", col("crowd_score") + col("quant_score") + col("variety_score"))

In [31]:
scorecard.columns

['station_complex',
 'station',
 'Route',
 'Borough',
 'GTFS Stop ID',
 'Division',
 'CBD',
 'total_ridership',
 'transit_date',
 'str_Coordinates',
 'cultural_landmark',
 'historical_place',
 'monument',
 'museum',
 'performing_arts_theater',
 'sculpture',
 'art_gallery',
 'art_studio',
 'auditorium',
 'log(ridership)',
 'pnorm(ridership)',
 'crowd_score',
 'quant_score',
 'variety_score',
 'avail_score']

In [32]:
scorecard.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,...,sculpture,art_gallery,art_studio,auditorium,log(ridership),pnorm(ridership),crowd_score,quant_score,variety_score,avail_score
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,3.0,5.0,2.0,0.0,7.745003,0.288724,7.112764,9.32377,24.375,40.811534
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,3.0,5.0,2.0,0.0,7.745003,0.288724,7.112764,9.32377,24.375,40.811534
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,3.0,5.0,2.0,0.0,7.956477,0.323792,6.762083,9.32377,24.375,40.460853
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,3.0,5.0,2.0,0.0,7.956477,0.323792,6.762083,9.32377,24.375,40.460853
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",...,3.0,5.0,2.0,0.0,8.140316,0.354277,6.457229,9.32377,24.375,40.156


In [33]:
scorecard.orderBy(col("station_complex")).where(col("station").contains(" St") & col("transit_date").contains("2025-01-07")).limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,...,sculpture,art_gallery,art_studio,auditorium,log(ridership),pnorm(ridership),crowd_score,quant_score,variety_score,avail_score
0,103 St (1),103 St,1,M,119,IRT,False,8693.0,2025-01-07T00:00:00.000,"(40.799446, -73.968379)",...,20.0,20.0,8.0,10.0,9.070273,0.508489,4.91511,34.808743,25.0,64.723853
1,103 St (6),103 St,6,M,624,IRT,False,9483.0,2025-01-07T00:00:00.000,"(40.7906, -73.947478)",...,13.0,20.0,8.0,2.0,9.157256,0.522913,4.770869,34.719945,25.0,64.490815
2,"103 St (C,B)",103 St,B,M,A18,IND,False,2978.0,2025-01-07T00:00:00.000,"(40.796092, -73.961454)",...,20.0,20.0,11.0,8.0,7.999007,0.330844,6.691556,41.379781,25.0,73.071338
3,"103 St (C,B)",103 St,C,M,A18,IND,False,2978.0,2025-01-07T00:00:00.000,"(40.796092, -73.961454)",...,20.0,20.0,11.0,8.0,7.999007,0.330844,6.691556,41.379781,25.0,73.071338
4,103 St-Corona Plaza (7),103 St-Corona Plaza,7,Q,706,IRT,False,18261.0,2025-01-07T00:00:00.000,"(40.749865, -73.8627)",...,6.0,4.0,1.0,0.0,9.812523,0.631574,3.684261,11.898907,24.375,39.958168


## Get stations and lines with best scores

In [34]:
from pyspark.sql.functions import desc, mean, median, asc
scores = scorecard.dropDuplicates(["station_complex"]).groupBy("station_complex", "str_Coordinates").agg(median('avail_score')).orderBy(desc(col("median(avail_score)")))
scores_by_line = scorecard.groupBy("Route").agg(median('avail_score')).orderBy(desc(col("median(avail_score)")))

In [35]:
scores.limit(10).toPandas()

Unnamed: 0,station_complex,str_Coordinates,median(avail_score)
0,Astor Pl (6),"(40.730054, -73.99107)",93.6624
1,"8 St-NYU (R,W)","(40.730328, -73.992629)",93.591716
2,18 St (1),"(40.74104, -73.997871)",93.365472
3,Canal St (1),"(40.722854, -74.006277)",93.013861
4,"Prince St (R,W)","(40.724329, -73.997702)",93.002626
5,"W 4 St-Wash Sq (A,C,E,B,D,F,M)","(40.732338, -74.000495)",92.585082
6,23 St (1),"(40.744081, -73.995657)",92.377252
7,"Spring St (C,E)","(40.726227, -74.003739)",92.349997
8,Houston St (1),"(40.728251, -74.005367)",92.344062
9,Spring St (6),"(40.722301, -73.997141)",92.161327


In [36]:
scores_by_line.limit(5).toPandas()

Unnamed: 0,Route,median(avail_score)
0,W,87.08596
1,1,83.435366
2,E,73.958519
3,C,72.719707
4,B,71.386228


## Get stations and lines with worst scores

In [37]:
scorecard.dropDuplicates(["station_complex"]).groupBy("station_complex", "str_Coordinates").agg(median('avail_score')).orderBy(asc(col("median(avail_score)"))).limit(10).toPandas()

Unnamed: 0,station_complex,str_Coordinates,median(avail_score)
0,Howard Beach-JFK Airport (A),"(40.660476, -73.830301)",6.596865
1,Beach 67 St (A),"(40.590927, -73.796924)",10.099821
2,Beach 60 St (A),"(40.592374, -73.788522)",10.657567
3,Beach 36 St (A),"(40.595398, -73.768175)",11.159734
4,Beach 44 St (A),"(40.592943, -73.776013)",11.811981
5,Far Rockaway-Mott Av (A),"(40.603995, -73.755405)",12.449973
6,Beach 25 St (A),"(40.600066, -73.761353)",12.953129
7,New Lots Av (3),"(40.666235, -73.884079)",13.583838
8,"Gun Hill Rd (2,5)","(40.869526, -73.846384)",13.986416
9,Gun Hill Rd (5),"(40.869526, -73.846384)",14.597133


In [38]:
scorecard.groupBy("Route").agg(median('avail_score')).orderBy(asc(col("median(avail_score)"))).limit(5).toPandas()

Unnamed: 0,Route,median(avail_score)
0,5,36.070984
1,A,37.865473
2,Z,39.663769
3,J,39.789277
4,D,40.261197


In [39]:
scores_by_line.limit(10).toPandas()

Unnamed: 0,Route,median(avail_score)
0,W,87.08596
1,1,83.435366
2,E,73.958519
3,C,72.719707
4,B,71.386228
5,3,70.324347
6,S,66.883624
7,G,65.631475
8,R,62.870059
9,M,57.543413


In [40]:
# Save data
# scores.write.csv('Score_Results',header=True)
# scores_by_line.write.csv('Score_by_Line', header=True)

In [41]:
fig6 = px.histogram(scores, x="median(avail_score)", 
                   title="Histogram of Creative Availbility Score by Station (2025)", 
                   labels={
                     "median(avail_score)": "Creative Availability Score (0-100)"
                     })
fig6.show()

In [42]:
fig7 = px.histogram(scores_by_line, x="median(avail_score)", 
                   title="Histogram of Creative Availbility Score by Subway Line (2025)", 
                   labels={
                     "median(avail_score)": "Creative Availability Score (0-100)"
                     }, nbins=25)
fig7.show()

# Plotting NYC Subway Lines

In [43]:
!pip install pipwin



In [44]:
!pip install descartes
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import descartes



In [45]:
from pyspark.sql import Row

# Dummy dataframe for GeoJSON
df = spark.createDataFrame([
    Row(borough="Manhattan", b=1),
    Row(borough="Bronx", b=2),
    Row(borough="Brookyln", b=3),
    Row(borough="Queens", b=4),
    Row(borough="Staten Island", b=5),
])

In [46]:
import json
import plotly.graph_objects as go
from collections import defaultdict

# Load NYC GeoJSON file from GitHub: https://github.com/nycehs/NYC_geography/blob/master/borough.geo.json
with open("Final_Data/borough.geo.json") as f:
    boroughs_geojson = json.load(f)

borough_names = [f["properties"]["BoroName"] for f in boroughs_geojson["features"]]
color_values = [1, 2, 3, 4, 5]  # dummy values

# Create base figure with boroughs
fig = go.Figure()

fig.add_trace(go.Choropleth(
    geojson=boroughs_geojson,
    locations=borough_names,
    z=color_values,
    featureidkey="properties.BoroName",
    colorscale=["lightgray", "darkgray"],
    marker_line_width=0.5,
    marker_line_color="white",
    showscale=False
))

# Update locations and layout
fig.update_geos(
    fitbounds="locations",
    visible=False
)

fig.update_layout(
    height=700,
    margin={"r":0,"t":0,"l":0,"b":0}
)

fig.show()


In [47]:
# Load subway station GeoJSON from data.ny.gov and colors
# Source: https://data.ny.gov/Transportation/MTA-Subway-Stations/39hk-dx4f/about_data
# Export as GeoJSON
with open("Final_Data/MTA Subway Stations_20250806.geo.json") as f:
    stations_geojson = json.load(f)

# MTA official colors
mta_colors = {
    "E": "#0039A6",
    "B": "#FF6319", 
    "W": "#FCCC0A",
}

In [48]:
stations_geojson["features"]

[{'type': 'Feature',
  'properties': {'station_id': '1',
   'north_direction_label': 'Last Stop',
   'line': 'Astoria',
   'daytime_routes': 'N W',
   'complex_id': '1',
   'division': 'BMT',
   'ada_southbound': '0',
   'gtfs_stop_id': 'R01',
   'structure': 'Elevated',
   'ada_notes': None,
   'stop_name': 'Astoria-Ditmars Blvd',
   'gtfs_longitude': '-73.912034',
   'ada_northbound': '0',
   'ada': '0',
   'south_direction_label': 'Manhattan',
   'cbd': 'False',
   'borough': 'Q',
   'gtfs_latitude': '40.775036'},
  'geometry': {'type': 'Point', 'coordinates': [-73.912034, 40.775036]}},
 {'type': 'Feature',
  'properties': {'station_id': '2',
   'north_direction_label': 'Astoria',
   'line': 'Astoria',
   'daytime_routes': 'N W',
   'complex_id': '2',
   'division': 'BMT',
   'ada_southbound': '1',
   'gtfs_stop_id': 'R03',
   'structure': 'Elevated',
   'ada_notes': None,
   'stop_name': 'Astoria Blvd',
   'gtfs_longitude': '-73.917843',
   'ada_northbound': '1',
   'ada': '1',
   

## Get scores from `scores` DataFrame

In [49]:
# UDF to split station name and routes
def get_station(name):
    return name.split("(")[0].strip()

udf_get_station = udf(get_station)

scores = scorecard.groupBy("station_complex", "str_Coordinates", "Route").agg(median('avail_score')).orderBy(desc(col("median(avail_score)")))
scores = scores.withColumn("stop_name", udf_get_station(col("station_complex")))

In [50]:
scores.limit(10).toPandas()

Unnamed: 0,station_complex,str_Coordinates,Route,median(avail_score),stop_name
0,"8 St-NYU (R,W)","(40.730328, -73.992629)",R,93.699621,8 St-NYU
1,"8 St-NYU (R,W)","(40.730328, -73.992629)",W,93.699621,8 St-NYU
2,Astor Pl (6),"(40.730054, -73.99107)",6,93.663846,Astor Pl
3,18 St (1),"(40.74104, -73.997871)",1,93.327617,18 St
4,Canal St (1),"(40.722854, -74.006277)",1,93.201805,Canal St
5,"Prince St (R,W)","(40.724329, -73.997702)",R,93.15547,Prince St
6,"Prince St (R,W)","(40.724329, -73.997702)",W,93.15547,Prince St
7,"W 4 St-Wash Sq (A,C,E,B,D,F,M)","(40.732338, -74.000495)",A,92.679114,W 4 St-Wash Sq
8,"W 4 St-Wash Sq (A,C,E,B,D,F,M)","(40.732338, -74.000495)",B,92.679114,W 4 St-Wash Sq
9,"W 4 St-Wash Sq (A,C,E,B,D,F,M)","(40.732338, -74.000495)",C,92.679114,W 4 St-Wash Sq


In [51]:
# Get score based on stop name and route
scores.filter( (col("stop_name").contains("Lexington"))).select("*").show()

+--------------------+--------------------+-----+-------------------+------------------+
|     station_complex|     str_Coordinates|Route|median(avail_score)|         stop_name|
+--------------------+--------------------+-----+-------------------+------------------+
|Lexington Av/63 S...|(40.764629, -73.9...|    F|  89.13723280700466|Lexington Av/63 St|
|Lexington Av/63 S...|(40.764629, -73.9...|    Q|  89.13723280700466|Lexington Av/63 St|
+--------------------+--------------------+-----+-------------------+------------------+



In [52]:
# Group stations by line
line_stations = {}
for feature in stations_geojson["features"]:
    coords = feature["geometry"]["coordinates"]  # [lon, lat]
    props = feature["properties"]
    routes = props["daytime_routes"].split()

    for route in routes:
        if route in mta_colors.keys():
            lon, lat = coords[0], coords[1]
            stop_name = props["stop_name"]
            size = 1
            try: 
                size = scores.filter( (col("stop_name") == stop_name) & (col("Route") == route)).select(col("median(avail_score)")).first()["median(avail_score)"]            
                size = size / 10.0
            except:
                size = 1
            val = (lon, lat, stop_name, size)

            print(val) # Debug statement
            if route in line_stations.keys():
                line_stations[route].append(val)
            else:
                line_stations[route] = [val]

(-73.912034, 40.775036, 'Astoria-Ditmars Blvd', 3.811295073782773)
(-73.917843, 40.770258, 'Astoria Blvd', 4.308446487582657)
(-73.921479, 40.766779, '30 Av', 4.547316187242545)
(-73.925508, 40.76182, 'Broadway', 5.041404864864728)
(-73.929575, 40.756804, '36 Av', 5.880039980794761)
(-73.932755, 40.752882, '39 Av-Dutch Kills', 6.5142092807510865)
(-73.967258, 40.76266, 'Lexington Av/59 St', 1)
(-73.973347, 40.764811, '5 Av/59 St', 9.029061414768796)
(-73.980658, 40.764664, '57 St-7 Av', 8.945167545202306)
(-73.984139, 40.759901, '49 St', 8.83681241630772)
(-73.986754, 40.754672, 'Times Sq-42 St', 8.425415042623767)
(-73.98795, 40.749567, '34 St-Herald Sq', 8.437716199485127)
(-73.988691, 40.745494, '28 St', 8.735463172185408)
(-73.989344, 40.741303, '23 St', 9.167824403491688)
(-73.990568, 40.735736, '14 St-Union Sq', 8.970487221533833)
(-73.992629, 40.730328, '8 St-NYU', 9.36996211223294)
(-73.997702, 40.724329, 'Prince St', 9.315547002248014)
(-74.001775, 40.719527, 'Canal St', 8.929

In [53]:
# Add subway lines and stations to fig
for route, stations in line_stations.items():
    if not stations:
        continue

    # # Sort roughly north-to-south
    stations.sort(key=lambda x: (-x[1], x[0]))
    lons = [s[0] for s in stations]
    lats = [s[1] for s in stations]
    names = [s[2] for s in stations]
    sizes = [s[3] for s in stations]

    # Subway line
    fig.add_trace(go.Scattergeo(
        lon=lons, lat=lats,
        mode="lines",
        line=dict(width=2, color=mta_colors[route]),
        name=f"Line {route}"
    ))

    # Subway stations
    fig.add_trace(go.Scattergeo(
        lon=lons, lat=lats,
        mode="markers",
        marker=dict(size=sizes, color=mta_colors[route]),
        showlegend = False
    ))

In [54]:
fig.show()

### Swap stations to correct route order

#### Fix W Line

In [55]:
line_stations["W"]

[(-73.912034, 40.775036, 'Astoria-Ditmars Blvd', 3.811295073782773),
 (-73.917843, 40.770258, 'Astoria Blvd', 4.308446487582657),
 (-73.921479, 40.766779, '30 Av', 4.547316187242545),
 (-73.973347, 40.764811, '5 Av/59 St', 9.029061414768796),
 (-73.980658, 40.764664, '57 St-7 Av', 8.945167545202306),
 (-73.967258, 40.76266, 'Lexington Av/59 St', 1),
 (-73.925508, 40.76182, 'Broadway', 5.041404864864728),
 (-73.984139, 40.759901, '49 St', 8.83681241630772),
 (-73.929575, 40.756804, '36 Av', 5.880039980794761),
 (-73.986754, 40.754672, 'Times Sq-42 St', 8.425415042623767),
 (-73.932755, 40.752882, '39 Av-Dutch Kills', 6.5142092807510865),
 (-73.940202, 40.750582, 'Queensboro Plaza', 6.506805224279235),
 (-73.98795, 40.749567, '34 St-Herald Sq', 8.437716199485127),
 (-73.988691, 40.745494, '28 St', 8.735463172185408),
 (-73.989344, 40.741303, '23 St', 9.167824403491688),
 (-73.990568, 40.735736, '14 St-Union Sq', 8.970487221533833),
 (-73.992629, 40.730328, '8 St-NYU', 9.36996211223294),


In [56]:
station = line_stations["W"][3]
line_stations["W"][3] = line_stations["W"][6] # Swap with 59th and 5th with Broadway
line_stations["W"][6] = station

station = line_stations["W"][4]
line_stations["W"][4] = line_stations["W"][8] # Swap with 36 ave
line_stations["W"][8] = station

station = line_stations["W"][5]
line_stations["W"][5] = line_stations["W"][10] # Swap with 39 ave
line_stations["W"][10] = station

station = line_stations["W"][6]
line_stations["W"][6] = line_stations["W"][11] # Swap with queensboro
line_stations["W"][11] = station

station = line_stations["W"][7]
line_stations["W"][7] = line_stations["W"][10] # Swap with 59th / lex
line_stations["W"][10] = station

station = line_stations["W"][8]
line_stations["W"][8] = line_stations["W"][11] # Swap with 59th / 5th
line_stations["W"][11] = station

station = line_stations["W"][9]
line_stations["W"][9] = line_stations["W"][10] # Swap with queensboro
line_stations["W"][10] = station

fournine = line_stations["W"][9]
timesquare = line_stations["W"][10]
seventhave = line_stations["W"][11]

line_stations["W"][9] = seventhave
line_stations["W"][10] = fournine
line_stations["W"][11] = timesquare

In [57]:
line_stations["W"]

[(-73.912034, 40.775036, 'Astoria-Ditmars Blvd', 3.811295073782773),
 (-73.917843, 40.770258, 'Astoria Blvd', 4.308446487582657),
 (-73.921479, 40.766779, '30 Av', 4.547316187242545),
 (-73.925508, 40.76182, 'Broadway', 5.041404864864728),
 (-73.929575, 40.756804, '36 Av', 5.880039980794761),
 (-73.932755, 40.752882, '39 Av-Dutch Kills', 6.5142092807510865),
 (-73.940202, 40.750582, 'Queensboro Plaza', 6.506805224279235),
 (-73.967258, 40.76266, 'Lexington Av/59 St', 1),
 (-73.973347, 40.764811, '5 Av/59 St', 9.029061414768796),
 (-73.980658, 40.764664, '57 St-7 Av', 8.945167545202306),
 (-73.984139, 40.759901, '49 St', 8.83681241630772),
 (-73.986754, 40.754672, 'Times Sq-42 St', 8.425415042623767),
 (-73.98795, 40.749567, '34 St-Herald Sq', 8.437716199485127),
 (-73.988691, 40.745494, '28 St', 8.735463172185408),
 (-73.989344, 40.741303, '23 St', 9.167824403491688),
 (-73.990568, 40.735736, '14 St-Union Sq', 8.970487221533833),
 (-73.992629, 40.730328, '8 St-NYU', 9.36996211223294),


#### Fix Line B

In [58]:
line_stations["B"]

[(-73.887138, 40.873244, 'Bedford Park Blvd', 3.9264310232068595),
 (-73.893509, 40.866978, 'Kingsbridge Rd', 3.8153488232407873),
 (-73.897749, 40.861296, 'Fordham Rd', 4.11182456819048),
 (-73.900741, 40.856093, '182-183 Sts', 3.6693076221204977),
 (-73.905227, 40.85041, 'Tremont Av', 3.4212941844789087),
 (-73.910136, 40.8459, '174-175 Sts', 3.4693050096252662),
 (-73.9134, 40.839306, '170 St', 3.8044903095287657),
 (-73.91844, 40.833771, '167 St', 3.8760763031246688),
 (-73.938209, 40.830135, '155 St', 6.920664796575929),
 (-73.925651, 40.827905, '161 St-Yankee Stadium', 5.551308296643725),
 (-73.944216, 40.824783, '145 St', 6.514920991812318),
 (-73.947649, 40.817894, '135 St', 7.138622752534118),
 (-73.952343, 40.811109, '125 St', 6.498970641373971),
 (-73.954882, 40.805085, '116 St', 7.283514956247069),
 (-73.958161, 40.800603, 'Cathedral Pkwy (110 St)', 1),
 (-73.961454, 40.796092, '103 St', 7.285450885681039),
 (-73.964696, 40.791642, '96 St', 8.17726281111453),
 (-73.968916, 

In [59]:
# Fix the B Line
onefivefive = line_stations["B"][8]
yankee = line_stations["B"][9]

line_stations["B"][8] = yankee
line_stations["B"][9] = onefivefive

#### Fix E Line

In [60]:
line_stations["E"]

[(-73.981637, 40.762862, '7 Av', 9.096009864988636),
 (-73.985984, 40.762456, '50 St', 8.948870889203043),
 (-73.975224, 40.760167, '5 Av/53 St', 9.039389290775622),
 (-73.969055, 40.757552, 'Lexington Av/53 St', 1),
 (-73.989735, 40.757308, '42 St-Port Authority Bus Terminal', 1),
 (-73.993391, 40.752287, '34 St-Penn Station', 8.393406752920015),
 (-73.937243, 40.748973, 'Queens Plaza', 6.287005922131483),
 (-73.946, 40.747846, 'Court Sq-23 St', 1),
 (-73.891338, 40.746644, 'Jackson Hts-Roosevelt Av', 1),
 (-73.998041, 40.745906, '23 St', 9.050352966097611),
 (-74.00169, 40.740893, '14 St', 1),
 (-74.000495, 40.732338, 'W 4 St-Wash Sq', 9.267911376046527),
 (-74.003739, 40.726227, 'Spring St', 9.252189822035962),
 (-73.844521, 40.721691, 'Forest Hills-71 Av', 3.363920487638474),
 (-74.005229, 40.720824, 'Canal St', 8.989529430872205),
 (-73.837324, 40.718331, '75 Av', 3.5638212083081298),
 (-73.831008, 40.714441, 'Kew Gardens-Union Tpke', 3.0150185608586804),
 (-74.009781, 40.712582, 

In [61]:
len(line_stations["E"])

22

In [62]:
parsons = line_stations["E"][20] # Stop 0
jfk = line_stations["E"][21]     # Stop 1...
vanwyck = line_stations["E"][19]
briarwood = line_stations["E"][18]
kewgardens = line_stations["E"][16]
sevenfive = line_stations["E"][15]
foresthills = line_stations["E"][13]
rooseveltave = line_stations["E"][8]
queensplaza = line_stations["E"][6]
courtsq = line_stations["E"][7]
fiftythreelex = line_stations["E"][3]
fiftythreefifth = line_stations["E"][2]
seventhave = line_stations["E"][0]
fiftystreet = line_stations["E"][1]
portauth = line_stations["E"][4]
pennstation = line_stations["E"][5]
twentythree = line_stations["E"][9]
fourteen = line_stations["E"][10]
washsquare = line_stations["E"][11]
spring = line_stations["E"][12]
canal = line_stations["E"][14]
wtc = line_stations["E"][17]

line_stations["E"][0] = parsons
line_stations["E"][1] = jfk
line_stations["E"][2] = vanwyck
line_stations["E"][3] = briarwood
line_stations["E"][4] = kewgardens
line_stations["E"][5] = sevenfive
line_stations["E"][6] = foresthills
line_stations["E"][7] = rooseveltave
line_stations["E"][8] = queensplaza
line_stations["E"][9] = courtsq
line_stations["E"][10] = fiftythreelex
line_stations["E"][11] = fiftythreefifth
line_stations["E"][12] = seventhave
line_stations["E"][13] = fiftystreet
line_stations["E"][14] = portauth
line_stations["E"][15] = pennstation
line_stations["E"][16] = twentythree
line_stations["E"][17] = fourteen
line_stations["E"][18] = washsquare
line_stations["E"][19] = spring
line_stations["E"][20] = canal
line_stations["E"][21] = wtc

In [63]:
# Plot NYC boroughs

import json
import plotly.graph_objects as go
from collections import defaultdict

# Load NYC GeoJSON file
with open("Final_Data/borough.geo.json") as f:
    boroughs_geojson = json.load(f)

borough_names = [f["properties"]["BoroName"] for f in boroughs_geojson["features"]]
color_values = [1, 2, 3, 4, 5]  # dummy values

# Create base fig
fig2 = go.Figure()

fig2.add_trace(go.Choropleth(
    geojson=boroughs_geojson,
    locations=borough_names,
    z=color_values,
    featureidkey="properties.BoroName",
    colorscale=["lightgray", "darkgray"],
    marker_line_width=0.5,
    marker_line_color="white",
    showscale=False
))

# Update locations and layout
fig2.update_geos(
    fitbounds="locations",
    visible=False
)
fig2.update_layout(
    height=700,
    margin={"r":0,"t":0,"l":0,"b":0}
)

In [64]:
line_stations.items()

dict_items([('W', [(-73.912034, 40.775036, 'Astoria-Ditmars Blvd', 3.811295073782773), (-73.917843, 40.770258, 'Astoria Blvd', 4.308446487582657), (-73.921479, 40.766779, '30 Av', 4.547316187242545), (-73.925508, 40.76182, 'Broadway', 5.041404864864728), (-73.929575, 40.756804, '36 Av', 5.880039980794761), (-73.932755, 40.752882, '39 Av-Dutch Kills', 6.5142092807510865), (-73.940202, 40.750582, 'Queensboro Plaza', 6.506805224279235), (-73.967258, 40.76266, 'Lexington Av/59 St', 1), (-73.973347, 40.764811, '5 Av/59 St', 9.029061414768796), (-73.980658, 40.764664, '57 St-7 Av', 8.945167545202306), (-73.984139, 40.759901, '49 St', 8.83681241630772), (-73.986754, 40.754672, 'Times Sq-42 St', 8.425415042623767), (-73.98795, 40.749567, '34 St-Herald Sq', 8.437716199485127), (-73.988691, 40.745494, '28 St', 8.735463172185408), (-73.989344, 40.741303, '23 St', 9.167824403491688), (-73.990568, 40.735736, '14 St-Union Sq', 8.970487221533833), (-73.992629, 40.730328, '8 St-NYU', 9.36996211223294)

In [65]:
# Add subway lines and stations
for route, stations in line_stations.items():
    if not stations:
        continue

    lons = [s[0] for s in stations]
    lats = [s[1] for s in stations]
    names = [s[2] for s in stations]
    sizes = [s[3] for s in stations]

    # Subway line
    fig2.add_trace(go.Scattergeo(
        lon=lons, lat=lats,
        mode="lines",
        line=dict(width=2, color=mta_colors[route]),
        name=f"Line {route}"
    ))

    # Subway stations
    fig2.add_trace(go.Scattergeo(
        lon=lons, lat=lats,
        mode="markers",
        marker=dict(size=sizes, color=mta_colors[route]),
        showlegend = False
    ))

fig2.show()

## Add routes with least creative availability

In [66]:
least_avail = {
    "A": "#000000",
    "Z": "#000000",
    "N": "#000000", 
} 

for feature in stations_geojson["features"]:
    coords = feature["geometry"]["coordinates"]  # [lon, lat]
    props = feature["properties"]
    routes = props["daytime_routes"].split()

    for route in routes:
        if route in least_avail.keys():
            lon, lat = coords[0], coords[1]
            stop_name = props["stop_name"]
            size = 1
            try: 
                size = scores.filter( (col("stop_name") == stop_name) & (col("Route") == route)).select(col("median(avail_score)")).first()["median(avail_score)"]            
                size = size / 10.0
            except:
                size = 1
            val = (lon, lat, stop_name, size)

            if route in line_stations.keys():
                line_stations[route].append(val)
            else:
                line_stations[route] = [val]

### Fix Line Z

In [67]:
line_stations["Z"]

[(-73.828294, 40.700492, '121 St', 2.5331691809809826),
 (-73.84433, 40.695178, '104 St', 1.889710738630982),
 (-73.851576, 40.693879, 'Woodhaven Blvd', 1.7510227327848995),
 (-73.867139, 40.691324, '75 St-Elderts Ln', 2.8534168375547964),
 (-73.873785, 40.683194, 'Crescent St', 2.876580666913992),
 (-73.880039, 40.68141, 'Norwood Av', 3.375617451020399),
 (-73.891688, 40.678024, 'Van Siclen Av', 3.277315483517816),
 (-73.898654, 40.676992, 'Alabama Av', 1),
 (-73.904512, 40.679498, 'Broadway Junction', 3.976875063008483),
 (-73.910456, 40.682893, 'Chauncey St', 5.891317197187838),
 (-73.92227, 40.68963, 'Gates Av', 5.965619631390252),
 (-73.935657, 40.697207, 'Myrtle Av', 6.3726930987301005),
 (-73.957757, 40.708359, 'Marcy Av', 6.297225787930145),
 (-73.987437, 40.718315, 'Delancey St-Essex St', 1),
 (-73.993915, 40.72028, 'Bowery', 9.188753423292876),
 (-73.999892, 40.718092, 'Canal St', 8.805328535930043),
 (-74.003401, 40.713243, 'Chambers St', 1),
 (-74.007582, 40.710374, 'Fulton

In [68]:
parsons = line_stations["Z"][-2]
jfk = line_stations["Z"][-1]
line_stations["Z"].insert(0, parsons)
line_stations["Z"].insert(1, jfk)
line_stations["Z"].pop(-2)
line_stations["Z"].pop(-1)

(-73.807969, 40.700486, 'Sutphin Blvd-Archer Av-JFK Airport', 3.10712054633823)

In [69]:
line_stations["Z"]

[(-73.801109, 40.702147, 'Jamaica Center-Parsons/Archer', 2.6859282547818006),
 (-73.807969,
  40.700486,
  'Sutphin Blvd-Archer Av-JFK Airport',
  3.10712054633823),
 (-73.828294, 40.700492, '121 St', 2.5331691809809826),
 (-73.84433, 40.695178, '104 St', 1.889710738630982),
 (-73.851576, 40.693879, 'Woodhaven Blvd', 1.7510227327848995),
 (-73.867139, 40.691324, '75 St-Elderts Ln', 2.8534168375547964),
 (-73.873785, 40.683194, 'Crescent St', 2.876580666913992),
 (-73.880039, 40.68141, 'Norwood Av', 3.375617451020399),
 (-73.891688, 40.678024, 'Van Siclen Av', 3.277315483517816),
 (-73.898654, 40.676992, 'Alabama Av', 1),
 (-73.904512, 40.679498, 'Broadway Junction', 3.976875063008483),
 (-73.910456, 40.682893, 'Chauncey St', 5.891317197187838),
 (-73.92227, 40.68963, 'Gates Av', 5.965619631390252),
 (-73.935657, 40.697207, 'Myrtle Av', 6.3726930987301005),
 (-73.957757, 40.708359, 'Marcy Av', 6.297225787930145),
 (-73.987437, 40.718315, 'Delancey St-Essex St', 1),
 (-73.993915, 40.720

### Fix Line N

In [70]:
line_stations["N"]

[(-73.912034, 40.775036, 'Astoria-Ditmars Blvd', 3.811295073782773),
 (-73.917843, 40.770258, 'Astoria Blvd', 4.308446487582657),
 (-73.921479, 40.766779, '30 Av', 4.547316187242545),
 (-73.925508, 40.76182, 'Broadway', 5.041404864864728),
 (-73.929575, 40.756804, '36 Av', 5.880039980794761),
 (-73.932755, 40.752882, '39 Av-Dutch Kills', 6.5142092807510865),
 (-73.967258, 40.76266, 'Lexington Av/59 St', 1),
 (-73.973347, 40.764811, '5 Av/59 St', 9.029061414768796),
 (-73.980658, 40.764664, '57 St-7 Av', 8.945167545202306),
 (-73.984139, 40.759901, '49 St', 8.83681241630772),
 (-73.986754, 40.754672, 'Times Sq-42 St', 8.425415042623767),
 (-73.98795, 40.749567, '34 St-Herald Sq', 8.437716199485127),
 (-73.990568, 40.735736, '14 St-Union Sq', 8.970487221533833),
 (-74.00046, 40.718383, 'Canal St', 8.805328535930043),
 (-73.97881, 40.683666, 'Atlantic Av-Barclays Ctr', 7.117495477196857),
 (-74.003549, 40.655144, '36 St', 5.308374636532221),
 (-74.017881, 40.641362, '59 St', 3.25924445520

In [71]:
len(line_stations["N"])

28

In [72]:
queensboro = line_stations["N"][-1]
line_stations["N"].insert(6, queensboro)
line_stations["N"].pop(-1)

coneyisland = line_stations["N"][18]
line_stations["N"].pop(18)
line_stations["N"].append(coneyisland)

In [73]:
line_stations["N"]

[(-73.912034, 40.775036, 'Astoria-Ditmars Blvd', 3.811295073782773),
 (-73.917843, 40.770258, 'Astoria Blvd', 4.308446487582657),
 (-73.921479, 40.766779, '30 Av', 4.547316187242545),
 (-73.925508, 40.76182, 'Broadway', 5.041404864864728),
 (-73.929575, 40.756804, '36 Av', 5.880039980794761),
 (-73.932755, 40.752882, '39 Av-Dutch Kills', 6.5142092807510865),
 (-73.940202, 40.750582, 'Queensboro Plaza', 6.506805224279235),
 (-73.967258, 40.76266, 'Lexington Av/59 St', 1),
 (-73.973347, 40.764811, '5 Av/59 St', 9.029061414768796),
 (-73.980658, 40.764664, '57 St-7 Av', 8.945167545202306),
 (-73.984139, 40.759901, '49 St', 8.83681241630772),
 (-73.986754, 40.754672, 'Times Sq-42 St', 8.425415042623767),
 (-73.98795, 40.749567, '34 St-Herald Sq', 8.437716199485127),
 (-73.990568, 40.735736, '14 St-Union Sq', 8.970487221533833),
 (-74.00046, 40.718383, 'Canal St', 8.805328535930043),
 (-73.97881, 40.683666, 'Atlantic Av-Barclays Ctr', 7.117495477196857),
 (-74.003549, 40.655144, '36 St', 5.

In [74]:
for route, stations in line_stations.items():
    if not stations:
        continue
    if route in mta_colors.keys():
        continue

    lons = [s[0] for s in stations]
    lats = [s[1] for s in stations]
    names = [s[2] for s in stations]
    sizes = [s[3] for s in stations]

    # Subway line
    fig2.add_trace(go.Scattergeo(
        lon=lons, lat=lats,
        mode="lines",
        line=dict(width=2, color=least_avail[route], dash = "dash"),
        name=f"Line {route}",
        opacity = 0.3
    ))

    # Subway stations
    fig2.add_trace(go.Scattergeo(
        lon=lons, lat=lats,
        mode="markers",
        marker=dict(size=sizes, color=least_avail[route]),
        showlegend = False,
        text=[s[2] for s in stations],  # stop names
        hoverinfo="text",               # only show text
        opacity = 0.3
    ))

In [75]:
fig2.show()