In [2]:
# start a Spark session
import pyspark

session = pyspark.sql.SparkSession
# use all available memory for the driver
spark = session.builder.config('spark.driver.memory','10g').getOrCreate()
spark

In [3]:
# Set dataset paths
stations_path = "mta_open_data/station_by_line.csv"

# Function to read in CSV files
def read_data(path):
    df = spark.read\
              .option("header", True)\
              .csv(path)
    df.printSchema()
    return df

In [4]:
stations_by_line = read_data(stations_path)

root
 |-- GTFS Stop ID: string (nullable = true)
 |-- Division: string (nullable = true)
 |-- Stop Name: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- CBD: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- GTFS Latitude: string (nullable = true)
 |-- GTFS Longitude: string (nullable = true)



In [5]:
stations_by_line.limit(5).toPandas()

Unnamed: 0,GTFS Stop ID,Division,Stop Name,Borough,CBD,Route,GTFS Latitude,GTFS Longitude
0,R01,BMT,Astoria-Ditmars Blvd,Q,False,N,40.775036,-73.912034
1,R01,BMT,Astoria-Ditmars Blvd,Q,False,W,40.775036,-73.912034
2,R03,BMT,Astoria Blvd,Q,False,N,40.770258,-73.917843
3,R03,BMT,Astoria Blvd,Q,False,W,40.770258,-73.917843
4,R04,BMT,30 Av,Q,False,N,40.766779,-73.921479


In [6]:
stations_by_line = stations_by_line.withColumnRenamed("Stop Name", "StopName")

In [7]:
import json
# Get subway ridership data
ridership_path = "mta_open_data/subway_ridership_sample_2025.json"

riders = spark.read.json(ridership_path)
riders.printSchema()

root
 |-- station_complex: string (nullable = true)
 |-- total_ridership: string (nullable = true)
 |-- transit_date: string (nullable = true)



In [8]:
riders.limit(5).toPandas()

Unnamed: 0,station_complex,total_ridership,transit_date
0,103 St (1),8693.0,2025-01-07T00:00:00.000
1,103 St (6),9483.0,2025-01-07T00:00:00.000
2,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000
3,103 St-Corona Plaza (7),18261.0,2025-01-07T00:00:00.000
4,104 St (A),1238.0,2025-01-07T00:00:00.000


In [9]:
from pyspark.sql.functions import col, expr, udf
from pyspark.sql.types import ArrayType, StringType

# UDF to split station name and routes
def get_station(name):
    return name.split("(")[0].strip()

def get_routes(name):
    return name.split("(")[1].strip(")").split(",")

udf_get_station = udf(get_station)
udf_get_routes = udf(get_routes, returnType = ArrayType(StringType())) ## Must return array type to explode

In [10]:
riders = riders.withColumn("station", udf_get_station(col("station_complex")))\
               .withColumn("routes", udf_get_routes(col("station_complex")))

In [11]:
riders.printSchema()

root
 |-- station_complex: string (nullable = true)
 |-- total_ridership: string (nullable = true)
 |-- transit_date: string (nullable = true)
 |-- station: string (nullable = true)
 |-- routes: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [12]:
riders.limit(5).toPandas()

Unnamed: 0,station_complex,total_ridership,transit_date,station,routes
0,103 St (1),8693.0,2025-01-07T00:00:00.000,103 St,[1]
1,103 St (6),9483.0,2025-01-07T00:00:00.000,103 St,[6]
2,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,"[C, B]"
3,103 St-Corona Plaza (7),18261.0,2025-01-07T00:00:00.000,103 St-Corona Plaza,[7]
4,104 St (A),1238.0,2025-01-07T00:00:00.000,104 St,[A]


In [13]:
from pyspark.sql.functions import explode
riders = riders.withColumn("routes", explode(col("routes")))

In [14]:
riders.limit(5).toPandas()

Unnamed: 0,station_complex,total_ridership,transit_date,station,routes
0,103 St (1),8693.0,2025-01-07T00:00:00.000,103 St,1
1,103 St (6),9483.0,2025-01-07T00:00:00.000,103 St,6
2,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,C
3,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,B
4,103 St-Corona Plaza (7),18261.0,2025-01-07T00:00:00.000,103 St-Corona Plaza,7


In [15]:
## Convert back to string type for join
riders = riders.withColumn("routes", col("routes").cast(StringType()))

In [16]:
riders.printSchema()

root
 |-- station_complex: string (nullable = true)
 |-- total_ridership: string (nullable = true)
 |-- transit_date: string (nullable = true)
 |-- station: string (nullable = true)
 |-- routes: string (nullable = true)



In [17]:
riders.limit(5).toPandas()

Unnamed: 0,station_complex,total_ridership,transit_date,station,routes
0,103 St (1),8693.0,2025-01-07T00:00:00.000,103 St,1
1,103 St (6),9483.0,2025-01-07T00:00:00.000,103 St,6
2,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,C
3,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,B
4,103 St-Corona Plaza (7),18261.0,2025-01-07T00:00:00.000,103 St-Corona Plaza,7


In [18]:
crowdedness = riders.join(stations_by_line, 
                          on=[
                          riders.station == stations_by_line.StopName,
                          riders.routes == stations_by_line.Route
                          ], how='inner')

In [19]:
crowdedness.printSchema()

root
 |-- station_complex: string (nullable = true)
 |-- total_ridership: string (nullable = true)
 |-- transit_date: string (nullable = true)
 |-- station: string (nullable = true)
 |-- routes: string (nullable = true)
 |-- GTFS Stop ID: string (nullable = true)
 |-- Division: string (nullable = true)
 |-- StopName: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- CBD: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- GTFS Latitude: string (nullable = true)
 |-- GTFS Longitude: string (nullable = true)



In [20]:
crowdedness.limit(5).toPandas()

Unnamed: 0,station_complex,total_ridership,transit_date,station,routes,GTFS Stop ID,Division,StopName,Borough,CBD,Route,GTFS Latitude,GTFS Longitude
0,103 St (1),8693.0,2025-01-07T00:00:00.000,103 St,1,119,IRT,103 St,M,False,1,40.799446,-73.968379
1,103 St (6),9483.0,2025-01-07T00:00:00.000,103 St,6,624,IRT,103 St,M,False,6,40.7906,-73.947478
2,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,C,A18,IND,103 St,M,False,C,40.796092,-73.961454
3,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,B,A18,IND,103 St,M,False,B,40.796092,-73.961454
4,103 St-Corona Plaza (7),18261.0,2025-01-07T00:00:00.000,103 St-Corona Plaza,7,706,IRT,103 St-Corona Plaza,Q,False,7,40.749865,-73.8627


In [21]:
columns = ["station_complex", "total_ridership", "transit_date", "station", "Route", 
           "GTFS Stop ID", "Division", "Borough", "CBD", "GTFS Latitude", "GTFS Longitude"]
crowdedness = crowdedness.select(columns)

In [22]:
# Save file
# crowdedness.write.csv('Sample_Crowdedness',header=True)

## Join crowdedness data with Google Places data

In [23]:
# Get crowdedness data
crowd_path = "mta_open_data/crowdedness_sample_2025.csv"

crowdedness = read_data(crowd_path)

root
 |-- station_complex: string (nullable = true)
 |-- total_ridership: string (nullable = true)
 |-- transit_date: string (nullable = true)
 |-- station: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- GTFS Stop ID: string (nullable = true)
 |-- Division: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- CBD: string (nullable = true)
 |-- GTFS Latitude: string (nullable = true)
 |-- GTFS Longitude: string (nullable = true)



In [24]:
crowdedness.limit(5).toPandas()

Unnamed: 0,station_complex,total_ridership,transit_date,station,Route,GTFS Stop ID,Division,Borough,CBD,GTFS Latitude,GTFS Longitude
0,103 St (1),8693.0,2025-01-07T00:00:00.000,103 St,1,119,IRT,M,False,40.799446,-73.968379
1,103 St (6),9483.0,2025-01-07T00:00:00.000,103 St,6,624,IRT,M,False,40.7906,-73.947478
2,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,C,A18,IND,M,False,40.796092,-73.961454
3,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,B,A18,IND,M,False,40.796092,-73.961454
4,103 St-Corona Plaza (7),18261.0,2025-01-07T00:00:00.000,103 St-Corona Plaza,7,706,IRT,Q,False,40.749865,-73.8627


In [25]:
import json

# Get Google Places API data (cultural destinations)
places_path = "places_api_data/outputFile_converted.json"

places = spark.read.json(places_path)
places.printSchema()

root
 |-- Coordinates: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- Nearby: struct (nullable = true)
 |    |-- art_gallery: struct (nullable = true)
 |    |    |-- places: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- display_name: struct (nullable = true)
 |    |    |    |    |    |-- language_code: string (nullable = true)
 |    |    |    |    |    |-- text: string (nullable = true)
 |    |    |    |    |-- formatted_address: string (nullable = true)
 |    |    |    |    |-- location: struct (nullable = true)
 |    |    |    |    |    |-- latitude: double (nullable = true)
 |    |    |    |    |    |-- longitude: double (nullable = true)
 |    |-- art_studio: struct (nullable = true)
 |    |    |-- places: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- display_name: struct (nullable = true)
 |    |    |    |    |    |-- language_code

In [26]:
places.limit(5).show()

+--------------------+--------------------+---------------+--------------------+
|         Coordinates|              Nearby|   Station Name|                name|
+--------------------+--------------------+---------------+--------------------+
|[40.512764, -74.2...|{{[{{en, Cape Hou...|    Tottenville|(40.512764, -74.2...|
|[40.516578, -74.2...|{{[{{en, Cape Hou...|    Arthur Kill|(40.516578, -74.2...|
|[40.519631, -74.2...|{{NULL}, {[{{en, ...|Richmond Valley|(40.519631, -74.2...|
|[40.52241, -74.21...|{{NULL}, {[{{en, ...|Pleasant Plains|(40.52241, -74.21...|
|[40.525507, -74.2...|{{NULL}, {NULL}, ...|   Prince's Bay|(40.525507, -74.2...|
+--------------------+--------------------+---------------+--------------------+



In [27]:
## Note: places data includes SIR stations, crowdedness does not

## Create new coordinates column in crowdedness dataset

In [28]:
from pyspark.sql.functions import udf

# Function to create string of coordinates
def get_str_coord(lat, lng):
    return "(" + lat + ", " + lng + ")"

udf_str_coord = udf(get_str_coord)

In [29]:
crowds = crowdedness.withColumn("str_Coordinates", udf_str_coord("GTFS Latitude", "GTFS Longitude"))

In [30]:
crowds.limit(5).toPandas()

Unnamed: 0,station_complex,total_ridership,transit_date,station,Route,GTFS Stop ID,Division,Borough,CBD,GTFS Latitude,GTFS Longitude,str_Coordinates
0,103 St (1),8693.0,2025-01-07T00:00:00.000,103 St,1,119,IRT,M,False,40.799446,-73.968379,"(40.799446, -73.968379)"
1,103 St (6),9483.0,2025-01-07T00:00:00.000,103 St,6,624,IRT,M,False,40.7906,-73.947478,"(40.7906, -73.947478)"
2,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,C,A18,IND,M,False,40.796092,-73.961454,"(40.796092, -73.961454)"
3,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,B,A18,IND,M,False,40.796092,-73.961454,"(40.796092, -73.961454)"
4,103 St-Corona Plaza (7),18261.0,2025-01-07T00:00:00.000,103 St-Corona Plaza,7,706,IRT,Q,False,40.749865,-73.8627,"(40.749865, -73.8627)"


In [45]:
final_df = crowds.join(places, 
                       on = crowds.str_Coordinates == places.name,
                       how = 'inner')

In [46]:
final_df.count()

13467

In [47]:
crowds.count()

13467

In [48]:
len(final_df.columns)

16

In [49]:
final_df.orderBy(col("station_complex")).where(col("station").contains("103 St") & col("transit_date").contains("2025-01-07")).limit(5).toPandas()

Unnamed: 0,station_complex,total_ridership,transit_date,station,Route,GTFS Stop ID,Division,Borough,CBD,GTFS Latitude,GTFS Longitude,str_Coordinates,Coordinates,Nearby,Station Name,name
0,103 St (1),8693.0,2025-01-07T00:00:00.000,103 St,1,119,IRT,M,False,40.799446,-73.968379,"(40.799446, -73.968379)","[40.799446, -73.968379]","(([Row(display_name=Row(language_code='en', te...",103 St,"(40.799446, -73.968379)"
1,103 St (6),9483.0,2025-01-07T00:00:00.000,103 St,6,624,IRT,M,False,40.7906,-73.947478,"(40.7906, -73.947478)","[40.7906, -73.947478]","(([Row(display_name=Row(language_code='en', te...",103 St,"(40.7906, -73.947478)"
2,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,B,A18,IND,M,False,40.796092,-73.961454,"(40.796092, -73.961454)","[40.796092, -73.961454]","(([Row(display_name=Row(language_code='en', te...",103 St,"(40.796092, -73.961454)"
3,"103 St (C,B)",2978.0,2025-01-07T00:00:00.000,103 St,C,A18,IND,M,False,40.796092,-73.961454,"(40.796092, -73.961454)","[40.796092, -73.961454]","(([Row(display_name=Row(language_code='en', te...",103 St,"(40.796092, -73.961454)"
4,103 St-Corona Plaza (7),18261.0,2025-01-07T00:00:00.000,103 St-Corona Plaza,7,706,IRT,Q,False,40.749865,-73.8627,"(40.749865, -73.8627)","[40.749865, -73.8627]","(([Row(display_name=Row(language_code='en', te...",103 St-Corona Plaza,"(40.749865, -73.8627)"


In [50]:
final_df.printSchema()

root
 |-- station_complex: string (nullable = true)
 |-- total_ridership: string (nullable = true)
 |-- transit_date: string (nullable = true)
 |-- station: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- GTFS Stop ID: string (nullable = true)
 |-- Division: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- CBD: string (nullable = true)
 |-- GTFS Latitude: string (nullable = true)
 |-- GTFS Longitude: string (nullable = true)
 |-- str_Coordinates: string (nullable = true)
 |-- Coordinates: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- Nearby: struct (nullable = true)
 |    |-- art_gallery: struct (nullable = true)
 |    |    |-- places: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- display_name: struct (nullable = true)
 |    |    |    |    |    |-- language_code: string (nullable = true)
 |    |    |    |    |    |-- text: string (nullable = true)
 |    |  

In [51]:
columns = ["station_complex", "station", "Route", "Borough", "GTFS Stop ID", 
           "Division", "CBD", "total_ridership", "transit_date", "str_Coordinates", 
           "Coordinates", "Nearby"]

final_df = final_df.select(columns)

In [52]:
final_df.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,Coordinates,Nearby
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)","[40.576127, -73.975939]","(([Row(display_name=Row(language_code='en', te..."
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)","[40.576127, -73.975939]","(([Row(display_name=Row(language_code='en', te..."
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)","[40.576127, -73.975939]","(([Row(display_name=Row(language_code='en', te..."
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)","[40.576127, -73.975939]","(([Row(display_name=Row(language_code='en', te..."
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)","[40.576127, -73.975939]","(([Row(display_name=Row(language_code='en', te..."


## Explode Nearby column from JSON to individual columns
- NOTE: don't need `from_json` because that function takes a string rather than parsed JSON

In [53]:
places.printSchema()

root
 |-- Coordinates: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- Nearby: struct (nullable = true)
 |    |-- art_gallery: struct (nullable = true)
 |    |    |-- places: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- display_name: struct (nullable = true)
 |    |    |    |    |    |-- language_code: string (nullable = true)
 |    |    |    |    |    |-- text: string (nullable = true)
 |    |    |    |    |-- formatted_address: string (nullable = true)
 |    |    |    |    |-- location: struct (nullable = true)
 |    |    |    |    |    |-- latitude: double (nullable = true)
 |    |    |    |    |    |-- longitude: double (nullable = true)
 |    |-- art_studio: struct (nullable = true)
 |    |    |-- places: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- display_name: struct (nullable = true)
 |    |    |    |    |    |-- language_code

In [54]:
from pyspark.sql.functions import size

# Get count of places
final_df = final_df.withColumn('art_gallery', size(final_df['Nearby']['art_gallery']['places']))\
                   .withColumn('art_studio', size(final_df['Nearby']['art_studio']['places']))\
                   .withColumn('auditorium', size(final_df['Nearby']['auditorium']['places']))\
                   .withColumn('cultural_landmark', size(final_df['Nearby']['cultural_landmark']['places']))\
                   .withColumn('historical_place', size(final_df['Nearby']['historical_place']['places']))\
                   .withColumn('monument', size(final_df['Nearby']['monument']['places']))\
                   .withColumn('museum', size(final_df['Nearby']['museum']['places']))\
                   .withColumn('performing_arts_theater', size(final_df['Nearby']['performing_arts_theater']['places']))\
                   .withColumn('sculpture', size(final_df['Nearby']['sculpture']['places']))

In [55]:
final_df.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,...,Nearby,art_gallery,art_studio,auditorium,cultural_landmark,historical_place,monument,museum,performing_arts_theater,sculpture
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,"(([Row(display_name=Row(language_code='en', te...",5,2,-1,1,5,1,2,5,3
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",...,"(([Row(display_name=Row(language_code='en', te...",5,2,-1,1,5,1,2,5,3
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,"(([Row(display_name=Row(language_code='en', te...",5,2,-1,1,5,1,2,5,3
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",...,"(([Row(display_name=Row(language_code='en', te...",5,2,-1,1,5,1,2,5,3
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",...,"(([Row(display_name=Row(language_code='en', te...",5,2,-1,1,5,1,2,5,3


**NOTE:** NULL = count is -1

In [56]:
final_df = final_df.drop("Nearby", "Coordinates") # Drop redundant, complex type columns
final_df.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,art_gallery,art_studio,auditorium,cultural_landmark,historical_place,monument,museum,performing_arts_theater,sculpture
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",5,2,-1,1,5,1,2,5,3
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",5,2,-1,1,5,1,2,5,3
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",5,2,-1,1,5,1,2,5,3
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",5,2,-1,1,5,1,2,5,3
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",5,2,-1,1,5,1,2,5,3


In [186]:
# Source: https://stackoverflow.com/questions/38611418/writing-a-csv-with-column-names-and-reading-a-csv-file-which-is-being-generated
# Save data as CSV file
final_df.write.csv('Joined_Data',header=True)

In [57]:
# Set dataset paths
path = "final_joined_data.csv"

# Function to read in CSV files
def read_data(path):
    df = spark.read\
              .option("header", True)\
              .csv(path)
    df.printSchema()
    return df

In [58]:
final_df = read_data(path)
final_df.limit(5).toPandas()

root
 |-- station_complex: string (nullable = true)
 |-- station: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- GTFS Stop ID: string (nullable = true)
 |-- Division: string (nullable = true)
 |-- CBD: string (nullable = true)
 |-- total_ridership: string (nullable = true)
 |-- transit_date: string (nullable = true)
 |-- str_Coordinates: string (nullable = true)
 |-- cultural_landmark: string (nullable = true)
 |-- historical_place: string (nullable = true)
 |-- monument: string (nullable = true)
 |-- museum: string (nullable = true)
 |-- performing_arts_theater: string (nullable = true)
 |-- sculpture: string (nullable = true)
 |-- art_gallery: string (nullable = true)
 |-- art_studio: string (nullable = true)
 |-- auditorium: string (nullable = true)



Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,cultural_landmark,historical_place,monument,museum,performing_arts_theater,sculpture,art_gallery,art_studio,auditorium
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0


In [59]:
from pyspark.sql.functions import col, when

## Replace -1 with 0 (-1 meant not found)
columns = ["art_gallery", "art_studio", "auditorium", "cultural_landmark", "historical_place", 
           "monument", "museum", "performing_arts_theater", "sculpture"]

for column in columns:
    final_df = final_df.withColumn(column, when((col(column) == -1), 0).otherwise(col(column)))

In [60]:
final_df.limit(5).toPandas()

Unnamed: 0,station_complex,station,Route,Borough,GTFS Stop ID,Division,CBD,total_ridership,transit_date,str_Coordinates,cultural_landmark,historical_place,monument,museum,performing_arts_theater,sculpture,art_gallery,art_studio,auditorium
0,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
1,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2310.0,2025-07-10T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
2,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
3,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,F,Bk,D42,BMT,False,2854.0,2025-07-09T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
4,"W 8 St-NY Aquarium (F,Q)",W 8 St-NY Aquarium,Q,Bk,D42,BMT,False,3430.0,2025-07-05T00:00:00.000,"(40.576127, -73.975939)",1.0,5.0,1.0,2.0,5.0,3.0,5.0,2.0,0.0
