In [1]:
bucket = spark._jsc.hadoopConfiguration().get("fs.gs.system.bucket")
url = "gs://" + bucket + "/data/chicago-taxi-trip/cleaned-chicago-taxi/"

In [2]:
from pyspark.sql.types import (StructType, 
                               StructField, 
                               DoubleType,
                               IntegerType,
                               StringType,
                               TimestampType)

taxi_schema = StructType([StructField('unique_key', StringType(), True),
                StructField('taxi_id', StringType(), True),
                StructField('trip_start_timestamp', TimestampType(), True), 
                StructField('trip_end_timestamp', TimestampType(), True), 
                StructField('trip_seconds', IntegerType(), True), 
                StructField('trip_miles', IntegerType(), True), 
                StructField('pickup_community_area', IntegerType(), True), 
                StructField('dropoff_community_area', IntegerType(), True), 
                StructField('fare', IntegerType(), True), 
                StructField('tips', IntegerType(), True), 
                StructField('tolls', IntegerType(), True), 
                StructField('extras', IntegerType(), True), 
                StructField('trip_total', IntegerType(), True), 
                StructField('payment_type', StringType(), True), 
                StructField('company', StringType(), True), 
                StructField('pickup_latitude', StringType(), True), 
                StructField('pickup_longitude', StringType(), True), 
                StructField('pickup_location', StringType(), True), 
                StructField('dropoff_latitude', StringType(), True), 
                StructField('dropoff_longitude', StringType(), True), 
                StructField('dropoff_location', StringType(), True)])

taxi = spark.read.format("csv").option("header", "true").schema(taxi_schema).csv(url)

In [3]:
taxi.cache()

DataFrame[unique_key: string, taxi_id: string, trip_start_timestamp: timestamp, trip_end_timestamp: timestamp, trip_seconds: int, trip_miles: int, pickup_census_tract: string, dropoff_census_tract: string, pickup_community_area: int, dropoff_community_area: int, fare: int, tips: int, tolls: int, extras: int, trip_total: int, payment_type: string, company: string, pickup_latitude: string, pickup_longitude: string, pickup_location: string, dropoff_latitude: string, dropoff_longitude: string, dropoff_location: string]

In [4]:
taxi.count()

                                                                                

20352161

In [7]:
taxi.limit(5).toPandas()

Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location
0,fab9854c1658fa94bdbc2cb4f560586c1c0cc97b,fb0ce19e30e712c77c57cfdb6ef729c2d2ad73225d9ec3...,2021-04-11 15:15:00,2021-04-11 15:30:00,,8,28,,0.0,0,...,,,-87.633308037,POINT (-87.6333080367 41.899602111),41.874005383,-87.66351755,POINT (-87.6635175498 41.874005383),11,,
1,f5274bb8426501bce7abecc9e02e6d2c9b434108,175a58299af3a8c883453259b9a0cfc45ccbbbe1a07883...,2021-04-18 01:00:00,2021-04-18 01:15:00,,8,28,,,0,...,,,-87.633308037,POINT (-87.6333080367 41.899602111),41.874005383,-87.66351755,POINT (-87.6635175498 41.874005383),16,,
2,eeee3d40ecc76c0d2a3eab86541f3b9d573b3b05,72acb561bc3ff52febfb74f963b7e3a637c4a5de6f1360...,2021-04-05 22:45:00,2021-04-05 23:00:00,,8,28,,,0,...,,,-87.633308037,POINT (-87.6333080367 41.899602111),41.874005383,-87.66351755,POINT (-87.6635175498 41.874005383),10,,
3,fa294ff40f36a5e72bcd349a1303bd9a30c253bc,f45c4bfa9d6c445eb03ab69093f7ec1e9cdecef83cb2ce...,2021-04-29 20:30:00,2021-04-29 20:45:00,,8,28,9.0,0.0,0,...,,,-87.633308037,POINT (-87.6333080367 41.899602111),41.874005383,-87.66351755,POINT (-87.6635175498 41.874005383),12,,
4,f714fd0ef05f1ff3e09203d1d216ea7b015d4ced,322630536b6fae63312b46cb1ddab0df9493b49a3eb5de...,2021-04-11 18:15:00,2021-04-11 18:30:00,,8,28,9.0,0.0,0,...,,,-87.633308037,POINT (-87.6333080367 41.899602111),41.874005383,-87.66351755,POINT (-87.6635175498 41.874005383),12,,


In [8]:
taxi.describe().toPandas()

                                                                                

Unnamed: 0,summary,unique_key,taxi_id,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location
0,count,20352161,20352161,2829328.0,20352161.0,20352161.0,5214529.0,15914800.0,16794907.0,19161717.0,...,0.0,0.0,17827660.0,17827660,17333531.0,17333531.0,17333531,20352161.0,0.0,0.0
1,mean,,,1.9476380963960347,26.07434694527033,21.03200981949779,17.394083147298634,0.9409646995249704,0.0048901134135485,1.220027307573742,...,,,-87.6724718774177,,41.88627384287557,-87.64957851664848,,17.208450051078113,,
2,stddev,,,5.370967954923643,23.63778221264003,19.07993369795151,24.61500310181596,2.1763812761486228,0.4702432127410074,5.496590806089561,...,,,0.0927052730474983,,0.0493594264549918,0.0571892988310892,,26.80213265681773,,
3,min,000001bde866596a38db452f082a0ed75b4829b1,0008de7a146802839c9e6059f482d292ebdae13c5c31dd...,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,...,,,-87.530712484,POINT (-87.5307124836 41.7030053028),41.650221676,-87.531386257,POINT (-87.5313862567 41.7204632831),1.0,,
4,max,ffffff1aae5322736637e16dd2faecb5dfebe81a,fff84aa08ac78890c6e7da64b817cbd9aad6a124104e09...,950.0,77.0,9.0,99.0,500.0,1000.0,9555.0,...,,,-87.913624596,POINT (-87.913624596 41.9802643146),42.021223593,-87.913624596,POINT (-87.913624596 41.9802643146),999.0,,


In [9]:
from pyspark.sql.functions import col, sum
null_counts = taxi.select([sum(col(column).isNull().cast("int")).alias(column) for column in taxi.columns]).toPandas()

                                                                                

In [14]:
null_counts.toPandas()

                                                                                

Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location
0,0,0,0,0,17522833,0,0,15137632,4437361,3557254,...,20352161,20352161,2524501,2524501,3018630,3018630,3018630,0,20352161,20352161
