In [1]:
bucket = spark._jsc.hadoopConfiguration().get("fs.gs.system.bucket")
url = "gs://" + bucket + "/data/chicago-taxi-trip/cleaned-chicago-taxi/"

In [2]:
from pyspark.sql.types import (StructType, 
                               StructField, 
                               DoubleType,
                               IntegerType,
                               StringType,
                               TimestampType)

taxi_schema = StructType([StructField('unique_key', StringType(), True),
                StructField('taxi_id', StringType(), True),
                StructField('trip_start_timestamp', TimestampType(), True), 
                StructField('trip_end_timestamp', TimestampType(), True), 
                StructField('trip_seconds', IntegerType(), True), 
                StructField('trip_miles', IntegerType(), True), 
                StructField('pickup_census_tract', StringType(), True), 
                StructField('dropoff_census_tract', StringType(), True), 
                StructField('pickup_community_area', IntegerType(), True), 
                StructField('dropoff_community_area', IntegerType(), True), 
                StructField('fare', IntegerType(), True), 
                StructField('tips', IntegerType(), True), 
                StructField('tolls', IntegerType(), True), 
                StructField('extras', IntegerType(), True), 
                StructField('trip_total', IntegerType(), True), 
                StructField('payment_type', StringType(), True), 
                StructField('company', StringType(), True), 
                StructField('pickup_latitude', StringType(), True), 
                StructField('pickup_longitude', StringType(), True), 
                StructField('pickup_location', StringType(), True), 
                StructField('dropoff_latitude', StringType(), True), 
                StructField('dropoff_longitude', StringType(), True), 
                StructField('dropoff_location', StringType(), True)])

taxi = spark.read.format("csv").option("header", "true").schema(taxi_schema).csv(url)

In [13]:
taxi.cache()

DataFrame[unique_key: string, taxi_id: string, trip_start_timestamp: timestamp, trip_end_timestamp: timestamp, trip_seconds: int, trip_miles: int, pickup_census_tract: string, dropoff_census_tract: string, pickup_community_area: int, dropoff_community_area: int, fare: int, tips: int, tolls: int, extras: int, trip_total: int, payment_type: string, company: string, pickup_latitude: string, pickup_longitude: string, pickup_location: string, dropoff_latitude: string, dropoff_longitude: string, dropoff_location: string]

In [3]:
taxi.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+-------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+----+-----+------+----------+-------------+--------------------+---------------+----------------+--------------------+----------------+-----------------+----------------+
|          unique_key|             taxi_id|trip_start_timestamp| trip_end_timestamp|trip_seconds|trip_miles|pickup_census_tract|dropoff_census_tract|pickup_community_area|dropoff_community_area|fare|tips|tolls|extras|trip_total| payment_type|             company|pickup_latitude|pickup_longitude|     pickup_location|dropoff_latitude|dropoff_longitude|dropoff_location|
+--------------------+--------------------+--------------------+-------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+----+-----+------+----------+-------------+------------------

                                                                                

In [4]:
taxi.describe().show()

[Stage 3:>                                                          (0 + 1) / 1]

+-------+--------------------+--------------------+------------------+------------------+-------------------+--------------------+---------------------+----------------------+-----------------+------------------+-----+------+----------+-------------------+--------------------+--------------------+-------------------+--------------------+------------------+-----------------+----------------+
|summary|          unique_key|             taxi_id|      trip_seconds|        trip_miles|pickup_census_tract|dropoff_census_tract|pickup_community_area|dropoff_community_area|             fare|              tips|tolls|extras|trip_total|       payment_type|             company|     pickup_latitude|   pickup_longitude|     pickup_location|  dropoff_latitude|dropoff_longitude|dropoff_location|
+-------+--------------------+--------------------+------------------+------------------+-------------------+--------------------+---------------------+----------------------+-----------------+------------------+

                                                                                

In [5]:
taxi.describe().toPandas()

                                                                                

Unnamed: 0,summary,unique_key,taxi_id,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location
0,count,20352161,20352161,2829328.0,20352161.0,20352161.0,5214529.0,15914800.0,16794907.0,19161717.0,...,0.0,0.0,17827660.0,17827660,17333531.0,17333531.0,17333531,20352161.0,0.0,0.0
1,mean,,,1.9476380963960347,26.07434694527033,21.03200981949779,17.394083147298634,0.9409646995249704,0.0048901134135485,1.220027307573742,...,,,-87.6724718774177,,41.88627384287559,-87.64957851664845,,17.208450051078113,,
2,stddev,,,5.370967954923644,23.637782212640044,19.07993369795151,24.61500310181596,2.176381276148622,0.4702432127410075,5.496590806089562,...,,,0.0927052730474986,,0.0493594264549925,0.057189298831089,,26.80213265681773,,
3,min,000001bde866596a38db452f082a0ed75b4829b1,0008de7a146802839c9e6059f482d292ebdae13c5c31dd...,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,...,,,-87.530712484,POINT (-87.5307124836 41.7030053028),41.650221676,-87.531386257,POINT (-87.5313862567 41.7204632831),1.0,,
4,max,ffffff1aae5322736637e16dd2faecb5dfebe81a,fff84aa08ac78890c6e7da64b817cbd9aad6a124104e09...,950.0,77.0,9.0,99.0,500.0,1000.0,9555.0,...,,,-87.913624596,POINT (-87.913624596 41.9802643146),42.021223593,-87.913624596,POINT (-87.913624596 41.9802643146),999.0,,


In [12]:
from pyspark.sql.functions import col, sum
null_counts = taxi.select([sum(col(column).isNull().cast("int")).alias(column) for column in taxi.columns])



+----------+-------+--------------------+------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+-------+--------+--------+--------+----------+------------+-------+---------------+----------------+---------------+----------------+-----------------+----------------+
|unique_key|taxi_id|trip_start_timestamp|trip_end_timestamp|trip_seconds|trip_miles|pickup_census_tract|dropoff_census_tract|pickup_community_area|dropoff_community_area|   fare|    tips|   tolls|  extras|trip_total|payment_type|company|pickup_latitude|pickup_longitude|pickup_location|dropoff_latitude|dropoff_longitude|dropoff_location|
+----------+-------+--------------------+------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+-------+--------+--------+--------+----------+------------+-------+---------------+----------------+---------------+----------------+-----------------+-

                                                                                

In [None]:
null_counts.toPandas()

