In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
        
        

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



### Install PySpark

In [None]:
!pip install pyspark

### PySpark Package Import

In [None]:
#PySpark Package Import

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import time

from pyspark.sql.functions import monotonically_increasing_id,broadcast

In [None]:
# Attach to the Spark session already running in this kernel, or start a new one.
session_builder = SparkSession.builder
my_spark = session_builder.getOrCreate()
my_spark  # last expression: renders the session summary

### PySpark Version

In [None]:
# Report which Spark version the session is running.
spark_version = my_spark.version
print('Version', spark_version)

### Importing Files

In [None]:
%%time
flights = my_spark.read.csv('/kaggle/input/flight-delays/flights.csv',header=True)
airports = my_spark.read.csv('/kaggle/input/flight-delays/airports.csv',header=True)
airlines = my_spark.read.csv('/kaggle/input/flight-delays/airlines.csv',header=True)


### Printing the Schema

In [None]:
# Print the column names and data types of the flights DataFrame as a tree.
flights.printSchema()

### Type

In [None]:
# Show the Python class of the object that my_spark.read.csv returned.
type(flights)

### Column Information

In [None]:
# Inspect the (name, type) pair of the DISTANCE column before it is cast.
distance_only = flights.select(F.col('DISTANCE'))
distance_only.dtypes

### Converting String to Integer

In [None]:
# Replace the DISTANCE column with an integer-typed version of itself.
distance_as_int = F.col('DISTANCE').cast('integer')
flights = flights.withColumn('DISTANCE', distance_as_int)
flights

### Show the top 5 rows of the dataframe

In [None]:
# Preview the first five flight records.
flights.show(n=5)

### Top 5 records in Airports

In [None]:
# Preview the first five airport records.
airports.show(n=5)

### Top 5 records in Airlines

In [None]:
# Preview the first five airline records.
airlines.show(n=5)

### Creating a new Column

In [None]:
# Derive duration_hrs as AIR_TIME / 60 (AIR_TIME is presumably in minutes; confirm against the dataset docs).
hours_in_air = F.col('AIR_TIME') / 60
flights = flights.withColumn('duration_hrs', hours_in_air)
flights.show(n=5)

### Filtering Data in PySpark
We will keep only the flights whose DISTANCE is greater than 1000.

In [None]:
# Keep only the flights whose DISTANCE is greater than 1000.
is_long_distance = F.col('DISTANCE') > 1000
dist_flights = flights.where(is_long_distance)
dist_flights.show(n=1)

### Select Specific Column Data

In [None]:
# Narrow the long-distance flights down to the columns used in the rest of the notebook.
selected_columns = [
    'YEAR',
    'MONTH',
    'FLIGHT_NUMBER',
    'TAIL_NUMBER',
    'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT',
    'AIR_TIME',
    'DISTANCE',
]
dist_col = dist_flights.select(*selected_columns)
dist_col.show(n=5)

### Select with Filter based on Column Values

In [None]:
# Show flights whose destination airport code is PBI.
arrives_at_pbi = F.col('DESTINATION_AIRPORT') == 'PBI'
dist_col.where(arrives_at_pbi).show(n=5)

### Select with Filter based on Multiple Column Values

In [None]:
# Show flights that both depart from JFK and arrive at PBI.
departs_from_jfk = F.col('ORIGIN_AIRPORT') == 'JFK'
arrives_at_pbi = F.col('DESTINATION_AIRPORT') == 'PBI'
dist_col.where(departs_from_jfk & arrives_at_pbi).show(n=5)

### Select Column With Expression

In [None]:
# Project the existing columns plus a SQL expression that computes
# distance per hour of air time, exposed as Average_Speed.
speed_projection = [
    'YEAR',
    'MONTH',
    'FLIGHT_NUMBER',
    'TAIL_NUMBER',
    'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT',
    'AIR_TIME',
    'DISTANCE',
    'DISTANCE/(AIR_TIME /60)as Average_Speed',
]
dist_col.selectExpr(*speed_projection).show(n=5)

### Count on the Entire dataset

In [None]:
# Number of rows in dist_col; count() is an action, so this runs a Spark job.
dist_col.count()

### Groupby Columns and doing a Count

In [None]:
# Count the flights that depart from SEA.
sea_departures = dist_col.where(F.col('ORIGIN_AIRPORT') == 'SEA')
sea_departures.groupBy('ORIGIN_AIRPORT').count().show()

### Aggregating Columns With

- Minimum
- Maximum
- Average
- Sum

In [None]:
# Replace AIR_TIME with an integer-typed version so it can be aggregated numerically.
air_time_as_int = F.col('AIR_TIME').cast('integer')
dist_col = dist_col.withColumn('AIR_TIME', air_time_as_int)
dist_col

In [None]:
# Minimum DISTANCE over the whole DataFrame (an empty groupBy aggregates all rows).
route_columns = dist_col.select('ORIGIN_AIRPORT', 'DISTANCE', 'AIR_TIME')
route_columns.groupBy().min('DISTANCE').show()

In [None]:
# Maximum DISTANCE over the whole DataFrame; max() with no arguments
# aggregates every numeric column of the selection.
distance_only = dist_col.select('DISTANCE')
distance_only.groupBy().max().show()



In [None]:
# Average DISTANCE over the whole DataFrame.
distance_only = dist_col.select('DISTANCE')
distance_only.groupBy().avg().show()

### Another way to use the aggregation on Column

In [None]:
# Total DISTANCE as a plain Python value: collect the single aggregate row
# to the driver and take its first field.
total_rows = dist_col.groupBy().sum('DISTANCE').collect()
total_row = total_rows[0]
total_row[0]

### Count of flights from a particular origin to each destination

In [None]:
# For flights leaving SEA, count how many go to each destination airport.
sea_departures = dist_col.where(F.col('ORIGIN_AIRPORT') == 'SEA')
sea_departures.groupBy('DESTINATION_AIRPORT').count().show(n=5)

### Groupby on multiple Columns

In [None]:
# Group by month and origin airport; month_df is reused by the .agg() cell that follows.
group_keys = ['MONTH', 'ORIGIN_AIRPORT']
month_df = dist_col.groupBy(*group_keys)
month_df.avg('DISTANCE').show(n=5)

### .agg in PySpark.SQL.functions

In [None]:
# Same per-group average as above, expressed through agg() and a functions-module aggregate.
mean_distance = F.mean('DISTANCE')
month_df.agg(mean_distance).show(n=5)

### Checking the Number of Partitions for the dataframe

In [None]:
# Number of partitions of the RDD underlying the flights DataFrame.
flights.rdd.getNumPartitions()

### Filtering Columns with PySpark

In [None]:
# Show five distinct airport names.
airport_names = airports.select('AIRPORT')
airport_names.distinct().show(n=5)

### Filter based on Length of Airport Name

In [None]:
# Keep airports whose name is shorter than 15 characters.
has_short_name = F.length(F.col('AIRPORT')) < 15
airports.where(has_short_name).show()

### Filter airports whose name does not contain the word "Airport"

In [None]:
# Keep airports whose name does not contain the substring 'Airport'.
name_has_airport = F.col('AIRPORT').contains('Airport')
airports.where(~name_has_airport).show()

#### Conditional Statement Execution in PySpark DataFrame

In [None]:
# Label Texas airports; with no otherwise() branch, non-matching rows get null.
texas_label = F.when(F.col('STATE') == 'TX', 'Texas')
airports.withColumn('State Name', texas_label).show(n=5)

### Multiple When Statement in PySpark

In [None]:
# Chain several when() branches and fall back to 'N/A' for every other state.
# The GA label is spelled 'Georgia' (the original 'Georgio' was a typo that
# ended up in the output data).
airports.withColumn(
    'Flag',
    F.when(airports.STATE == 'TX', 'Texas')
     .when(airports.STATE == 'GA', 'Georgia')
     .otherwise('N/A'),
).show(5)

### Monotonically Increasing IDs

In [None]:
# Add an ID column of increasing, unique (not necessarily consecutive) 64-bit integers.
row_id = F.monotonically_increasing_id()
airports.withColumn('ID', row_id).show()

### Caching in Spark

In [None]:
# cache() is lazy: it only marks the DataFrame for caching. Nothing is read or
# stored until an action runs, so each timing below wraps a count() action.
dest_cache = flights.select('DESTINATION_AIRPORT').cache()

start_time = time.time()
first_count = dest_cache.count()   # first action: reads the source and fills the cache
print('First Call to cache', first_count, time.time() - start_time)

second_time = time.time()
second_count = dest_cache.count()  # second action: can be served from the cache
print('Second Call to the dataframe', second_count, time.time() - second_time)

### Clearing the Cache

In [None]:
# Report the cache flag, drop the DataFrame from the cache, then report the flag again.
cached_before = dest_cache.is_cached
print('Is the dataframe Cached?', cached_before)

dest_cache.unpersist()

cached_after = dest_cache.is_cached
print('Is the dataframe Cached?', cached_after)

### Joining two Dataframes

In [None]:
# Join airports to flights on the origin airport code.
# join() is a lazy transformation, so timing it alone only measures building
# the query plan. count() forces the join to execute so the elapsed time
# reflects the real work.
start_time = time.time()
df = airports.join(flights, airports['IATA_CODE'] == flights['ORIGIN_AIRPORT'])
df.count()
print('Time to Join the dataframe', time.time() - start_time)


### Explain Plan on the dataframe

In [None]:
# Print the physical plan Spark generated for the join stored in df.
df.explain()

In [None]:
# Same join, with an explicit broadcast hint.
# The hint belongs on the SMALL side (airports): the broadcast table is copied
# to every executor, so broadcasting the large flights table defeats the
# purpose and can exhaust memory. airports stays on the left so the output
# column order matches the plain join.
start_time = time.time()
df_broadcast = broadcast(airports).join(flights, airports['IATA_CODE'] == flights['ORIGIN_AIRPORT'])
df_broadcast.count()  # join() is lazy; force execution so the timing is meaningful
# Elapsed time is now minus start (the original subtraction was reversed and printed a negative value).
print('Time to execute', time.time() - start_time)

In [None]:
# Print the physical plan Spark generated for the join built with the broadcast hint.
df_broadcast.explain()

### Create a Temporary Table in PySpark

In [None]:
# Register airports as a session-scoped temporary view so it can be queried with SQL.
airports.createOrReplaceTempView('Airports_tbl')

all_airports_query = 'Select * from Airports_tbl'
my_spark.sql(all_airports_query).show()

In [None]:
# Query the temporary view for airports located in Pennsylvania (state code PA).
pa_airports_query = 'select * from airports_tbl where state="PA"'
my_spark.sql(pa_airports_query).show()