In [0]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse the one that already exists.

spark = (
    SparkSession.builder
    .appName("Spark Dataframe")
    .getOrCreate()
)

In [0]:
%fs ls /FileStore/tables/Prajwal 

In [0]:
# Load the flights CSV into a DataFrame: the first line is treated as the
# header and the column types are inferred from the data.

df = (
    spark.read.format("csv")
    .options(header="true", inferschema="true")
    .load("/FileStore/tables/Prajwal/flights.csv")
)

display(df)



In [0]:
# Print the schema of the flights DataFrame (column names, types, nullability) as a tree.
df.printSchema()

In [0]:
df.columns   # list only the column names (no types) as the cell output

In [0]:
# Add an "airport" column holding the lower-cased DESTINATION_AIRPORT value.
df = df.withColumn(
    "airport",
    lower(col("DESTINATION_AIRPORT")),
)

# Show only the AIRLINE and airport columns.
df.select(["AIRLINE", "airport"]).show()

In [0]:
# Drop the original DESTINATION_AIRPORT column; its lower-cased copy is kept
# in the "airport" column created in the previous cell.
# NOTE(review): `df` is reassigned here, so re-running the previous cell (which
# reads DESTINATION_AIRPORT) after this one will fail until the data is reloaded.
df = df.drop("DESTINATION_AIRPORT")
display(df)

In [0]:
# Register the DataFrame as the temporary view "flights" so it can be queried
# with spark.sql(...) and in %sql cells. The view is created from `df` as it is
# at this point; re-run this cell after changing `df` if the view should follow.
df.createOrReplaceTempView("flights")

In [0]:
# Count the distinct rows of the "flights" view whose AIR_TIME is less than 50.
short_flights = spark.sql("SELECT * FROM flights WHERE AIR_TIME < 50 ")

# count() returns a plain Python int. display() is meant for DataFrames and
# visualisations and can reject an int on some Databricks runtimes, so the
# count is left as the cell's last expression and rendered as its output.
short_flights.distinct().count()

In [0]:
%sql
-- Count the rows with AIR_TIME < 50 that have a non-NULL AIRLINE.
-- No DISTINCT is needed around the aggregate: without GROUP BY, COUNT(...)
-- already returns exactly one row, so DISTINCT(COUNT(...)) changes nothing.
-- NOTE(review): this is not the same figure as the distinct-row count computed
-- with spark.sql in the previous cell -- confirm which one is intended.

SELECT COUNT(AIRLINE) FROM flights WHERE AIR_TIME < 50;

In [0]:
%sql
-- Chart visualisation using the %sql magic: number of rows with a non-NULL
-- AIRLINE per (DAY, ORIGIN_AIRPORT).
-- GROUP BY already yields exactly one row per (DAY, ORIGIN_AIRPORT) pair, so a
-- SELECT DISTINCT on top of it is redundant and has been left out.

SELECT DAY, ORIGIN_AIRPORT, count(AIRLINE)
FROM flights
GROUP BY DAY, ORIGIN_AIRPORT

Databricks visualization. Run in Databricks to view.

In [0]:
# Show the rows whose DAY value is below 3 (days 1 and 2, assuming DAY starts at 1).
# Only the output is affected; `df` itself is not reassigned.
df.where("DAY < 3").show()

In [0]:
# Add a new COLUMN (not a row) named ROW_ID using withColumn.
# monotonically_increasing_id() produces unique, increasing 64-bit ids; they
# are not guaranteed to be consecutive.
df = df.withColumn("ROW_ID", monotonically_increasing_id())
display(df.select("ROW_ID"))

In [0]:
# Replace the numeric CANCELLED flag with a readable label.
# In the flight-delay data a value of 1 marks a cancelled flight and 0 a flight
# that was not cancelled, so 1 -> "CANCELLED" and anything else (including
# NULL) -> "NOT CANCELLED". Testing for 0 here would invert the labels.
# NOTE(review): confirm the 0/1 encoding against the data dictionary of flights.csv.
# NOTE: the column is overwritten with strings, so this cell must run only once
# per load; a second run would label every row "NOT CANCELLED".

df = df.withColumn("CANCELLED", when(col("CANCELLED") == 1, "CANCELLED").otherwise("NOT CANCELLED"))
display(df.select("CANCELLED"))

In [0]:
# Keep only the rows that have a non-NULL AIRLINE_DELAY.
df = df.filter(col("AIRLINE_DELAY").isNotNull())
# print(df) would only emit the DataFrame's repr (column names and types),
# not any rows; display() renders the filtered data itself.
display(df)

In [0]:
# Discard rows whose WEATHER_DELAY is NULL and show the first remaining row.
# The result is only shown; `df` is not reassigned.
df.dropna(subset=["WEATHER_DELAY"]).show(1)

# how="all": discard only the rows in which EVERY column is NULL, then show
# the first remaining row. Again, `df` is not reassigned.
df.dropna(how="all").show(1)

In [0]:
# Small in-memory example dataset used to demonstrate duplicate removal.
# ("James", "Sales", 3000) appears twice, and several rows share the same
# department and salary.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Build a DataFrame with the three named columns and render it.
datadf = spark.createDataFrame(simpleData, schema=["name", "department", "salary"])
display(datadf)

In [0]:
# Keep a single row per (department, salary) pair. Rows that differ only in
# "name" (e.g. Robert and Saif, both Sales / 4100) are collapsed as well.
# NOTE(review): which of the duplicate rows survives is presumably not
# guaranteed -- do not rely on a particular name being kept.
datadf = datadf.drop_duplicates(subset=["department", "salary"])
display(datadf)

In [0]:
datadf = datadf.withColumnRenamed("name","employee_name") # rename the "name" column to "employee_name"
display(datadf)

In [0]:
# Average salary per department, computed on the de-duplicated example data.
display(datadf.groupBy("department").agg({"salary": "avg"}))


In [0]:
%fs ls /FileStore/tables/Prajwal/department.json


In [0]:
#importing StructType and StructField (plus the column types) to define the schema for the JSON file
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
%python
# defining input path
input_file = 'dbfs:/FileStore/tables/Prajwal/department.json'

# defining schema for json file
schema = StructType([
    StructField("dept", StringType(), True),
    StructField("ssn", StringType(), True)
])

# loading json file with schema
json_df = spark.read.json(input_file, schema=schema)

display(json_df)

In [0]:
#