# ETL stands for

# Extract: extract the data from the different sources

# Transform: Transform the unstructured data into structured data. Transformations like cleaning, manipulation, etc.

# Load : Load the transformed data into a location or date warehouse.


In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, floor, rand

spark = SparkSession.builder.appName("ETLPractice").getOrCreate()

source_path = "/content/orders (1).csv"  # Correct path
target_path = "/content/order_result.csv"  # Output path

# Use the correct source path here
load_data = spark.read.csv(source_path, header=True, inferSchema=True)


In [16]:
load_data.columns
load_data.show(5)


+-------+----------+----------+----------+-----------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|
+-------+----------+----------+----------+-----------+
|      1|      john|       doe|         5|     active|
|      2|      jane|     smith|         8|     active|
|      3|   micheal|   jhonson|         3|   inactive|
|      4|      abhi|   wiliams|         1|     active|
|      5|       ram|     brown|         4|   inactive|
+-------+----------+----------+----------+-----------+
only showing top 5 rows



In [17]:
 #Transformation 1: Concatenate First and Last Names
load_data = load_data.withColumn('full_name', concat(col('cust_fname'), lit(' '), col('cust_lname')))
load_data.show(10)

+-------+----------+----------+----------+-----------+---------------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|      full_name|
+-------+----------+----------+----------+-----------+---------------+
|      1|      john|       doe|         5|     active|       john doe|
|      2|      jane|     smith|         8|     active|     jane smith|
|      3|   micheal|   jhonson|         3|   inactive|micheal jhonson|
|      4|      abhi|   wiliams|         1|     active|   abhi wiliams|
|      5|       ram|     brown|         4|   inactive|      ram brown|
|      6|     emily|  anderson|         2|     active| emily anderson|
|      7|   william|     jones|        10|     active|  william jones|
|      8|     susan|     davis|         7|   inactive|    susan davis|
|      9|     david|    miller|         9|     active|   david miller|
|     10|      sara|     moore|         2|   inactive|     sara moore|
+-------+----------+----------+----------+-----------+---------------+
only s

In [18]:
# Transformation 2: Calculate Net Salary (subtract 10% as taxes)
load_data = load_data.withColumn('net_salary', floor(lit(10000) + rand() * lit(50)))
load_data.show(10)

+-------+----------+----------+----------+-----------+---------------+----------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|      full_name|net_salary|
+-------+----------+----------+----------+-----------+---------------+----------+
|      1|      john|       doe|         5|     active|       john doe|     10044|
|      2|      jane|     smith|         8|     active|     jane smith|     10016|
|      3|   micheal|   jhonson|         3|   inactive|micheal jhonson|     10049|
|      4|      abhi|   wiliams|         1|     active|   abhi wiliams|     10002|
|      5|       ram|     brown|         4|   inactive|      ram brown|     10002|
|      6|     emily|  anderson|         2|     active| emily anderson|     10018|
|      7|   william|     jones|        10|     active|  william jones|     10012|
|      8|     susan|     davis|         7|   inactive|    susan davis|     10022|
|      9|     david|    miller|         9|     active|   david miller|     10029|
|     10|      s

In [19]:
#adding age column
load_data = load_data.withColumn('age', floor(lit(20) + rand() * lit(31)))
load_data.show(10)

+-------+----------+----------+----------+-----------+---------------+----------+---+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|      full_name|net_salary|age|
+-------+----------+----------+----------+-----------+---------------+----------+---+
|      1|      john|       doe|         5|     active|       john doe|     10044| 38|
|      2|      jane|     smith|         8|     active|     jane smith|     10016| 26|
|      3|   micheal|   jhonson|         3|   inactive|micheal jhonson|     10049| 27|
|      4|      abhi|   wiliams|         1|     active|   abhi wiliams|     10002| 46|
|      5|       ram|     brown|         4|   inactive|      ram brown|     10002| 43|
|      6|     emily|  anderson|         2|     active| emily anderson|     10018| 21|
|      7|   william|     jones|        10|     active|  william jones|     10012| 43|
|      8|     susan|     davis|         7|   inactive|    susan davis|     10022| 48|
|      9|     david|    miller|         9|     active|

In [20]:
# # Transformation 3: Filter by Age (age >= 30)
load_data = load_data.filter(col('age')>= 30)
load_data.show()


+-------+----------+----------+----------+-----------+--------------+----------+---+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|     full_name|net_salary|age|
+-------+----------+----------+----------+-----------+--------------+----------+---+
|      1|      john|       doe|         5|     active|      john doe|     10044| 38|
|      4|      abhi|   wiliams|         1|     active|  abhi wiliams|     10002| 46|
|      5|       ram|     brown|         4|   inactive|     ram brown|     10002| 43|
|      7|   william|     jones|        10|     active| william jones|     10012| 43|
|      8|     susan|     davis|         7|   inactive|   susan davis|     10022| 48|
|      9|     david|    miller|         9|     active|  david miller|     10029| 44|
|     11|     james|    tailor|         5|   inactive|  james tailor|     10016| 35|
|     12|    olivia|    wilson|         3|   inactive| olivia wilson|     10003| 47|
|     13|    robert|     evans|        11|     active|  robert ev

In [21]:
# Transformation 4: Group by Age and Calculate Average Salary
avg_salary_by_age = load_data.groupBy('age').agg({'net_salary' :'avg'}).withColumnRenamed('avg(salary)', 'avg_salary')
avg_salary_by_age.show()

+---+---------------+
|age|avg(net_salary)|
+---+---------------+
| 43|        10007.0|
| 39|        10031.0|
| 48|        10021.5|
| 44|        10025.5|
| 35|        10016.0|
| 36|        10007.0|
| 38|        10044.0|
| 46|        10002.0|
| 47|        10003.0|
+---+---------------+



In [22]:
load_data = load_data.orderBy("age")
load_data.show()

+-------+----------+----------+----------+-----------+--------------+----------+---+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|     full_name|net_salary|age|
+-------+----------+----------+----------+-----------+--------------+----------+---+
|     11|     james|    tailor|         5|   inactive|  james tailor|     10016| 35|
|     13|    robert|     evans|        11|     active|  robert evans|     10007| 36|
|      1|      john|       doe|         5|     active|      john doe|     10044| 38|
|     16|  isabella|     white|         6|   inactive|isabella white|     10031| 39|
|      5|       ram|     brown|         4|   inactive|     ram brown|     10002| 43|
|      7|   william|     jones|        10|     active| william jones|     10012| 43|
|      9|     david|    miller|         9|     active|  david miller|     10029| 44|
|     17|    joseph|    martin|         4|   inactive| joseph martin|     10022| 44|
|      4|      abhi|   wiliams|         1|     active|  abhi wili

In [23]:
# Save the transformed data to an external CSV file
load_data.write.csv(target_path, mode='overwrite', header=True)

Overview
This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. DBFS is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in Python so the default cell type is Python. However, you can use different languages by using the %LANGUAGE syntax. Python, Scala, SQL, and R are all supported.

In [2]:
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("ETLPractice").getOrCreate()

# File location and type
file_location = "/content/small_zipcode.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# Read CSV
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

# Create temp view
df.createOrReplaceTempView("tempdata")

# Show the data
df.show()


+---+-------+--------+-------------------+-----+----------+
|_c0|    _c1|     _c2|                _c3|  _c4|       _c5|
+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               NULL|   TX|      NULL|
+---+-------+--------+-------------------+-----+----------+



In [3]:
spark.sql("select * from tempdata").show()
df.select("_c0","_c1").show(5)

+---+-------+--------+-------------------+-----+----------+
|_c0|    _c1|     _c2|                _c3|  _c4|       _c5|
+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               NULL|   TX|      NULL|
+---+-------+--------+-------------------+-----+----------+

+---+-------+
|_c0|    _c1|
+---+-------+
| id|zipcode|
|  1|    704|
|  2|    704|
|  3|    709|
|  4|  76166|
+---+-------+
only showing top 5 rows



In [4]:
spark.sql("""SELECT * From tempdata WHERE _c4='AZ'""").show(5)

+---+---+---+---+---+---+
|_c0|_c1|_c2|_c3|_c4|_c5|
+---+---+---+---+---+---+
+---+---+---+---+---+---+



In [8]:
from pyspark.sql import SparkSession

# Create Spark session (if not already done)
spark = SparkSession.builder.appName("ETLPractice").getOrCreate()

# File location and type
file_location = "/content/small_zipcode.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# Read CSV file
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

# Display the dataframe
df.show()

# Register as temporary view
df.createOrReplaceTempView("customer")


+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               NULL|   TX|      NULL|
+---+-------+--------+-------------------+-----+----------+



In [10]:
spark.sql("select * from customer").show()
df.select("id", "state").show(5)

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               NULL|   TX|      NULL|
+---+-------+--------+-------------------+-----+----------+

+---+-----+
| id|state|
+---+-----+
|  1|   PR|
|  2|   PR|
|  3|   PR|
|  4|   TX|
|  5|   TX|
+---+-----+



In [11]:
spark.sql("""SELECT * From customer WHERE state='PR'""").show(5)

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
+---+-------+--------+-------------------+-----+----------+



In [12]:
spark.sql("""select * FROM customer WHERE state in ('PR','AZ','FL')order by state """).show(10
                                                                                            )

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
+---+-------+--------+-------------------+-----+----------+



In [13]:
spark.sql("""SELECT state,count(*) as count FROM customer GROUP BY state""").show()

+-----+-----+
|state|count|
+-----+-----+
|   TX|    2|
|   PR|    3|
+-----+-----+

