In [0]:
# Location of the source CSV file; this cell only defines the path, and the
# following cell passes it to spark.read.csv to do the actual load.
file_path = "/FileStore/tables/customers.csv"  # Update this path as needed

In [0]:
# Read the CSV at file_path into a DataFrame. The first row is treated as the
# header (column names) and column types are inferred from the data.
customersDF = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

# Preview the first five rows to sanity-check the columns and inferred values.
customersDF.show(5)

+-----------+----------+---------+--------+------------+----------+-------+-------+--------------+----------+-------------+
|employee_id|first_name|last_name|   email|phone_number| hire_date| job_id| salary|commission_pct|manager_id|department_id|
+-----------+----------+---------+--------+------------+----------+-------+-------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|1987-06-17|AD_PRES|24000.0|          NULL|      NULL|           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|1989-09-21|  AD_VP|17000.0|          NULL|       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|1993-01-13|  AD_VP|17000.0|          NULL|       100|           90|
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|1990-01-03|IT_PROG| 9000.0|          NULL|       102|           60|
|        104|     Bruce|    Ernst|  BERNST|590.423.4568|1991-05-21|IT_PROG| 6000.0|          NULL|       103|           60|
+-------

In [0]:
# Expose the DataFrame to SQL under the view name "customers" so that the
# %sql cells and spark.sql() calls further down can reference it in FROM.
# NOTE(review): the loaded rows are employee records (employee_id, hire_date,
# salary, ...) even though the view is called "customers" — confirm the name
# is intentional before renaming, since every later query depends on it.
customersDF.createOrReplaceTempView("customers")

In [0]:
# %sql is a notebook magic command that enables you to execute SQL queries on tables directly in a cell.

In [0]:


%sql
-- Employees whose salary is strictly greater than 10000.
SELECT
  employee_id,
  first_name,
  last_name,
  salary
FROM
  customers
WHERE
  salary > 10000;

employee_id,first_name,last_name,salary
100,Steven,King,24000.0
101,Neena,Kochhar,17000.0
102,Lex,De Haan,17000.0
108,Nancy,Greenberg,12000.0
114,Den,Raphaely,11000.0
145,John,Russell,14000.0
146,Karen,Partners,13500.0
147,Alberto,Errazuriz,12000.0
148,Gerald,Cambrault,11000.0
149,Eleni,Zlotkey,10500.0


In [0]:
# Same salary filter as the %sql cell, but run through spark.sql() so the
# result comes back as a DataFrame that Python code can keep working with.
# The adjacent string literals are joined into a single SQL statement.
query1 = spark.sql(
    "SELECT employee_id, first_name, last_name, salary "
    "FROM customers "
    "WHERE salary > 10000"
)
query1.show()

+-----------+----------+---------+-------+
|employee_id|first_name|last_name| salary|
+-----------+----------+---------+-------+
|        100|    Steven|     King|24000.0|
|        101|     Neena|  Kochhar|17000.0|
|        102|       Lex|  De Haan|17000.0|
|        108|     Nancy|Greenberg|12000.0|
|        114|       Den| Raphaely|11000.0|
|        145|      John|  Russell|14000.0|
|        146|     Karen| Partners|13500.0|
|        147|   Alberto|Errazuriz|12000.0|
|        148|    Gerald|Cambrault|11000.0|
|        149|     Eleni|  Zlotkey|10500.0|
|        162|     Clara|  Vishney|10500.0|
|        168|      Lisa|     Ozer|11500.0|
|        174|     Ellen|     Abel|11000.0|
|        201|   Michael|Hartstein|13000.0|
|        205|   Shelley|  Higgins|12000.0|
+-----------+----------+---------+-------+



In [0]:
%sql
-- Number of rows (employees) in each department.
SELECT
  department_id,
  COUNT(*) AS employee_count
FROM
  customers
GROUP BY
  department_id;

department_id,employee_count
30.0,6
110.0,2
100.0,7
70.0,1
90.0,4
60.0,5
40.0,1
20.0,2
10.0,1
80.0,32


In [0]:
# Per-department headcount via spark.sql(); the result is a DataFrame with
# one row per department_id value. The adjacent string literals are joined
# into a single SQL statement.
query2 = spark.sql(
    "SELECT department_id, COUNT(*) AS employee_count "
    "FROM customers "
    "GROUP BY department_id"
)
query2.show()

+-------------+--------------+
|department_id|employee_count|
+-------------+--------------+
|           30|             6|
|          110|             2|
|          100|             7|
|           70|             1|
|           90|             4|
|           60|             5|
|           40|             1|
|           20|             2|
|           10|             1|
|           80|            32|
|         NULL|             1|
|           50|            45|
+-------------+--------------+



In [0]:
%sql
-- Five highest-paid employees.
-- Several employees share the same salary (e.g. two rows at 17000.0), so
-- salary alone does not define a total order. employee_id is added as a
-- secondary sort key so that the rows returned by LIMIT, and their order,
-- are the same on every run.
SELECT employee_id, first_name, last_name, salary
FROM customers
ORDER BY salary DESC, employee_id ASC
LIMIT 5;


employee_id,first_name,last_name,salary
100,Steven,King,24000.0
101,Neena,Kochhar,17000.0
102,Lex,De Haan,17000.0
145,John,Russell,14000.0
146,Karen,Partners,13500.0


In [0]:
# Count the number of employees hired in each year.

In [0]:
%sql
-- Headcount by hiring year, listed from the earliest year to the latest.
SELECT
  YEAR(hire_date) AS hire_year,
  COUNT(*) AS employee_count
FROM
  customers
GROUP BY
  YEAR(hire_date)
ORDER BY
  hire_year;

hire_year,employee_count
1987,2
1989,1
1990,1
1991,1
1993,1
1994,7
1995,4
1996,10
1997,28
1998,23
