In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

In [2]:
# Start (or reuse) a Spark session named "demo" with a local[*] master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("demo")
    .getOrCreate()
)
spark  # bare last expression so the notebook renders the session summary

23/07/03 10:44:13 WARN Utils: Your hostname, ZSCHN01LP0253L resolves to a loopback address: 127.0.1.1; using 192.168.98.237 instead (on interface wlp0s20f3)
23/07/03 10:44:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/07/03 10:44:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Build a small two-column DataFrame from in-memory rows with an explicit schema.
# The raw rows get their own name so that `data` only ever refers to a DataFrame.
rows = [("Pradeep", 1), ("Radee", 2)]
# Both fields are declared nullable (third argument True).
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Id", IntegerType(), True),
])
data = spark.createDataFrame(data=rows, schema=schema)
data.printSchema()


root
 |-- Name: string (nullable = true)
 |-- Id: integer (nullable = true)



In [5]:
# Print the rows as a text table; truncate=True shortens long string cells.
data.show(truncate=True)

+-------+---+
|   Name| Id|
+-------+---+
|Pradeep|  1|
|  Radee|  2|
+-------+---+



In [7]:
# Load the employees CSV: the first line is the header and column types are
# inferred from the values. `data` is rebound to this DataFrame from here on.
data = spark.read.csv(
    "../Data/employees.csv",
    header=True,
    inferSchema=True,
)
data.show(1)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
only showing top 1 row



In [8]:
# Show the column names and the types that were inferred while reading the CSV.
# NOTE(review): COMMISSION_PCT and MANAGER_ID come out as string; presumably the
# "-" placeholder values prevent numeric inference — confirm against the file.
data.printSchema()

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- COMMISSION_PCT: string (nullable = true)
 |-- MANAGER_ID: string (nullable = true)
 |-- DEPARTMENT_ID: integer (nullable = true)



In [9]:
# Project EMPLOYEE_ID and SALARY by passing the column names as plain strings.
data.select("EMPLOYEE_ID","SALARY").show(1)

+-----------+------+
|EMPLOYEE_ID|SALARY|
+-----------+------+
|        198|  2600|
+-----------+------+
only showing top 1 row



In [10]:
# Project EMPLOYEE_ID and SALARY, referring to each column as an attribute of
# the DataFrame.
data.select(data.EMPLOYEE_ID,data.SALARY).show(1)

+-----------+------+
|EMPLOYEE_ID|SALARY|
+-----------+------+
|        198|  2600|
+-----------+------+
only showing top 1 row



In [11]:
# Project EMPLOYEE_ID and SALARY, looking each column up with bracket indexing
# on the DataFrame.
data.select(data["EMPLOYEE_ID"],data["SALARY"]).show(1)

+-----------+------+
|EMPLOYEE_ID|SALARY|
+-----------+------+
|        198|  2600|
+-----------+------+
only showing top 1 row



In [13]:
from pyspark.sql.functions import col

# col() builds a column expression from a name; alias() renames that column in
# the result, so EMPLOYEE_ID is displayed as "eid".
data.select(
    col("EMPLOYEE_ID").alias("eid"),
    col("SALARY"),
).show(1)

+---+------+
|eid|SALARY|
+---+------+
|198|  2600|
+---+------+
only showing top 1 row



In [16]:
# Derive NewSal = SALARY + 100 and display it next to the original salary.
# The result is only shown; `data` itself is not reassigned.
(
    data
    .withColumn("NewSal", col("SALARY") + 100)
    .select("EMPLOYEE_ID", "SALARY", "NewSal")
    .show(1)
)

+-----------+------+------+
|EMPLOYEE_ID|SALARY|NewSal|
+-----------+------+------+
|        198|  2600|  2700|
+-----------+------+------+
only showing top 1 row



In [17]:
# Rename SALARY to NewSal in the displayed result; all other columns are carried
# through as-is, so no explicit select of every column is needed.
data.withColumnRenamed("SALARY", "NewSal").show(1)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|NewSal|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
only showing top 1 row



In [18]:
# Show the frame without the two named columns. The result is not assigned, so
# `data` keeps all of its columns.
data.drop("SALARY","COMMISSION_PCT").show(1)

+-----------+----------+---------+--------+------------+---------+--------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|       124|           50|
+-----------+----------+---------+--------+------------+---------+--------+----------+-------------+
only showing top 1 row



In [24]:
# Keep rows whose SALARY is not 5000, with the condition written as a SQL
# expression string.
data.filter("SALARY != 5000").show(1)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
only showing top 1 row



In [23]:
# The same "SALARY is not 5000" condition expressed with three column syntaxes:
# attribute access, bracket lookup and col(). Each one is applied in turn.
salary_not_5000 = (
    data.SALARY != 5000,
    data["SALARY"] != 5000,
    col("SALARY") != 5000,
)
for condition in salary_not_5000:
    data.filter(condition).show(1)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
only showing top 1 row

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------

In [26]:
# Collapse rows that are identical across every column, then show five of them.
data.distinct().show(5)

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        120|   Matthew|    Weiss|  MWEISS|650.123.1234|18-JUL-04|    ST_MAN|  8000|            - |       100|           50|
|        118|       Guy|   Himuro| GHIMURO|515.127.4565|15-NOV-06|  PU_CLERK|  2600|            - |       114|           30|
|        110|      John|     Chen|   JCHEN|515.124.4269|28-SEP-05|FI_ACCOUNT|  8200|            - |       108|          100|
|        123|    Shanta|  Vollman|SVOLLMAN|650.123.4234|10-OCT-05|    ST_MAN|  6500|            - |       100|           50|
|        124|     Kevin|  Mourgos|KMOURGOS|650.123.5234|16-NOV-07|    ST_MAN|  5800|            - |       100|           50|


In [27]:
# De-duplicate using only the listed columns as the key (instead of comparing
# every column), then show five rows.
key_columns = ["EMPLOYEE_ID", "FIRST_NAME"]
data.dropDuplicates(key_columns).show(5)

+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|17-JUN-03|AD_PRES| 24000|            - |        - |           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|21-SEP-05|  AD_VP| 17000|            - |       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|13-JAN-01|  AD_VP| 17000|            - |       100|           90|
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|03-JAN-06|IT_PROG|  9000|            - |       102|           60|
|        104|     Bruce|    Ernst|  BERNST|590.423.4568|21-MAY-07|IT_PROG|  6000|            - |       103|           60|
+-----------+----------+

In [36]:
# Import only the names that are actually used instead of `import *`, which
# silently replaces many Python builtins (sum, min, round, abs, ...) with Spark
# column functions. `max` still shadows the builtin on purpose: later cells call
# max(), when() and udf() as Spark functions.
from pyspark.sql.functions import count, max, udf, when

# count() tallies the rows that have a non-null SALARY.
# NOTE(review): the alias says "Avgsal" although this is a count; confirm
# whether avg("SALARY") or a different alias was intended.
data.select(count("SALARY").alias("Avgsal")).show(5)

+------+
|Avgsal|
+------+
|    50|
+------+



In [39]:
# List the five highest salaries, largest first.
(
    data
    .select("SALARY")
    .orderBy(col("SALARY").desc())
    .show(5)
)

+------+
|SALARY|
+------+
| 24000|
| 17000|
| 17000|
| 13000|
| 12008|
+------+
only showing top 5 rows



In [43]:
# Largest SALARY within each EMPLOYEE_ID group, reported as MaxSal.
# `max` must be the Spark aggregate from pyspark.sql.functions here, not the
# Python builtin.
# NOTE(review): EMPLOYEE_ID looks like a unique key, which would make every
# group a single row; confirm whether a column such as DEPARTMENT_ID was meant.
(
    data
    .groupBy("EMPLOYEE_ID")
    .agg(max("SALARY").alias("MaxSal"))
    .show()
)

+-----------+------+
|EMPLOYEE_ID|MaxSal|
+-----------+------+
|        137|  3600|
|        133|  3300|
|        108| 12008|
|        101| 17000|
|        115|  3100|
|        126|  2700|
|        103|  9000|
|        128|  2200|
|        122|  7900|
|        111|  7700|
|        140|  2500|
|        132|  2100|
|        206|  8300|
|        205| 12008|
|        139|  2700|
|        120|  8000|
|        117|  2800|
|        112|  7800|
|        127|  2400|
|        202|  6000|
+-----------+------+
only showing top 20 rows



In [54]:
# Bucket every salary into a numeric band stored in NewSal:
#   1 -> below 5000
#   2 -> from 5000 up to, but not including, 10000
#   3 -> everything else (10000 and above, or a missing salary)
# The lower bound of band 2 is inclusive so that a salary of exactly 5000 lands
# in band 2 rather than falling through to the catch-all band 3.
salary_band = (
    when(col("SALARY") < 5000, 1)
    .when((col("SALARY") >= 5000) & (col("SALARY") < 10000), 2)
    .otherwise(3)
)
data.withColumn("NewSal", salary_band).select("SALARY", "NewSal").show(10)

+------+------+
|SALARY|NewSal|
+------+------+
|  2600|     1|
|  2600|     1|
|  4400|     1|
| 13000|     3|
|  6000|     2|
|  6500|     2|
| 10000|     3|
| 12008|     3|
|  8300|     2|
| 24000|     3|
+------+------+
only showing top 10 rows



In [60]:
# Register the DataFrame as a temporary view so it can be queried with Spark
# SQL, then read five rows back through a SQL statement.
# NOTE(review): the view name is spelled "emplyoee1"; if it is ever corrected,
# the registration and every query that names the view must change together.
data.createOrReplaceTempView("emplyoee1")
spark.sql("select * from emplyoee1").show(5)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03| AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|  MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|  MK_REP|  6000|            - |       201|           20|
+-----------+---

In [61]:
# Read the employees CSV into `emp` (header row, inferred column types) for the
# join with the departments data.
emp = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../Data/employees.csv")
)
emp.printSchema()

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- COMMISSION_PCT: string (nullable = true)
 |-- MANAGER_ID: string (nullable = true)
 |-- DEPARTMENT_ID: integer (nullable = true)



In [62]:
# Read the departments CSV into `dept` (header row, inferred column types).
dept = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../Data/departments.csv")
)
dept.printSchema()

root
 |-- DEPARTMENT_ID: integer (nullable = true)
 |-- DEPARTMENT_NAME: string (nullable = true)
 |-- MANAGER_ID: string (nullable = true)
 |-- LOCATION_ID: integer (nullable = true)



In [70]:
# Full outer join of employees and departments on DEPARTMENT_ID.
# Because the join condition is a column expression, both inputs keep their own
# DEPARTMENT_ID and MANAGER_ID columns, so those names appear twice in the result.
emp.join(
    dept,
    on=emp.DEPARTMENT_ID == dept.DEPARTMENT_ID,
    how="full",
).show(1)

+-----------+----------+---------+-------+------------+---------+-------+------+--------------+----------+-------------+-------------+---------------+----------+-----------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|  EMAIL|PHONE_NUMBER|HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|DEPARTMENT_ID|DEPARTMENT_NAME|MANAGER_ID|LOCATION_ID|
+-----------+----------+---------+-------+------------+---------+-------+------+--------------+----------+-------------+-------------+---------------+----------+-----------+
|        200|  Jennifer|   Whalen|JWHALEN|515.123.4444|17-SEP-03|AD_ASST|  4400|            - |       101|           10|           10| Administration|       200|       1700|
+-----------+----------+---------+-------+------------+---------+-------+------+--------------+----------+-------------+-------------+---------------+----------+-----------+
only showing top 1 row



In [77]:
@udf(returnType=StringType())
def uppercase(value):
    """Return ``value`` converted to upper case, as a Spark string UDF.

    Args:
        value: The string taken from the input column for one row, or ``None``
            when that row holds SQL NULL.

    Returns:
        The upper-cased string, or ``None`` when ``value`` is ``None``.
    """
    # NULL column values reach a Python UDF as None; pass them through instead
    # of failing with AttributeError on None.upper().
    if value is None:
        return None
    return value.upper()



In [79]:
# Apply the uppercase UDF to LAST_NAME and show the result, labelled "ucase",
# next to FIRST_NAME.
data.select(
    "FIRST_NAME",
    uppercase("LAST_NAME").alias("ucase"),
).show(5)

+----------+---------+
|FIRST_NAME|    ucase|
+----------+---------+
|    Donald| OCONNELL|
|   Douglas|    GRANT|
|  Jennifer|   WHALEN|
|   Michael|HARTSTEIN|
|       Pat|      FAY|
+----------+---------+
only showing top 5 rows

