### Create the DataFrame

In [2]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# Sample employee records: (Emp_id, Ename, JobDesc, Salary)
data = [
    (1, "Ram", "Admin", 10000),
    (2, "Fareed", "Manager", 15000),
    (3, "Sharukh", "Sales", 10000),
    (4, "Aravind", "Sales", 20000),
    (5, "Seenu", "HR", 5000),
    (6, "Somu", "Manager", 25000),
    (7, "Mohamed", "Engineer", 30000),
    (8, "Aflal", "Manager", 17000),
    (9, "Abdul", "CEO", 50000),
    (10, "Zara", "Admin", 28000),
]

# Explicit schema; every column is declared nullable
columns = (
    StructType()
    .add("Emp_id", IntegerType(), True)
    .add("Ename", StringType(), True)
    .add("JobDesc", StringType(), True)
    .add("Salary", IntegerType(), True)
)

df = spark.createDataFrame(data, schema=columns)
df.show()

StatementMeta(, 48f0b169-0a3c-4d9e-b2ae-752540b01924, 4, Finished, Available, Finished)

+------+-------+--------+------+
|Emp_id|  Ename| JobDesc|Salary|
+------+-------+--------+------+
|     1|    Ram|   Admin| 10000|
|     2| Fareed| Manager| 15000|
|     3|Sharukh|   Sales| 10000|
|     4|Aravind|   Sales| 20000|
|     5|  Seenu|      HR|  5000|
|     6|   Somu| Manager| 25000|
|     7|Mohamed|Engineer| 30000|
|     8|  Aflal| Manager| 17000|
|     9|  Abdul|     CEO| 50000|
|    10|   Zara|   Admin| 28000|
+------+-------+--------+------+



### Distinct

In [5]:
# Show each JobDesc value once (duplicates removed)
df.select(df["JobDesc"]).distinct().show()

StatementMeta(, 48f0b169-0a3c-4d9e-b2ae-752540b01924, 7, Finished, Available, Finished)

+--------+
| JobDesc|
+--------+
|   Admin|
| Manager|
|   Sales|
|      HR|
|Engineer|
|     CEO|
+--------+



#### Limit

In [11]:
# Show only the first two rows
df.show(n=2)

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 13, Finished, Available, Finished)

+------+------+-------+------+
|Emp_id| Ename|JobDesc|Salary|
+------+------+-------+------+
|     1|   Ram|  Admin| 10000|
|     2|Fareed|Manager| 15000|
+------+------+-------+------+
only showing top 2 rows



In [18]:
# Show the last two rows (tail() yields Row objects, rendered here via display)
display(df.tail(num=2))

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 20, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a74ef4c9-70c7-4149-bcd8-d706da0370d5)

#### Filter

In [8]:
# Look up the Emp_id of the employee named "Fareed":
# keep the matching row first, then project the id column
fareed_id = df.filter(df["Ename"] == "Fareed").select("Emp_id")
fareed_id.show()


StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 10, Finished, Available, Finished)

+------+
|Emp_id|
+------+
|     2|
+------+



In [97]:
# Look up the salary of the employee named "Sharukh":
# keep the matching row first, then project the salary column
sharukh_salary = df.filter(df["Ename"] == "Sharukh").select("Salary")
sharukh_salary.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 99, Finished, Available, Finished)

+------+
|Salary|
+------+
| 10000|
+------+



In [89]:
# Display the employees whose salary is between 10000 and 20000.
# Column.between is inclusive on both ends, i.e. 10000 <= Salary <= 20000.
filter_df = df.filter(df["Salary"].between(10000, 20000))
filter_df.show()


StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 91, Finished, Available, Finished)

+------+-------+-------+------+
|Emp_id|  Ename|JobDesc|Salary|
+------+-------+-------+------+
|     1|    Ram|  Admin| 10000|
|     2| Fareed|Manager| 15000|
|     3|Sharukh|  Sales| 10000|
|     4|Aravind|  Sales| 20000|
|     8|  Aflal|Manager| 17000|
+------+-------+-------+------+



#### Like Operator

In [19]:
# Names that start with 'A' ('%' matches any run of characters)
name_starts_A = df.filter(df["Ename"].like("A%")).select("Ename")
name_starts_A.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 21, Finished, Available, Finished)

+-------+
|  Ename|
+-------+
|Aravind|
|  Aflal|
|  Abdul|
+-------+



In [20]:
# Names that end with 'd'
name_ends_with_d = df.filter(df["Ename"].like("%d")).select("Ename")
name_ends_with_d.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 22, Finished, Available, Finished)

+-------+
|  Ename|
+-------+
| Fareed|
|Aravind|
|Mohamed|
+-------+



In [26]:
# Names whose second character is 'a' ('_' matches exactly one character)
name_2_a = df.filter(df["Ename"].like("_a%")).select("Ename")
name_2_a.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 28, Finished, Available, Finished)

+------+
| Ename|
+------+
|   Ram|
|Fareed|
|  Zara|
+------+



In [31]:
# Names that start with 'A' and end with 'l'.
# Reference the column by its declared name "Ename": the lowercase "ename"
# only resolves while spark.sql.caseSensitive is left at its default (false).
name_start_end = df.select("Ename").filter(df["Ename"].like("A%l"))
name_start_end.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 33, Finished, Available, Finished)

+-----+
|Ename|
+-----+
|Aflal|
|Abdul|
+-----+



#### orderBy 

In [36]:
# Name and salary, sorted alphabetically by name (ascending is the default)
order_name = df.select("Ename", "Salary").orderBy("Ename")
order_name.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 38, Finished, Available, Finished)

+-------+------+
|  Ename|Salary|
+-------+------+
|  Abdul| 50000|
|  Aflal| 17000|
|Aravind| 20000|
| Fareed| 15000|
|Mohamed| 30000|
|    Ram| 10000|
|  Seenu|  5000|
|Sharukh| 10000|
|   Somu| 25000|
|   Zara| 28000|
+-------+------+



In [37]:
# Name and salary, sorted by salary from lowest to highest
order_salary = df.select("Ename", "Salary").orderBy("Salary")
order_salary.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 39, Finished, Available, Finished)

+-------+------+
|  Ename|Salary|
+-------+------+
|  Seenu|  5000|
|Sharukh| 10000|
|    Ram| 10000|
| Fareed| 15000|
|  Aflal| 17000|
|Aravind| 20000|
|   Somu| 25000|
|   Zara| 28000|
|Mohamed| 30000|
|  Abdul| 50000|
+-------+------+



In [90]:
# Name and salary, sorted by salary from highest to lowest.
# `col` and `when` are imported here and are also used by later cells.
from pyspark.sql.functions import col, when

des_order_salary = df.select("Ename", "Salary").orderBy("Salary", ascending=False)
des_order_salary.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 92, Finished, Available, Finished)

+-------+------+
|  Ename|Salary|
+-------+------+
|  Abdul| 50000|
|Mohamed| 30000|
|   Zara| 28000|
|   Somu| 25000|
|Aravind| 20000|
|  Aflal| 17000|
| Fareed| 15000|
|Sharukh| 10000|
|    Ram| 10000|
|  Seenu|  5000|
+-------+------+



In [44]:
# Show the five highest salaries: sort descending, then print only the top 5 rows
salary = df.select("Ename", "Salary").orderBy(df["Salary"].desc())
salary.show(n=5)

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 46, Finished, Available, Finished)

+-------+------+
|  Ename|Salary|
+-------+------+
|  Abdul| 50000|
|Mohamed| 30000|
|   Zara| 28000|
|   Somu| 25000|
|Aravind| 20000|
+-------+------+
only showing top 5 rows



#### count

In [56]:
# Count every row in the DataFrame
total_rows = df.count()
print("Number of rows :", total_rows)

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 58, Finished, Available, Finished)

Number of rows : 10


In [70]:
# Number of employees whose JobDesc is "Sales"
sales_job = df.filter(df["JobDesc"] == "Sales").count()
print("Sales count :", sales_job)

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 72, Finished, Available, Finished)

Sales count : 2


In [71]:
# Number of employees whose JobDesc is "Manager"
manager_job = df.filter(df["JobDesc"] == "Manager").count()
print("Manager count :", manager_job)

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 73, Finished, Available, Finished)

Manager count : 3


#### GroupBy

In [80]:
# Number of employees per JobDesc (result columns: JobDesc, count)
group_job = df.groupBy("JobDesc").count()
group_job.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 82, Finished, Available, Finished)

+--------+-----+
| JobDesc|count|
+--------+-----+
|   Admin|    2|
| Manager|    3|
|   Sales|    2|
|      HR|    1|
|Engineer|    1|
|     CEO|    1|
+--------+-----+



#### isin and contains

In [84]:
# Rows whose Ename contains the substring "Aflal" (all columns kept)
one_name = df.filter(df["Ename"].contains("Aflal"))
one_name.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 86, Finished, Available, Finished)

+------+-----+-------+------+
|Emp_id|Ename|JobDesc|Salary|
+------+-----+-------+------+
|     8|Aflal|Manager| 17000|
+------+-----+-------+------+



In [81]:
# Rows whose Ename exactly matches any of the listed names (all columns kept)
names = ["Aravind", "Seenu"]
two_names = df.filter(df["Ename"].isin(names))
two_names.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 83, Finished, Available, Finished)

+------+-------+-------+------+
|Emp_id|  Ename|JobDesc|Salary|
+------+-------+-------+------+
|     4|Aravind|  Sales| 20000|
|     5|  Seenu|     HR|  5000|
+------+-------+-------+------+



#### Add a column to an existing DataFrame

In [96]:

# Derive a "City" column from JobDesc; any job without a mapping gets "unknown".
# withColumn returns a new DataFrame, so `df` itself is left unchanged.
job_desc = col("JobDesc")
city_for_job = (
    when(job_desc == "Admin", "Chennai")
    .when(job_desc == "Manager", "Trichy")
    .when(job_desc == "Sales", "Madurai")
    .when(job_desc == "CEO", "Sirkali")
    .when(job_desc == "Engineer", "Chidambaram")
    .when(job_desc == "HR", "Mayiladuthurai")
    .otherwise("unknown")
)
city_added = df.withColumn("City", city_for_job)
city_added.show()

StatementMeta(, eee6c19f-2d0d-4a7f-b3d7-e3ebfeba16a4, 98, Finished, Available, Finished)

+------+-------+--------+------+--------------+
|Emp_id|  Ename| JobDesc|Salary|          City|
+------+-------+--------+------+--------------+
|     1|    Ram|   Admin| 10000|       Chennai|
|     2| Fareed| Manager| 15000|        Trichy|
|     3|Sharukh|   Sales| 10000|       Madurai|
|     4|Aravind|   Sales| 20000|       Madurai|
|     5|  Seenu|      HR|  5000|Mayiladuthurai|
|     6|   Somu| Manager| 25000|        Trichy|
|     7|Mohamed|Engineer| 30000|   Chidambaram|
|     8|  Aflal| Manager| 17000|        Trichy|
|     9|  Abdul|     CEO| 50000|       Sirkali|
|    10|   Zara|   Admin| 28000|       Chennai|
+------+-------+--------+------+--------------+

