In [0]:
# Write a PySpark program that selects every 3rd row of the dataset.

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the Spark session used by every DataFrame operation in this notebook.
sparky = (
    SparkSession.builder
    .appName('3rd row')
    .getOrCreate()
)

In [0]:
from pyspark.sql.types import *

In [0]:
# Explicit schema for the employee records; all three columns are nullable.
schema = (
    StructType()
    .add('emp_id', IntegerType(), True)
    .add('name', StringType(), True)
    .add('salary', IntegerType(), True)
)

In [0]:
# Sample employee records as (emp_id, name, salary) tuples, in display order.
data = [
    (1001, "John Doe", 50000),
    (2001, "Jane Smith", 60000),
    (1003, "Michael Johnson", 75000),
    (4000, "Emily Davis", 55000),
    (1005, "Robert Brown", 70000),
    (6000, "Emma Wilson", 80000),
    (1700, "James Taylor", 65000),
    (8000, "Olivia Martinez", 72000),
    (2900, "William Anderson", 68000),
    (3310, "Sophia Garcia", 67000),
]

In [0]:
# Build the DataFrame from the sample records with the explicit schema,
# then print it to confirm what was loaded.
df = sparky.createDataFrame(data, schema)
df.show()

+------+----------------+------+
|emp_id|            name|salary|
+------+----------------+------+
|  1001|        John Doe| 50000|
|  2001|      Jane Smith| 60000|
|  1003| Michael Johnson| 75000|
|  4000|     Emily Davis| 55000|
|  1005|    Robert Brown| 70000|
|  6000|     Emma Wilson| 80000|
|  1700|    James Taylor| 65000|
|  8000| Olivia Martinez| 72000|
|  2900|William Anderson| 68000|
|  3310|   Sophia Garcia| 67000|
+------+----------------+------+



In [0]:
# Approach: number the rows with the row_number() window function, then filter on that number.
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number,monotonically_increasing_id

In [0]:
# Attach a sequential 'row_number' column to df. The window is ordered by
# monotonically_increasing_id(), which serves as the sort key for numbering.
# NOTE(review): the window has no partitionBy, so Spark presumably has to
# number all rows in a single partition; fine for this small sample, but
# confirm before reusing on large data.
window_spec = Window.orderBy(monotonically_increasing_id())
df = df.withColumn(
    'row_number',
    row_number().over(window_spec),
)
df.show()

+------+----------------+------+----------+
|emp_id|            name|salary|row_number|
+------+----------------+------+----------+
|  1001|        John Doe| 50000|         1|
|  2001|      Jane Smith| 60000|         2|
|  1003| Michael Johnson| 75000|         3|
|  4000|     Emily Davis| 55000|         4|
|  1005|    Robert Brown| 70000|         5|
|  6000|     Emma Wilson| 80000|         6|
|  1700|    James Taylor| 65000|         7|
|  8000| Olivia Martinez| 72000|         8|
|  2900|William Anderson| 68000|         9|
|  3310|   Sophia Garcia| 67000|        10|
+------+----------------+------+----------+



In [0]:
# Keep only the rows whose row_number is a multiple of 3 (3rd, 6th, 9th, ...).
df1 = df.where(df['row_number'] % 3 == 0)

In [0]:
# Print the filtered DataFrame held in df1.
df1.show()

+------+----------------+------+----------+
|emp_id|            name|salary|row_number|
+------+----------------+------+----------+
|  1003| Michael Johnson| 75000|         3|
|  6000|     Emma Wilson| 80000|         6|
|  2900|William Anderson| 68000|         9|
+------+----------------+------+----------+

