
https://www.machinelearningplus.com/pyspark/pyspark-exercises-101-pyspark-exercises-for-data-analysis/

In [0]:
import pyspark
from pyspark.sql import SparkSession

In [0]:
spark=SparkSession.builder.master('local[*]').appName('test').getOrCreate()

In [0]:
df=spark.createDataFrame([('Alice', 1), ('Bob', 2), ('Charlie', 3)], ['Name', 'Value'])
df.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [0]:
from pyspark.sql.functions import lit
df =df.withColumn('III', lit('USA'))
df.show()

+-------+-----+---+
|   Name|Value|III|
+-------+-----+---+
|  Alice|    1|USA|
|    Bob|    2|USA|
|Charlie|    3|USA|
+-------+-----+---+




How to combine many lists to form a PySpark DataFrame?


In [0]:
SampleDepartment = [("Finance",20),("Accounts", 60),("Marketing",30),("Sales",40),("IT",50)]
rdd = spark.sparkContext.parallelize(SampleDepartment)
df=rdd.toDF()
df.show()
df.collect()

+---------+---+
|       _1| _2|
+---------+---+
|  Finance| 20|
| Accounts| 60|
|Marketing| 30|
|    Sales| 40|
|       IT| 50|
+---------+---+

Out[16]: [Row(_1='Finance', _2=20),
 Row(_1='Accounts', _2=60),
 Row(_1='Marketing', _2=30),
 Row(_1='Sales', _2=40),
 Row(_1='IT', _2=50)]

4. How to get the items of list A not present in list B?


In [0]:
rdd1 = spark.sparkContext.parallelize([1,2,3,4,5])
rdd2 = spark.sparkContext.parallelize([4,5,6,7,8])
rdd1.subtract(rdd2).collect()

Out[19]: [1, 2, 3]


5. How to get the items not common to both list A and list B?


In [0]:
rdd1_2 = rdd1.subtract(rdd2)
print(rdd1_2.collect())
rdd2_1 = rdd2.subtract(rdd1)
print(rdd2_1.collect())

rdd_not_common = rdd1_2.union(rdd2_1)
rdd_not_common.collect()

[1, 2, 3]
[6, 7, 8]
Out[24]: [1, 2, 3, 6, 7, 8]

8. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?


In [0]:
from pyspark.sql import Row
from pyspark.sql.functions import col, when

# Sample data
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]

# create DataFrame
df = spark.createDataFrame(data)

# show DataFrame
df.show()



top2_jobs = df.groupBy('job').count().orderBy('count', ascending=False).limit(2).select('job').rdd.flatMap(lambda x:x).collect()
top2_jobs

df.withColumn('job', when(col('job').isin(top2_jobs), col('job')).otherwise('Other'))
df.show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|   Doctor|
+----+---------+

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|   Doctor|
+----+---------+



9. How to Drop rows with NA values specific to a particular column?


In [0]:
df = spark.createDataFrame([
("A", 1, None),
("B", None, "123" ),
("B", 3, "456"),
("D", None, None),
], ["Name", "Value", "id"])

df.show()

df.dropna(subset=['Value', 'id']).show()

# df.show()

+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|null|
|   B| null| 123|
|   B|    3| 456|
|   D| null|null|
+----+-----+----+

+----+-----+---+
|Name|Value| id|
+----+-----+---+
|   B|    3|456|
+----+-----+---+



In [0]:
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])

# old column names
old_names = ["col1", "col2", "col3"]

# new column names
new_names = ["new_col1", "new_col2", "new_col3"]


for old_name, new_name in zip(old_names, new_names):
    print(old_name, new_name)
    df = df.withColumnRenamed(old_name, new_name)

df.show()

col1 new_col1
col2 new_col2
col3 new_col3
+--------+--------+--------+
|new_col1|new_col2|new_col3|
+--------+--------+--------+
|       1|       2|       3|
|       4|       5|       6|
+--------+--------+--------+



13. How to find the numbers that are multiples of 3 from a column?


In [0]:
from pyspark.sql.functions import rand, when

# Generate a DataFrame with a single column "id" with 10 rows
df = spark.range(10)

# Generate a random float between 0 and 1, scale and shift it to get a random integer between 1 and 10
df = df.withColumn("random", ((rand(seed=42) * 10) + 1).cast("int"))

# Show the DataFrame
df.show()
df = df.withColumn('is_multiple_of_3', when(col('random')%3 ==0, 1).otherwise(0))

df.show()

+---+------+
| id|random|
+---+------+
|  0|     7|
|  1|     9|
|  2|     8|
|  3|     8|
|  4|     3|
|  5|     1|
|  6|     7|
|  7|     4|
|  8|     5|
|  9|     1|
+---+------+

+---+------+----------------+
| id|random|is_multiple_of_3|
+---+------+----------------+
|  0|     7|               0|
|  1|     9|               1|
|  2|     8|               0|
|  3|     8|               0|
|  4|     3|               1|
|  5|     1|               0|
|  6|     7|               0|
|  7|     4|               0|
|  8|     5|               0|
|  9|     1|               0|
+---+------+----------------+




14. How to extract items at given positions from a column?


In [0]:
from pyspark.sql.functions import rand

# Generate a DataFrame with a single column "id" with 10 rows
df = spark.range(10)

# Generate a random float between 0 and 1, scale and shift it to get a random integer between 1 and 10
df = df.withColumn("random", ((rand(seed=42) * 10) + 1).cast("int"))

# Show the DataFrame
df.show()

pos = [0, 4, 8, 5]



+---+------+
| id|random|
+---+------+
|  0|     7|
|  1|     9|
|  2|     8|
|  3|     8|
|  4|     3|
|  5|     1|
|  6|     7|
|  7|     4|
|  8|     5|
|  9|     1|
+---+------+



In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

from pyspark.sql.functions import row_number, monotonically_increasing_id

windows = Window.orderBy(monotonically_increasing_id())

df = df.withColumn('index', row_number().over(windows) - 1)

df.show()

df_filtered = df.filter(df.index.isin(pos))
df_filtered.show()

+---+------+-----+
| id|random|index|
+---+------+-----+
|  0|     7|    0|
|  1|     9|    1|
|  2|     8|    2|
|  3|     8|    3|
|  4|     3|    4|
|  5|     1|    5|
|  6|     7|    6|
|  7|     4|    7|
|  8|     5|    8|
|  9|     1|    9|
+---+------+-----+

+---+------+-----+
| id|random|index|
+---+------+-----+
|  0|     7|    0|
|  4|     3|    4|
|  5|     1|    5|
|  8|     5|    8|
+---+------+-----+

