In [1]:

# Sample rows in which several fields are missing (Python None -> Spark NULL).
data = [
    (1, "Alice", 29, "New York"),
    (2, "Bob", None, "San Francisco"),
    (3, None, 35, None),
    (4, "David", None, "Seattle"),
    (5, "Eve", 30, None),
]

columns = ["id", "name", "age", "city"]

# Column names are supplied as the schema; column types are left to Spark.
df = spark.createDataFrame(data, schema=columns)

# Baseline: the untouched frame, NULLs included.
print("Original DataFrame:")
df.show()

# how="any" (the default): a row is removed as soon as one column is NULL.
df_drop_any = df.na.drop(how="any")

print("DataFrame after dropping rows with any null values:")
df_drop_any.show()

# Restrict the NULL check to "age" and "city"; NULLs elsewhere are ignored.
df_drop_specific = df.na.drop(subset=["age", "city"])

print("DataFrame after dropping rows with nulls in specific columns (age, city):")
df_drop_specific.show()

# how="all": a row is removed only when every column is NULL.
# None of the five sample rows is entirely NULL, so nothing is dropped here.
df_drop_all_nulls = df.na.drop(how="all")

print("DataFrame after dropping rows where all columns are null:")
df_drop_all_nulls.show()

Original DataFrame:


                                                                                

+---+-----+----+-------------+
| id| name| age|         city|
+---+-----+----+-------------+
|  1|Alice|  29|     New York|
|  2|  Bob|NULL|San Francisco|
|  3| NULL|  35|         NULL|
|  4|David|NULL|      Seattle|
|  5|  Eve|  30|         NULL|
+---+-----+----+-------------+

DataFrame after dropping rows with any null values:
+---+-----+---+--------+
| id| name|age|    city|
+---+-----+---+--------+
|  1|Alice| 29|New York|
+---+-----+---+--------+

DataFrame after dropping rows with nulls in specific columns (age, city):
+---+-----+---+--------+
| id| name|age|    city|
+---+-----+---+--------+
|  1|Alice| 29|New York|
+---+-----+---+--------+

DataFrame after dropping rows where all columns are null:
+---+-----+----+-------------+
| id| name| age|         city|
+---+-----+----+-------------+
|  1|Alice|  29|     New York|
|  2|  Bob|NULL|San Francisco|
|  3| NULL|  35|         NULL|
|  4|David|NULL|      Seattle|
|  5|  Eve|  30|         NULL|
+---+-----+----+-------------+



In [29]:
columns = ["id", "name", "age", "city"]

# Same sample rows as before, plus a final row in which every field is None
# so that dropna(how="all") has something to remove.
data = [
    (1, "Alice", 29, "New York"),
    (2, "Bob", None, "San Francisco"),
    (3, None, 35, None),
    (4, "David", None, "Seattle"),
    (5, "Eve", 30, None),
    (None, None, None, None),
]

# Rebinds `df`; every later cell works on this six-row frame.
df = spark.createDataFrame(data, schema=columns)


In [31]:
# Display the rebuilt frame, which now includes the all-NULL sixth row.
df.show()

+----+-----+----+-------------+
|  id| name| age|         city|
+----+-----+----+-------------+
|   1|Alice|  29|     New York|
|   2|  Bob|NULL|San Francisco|
|   3| NULL|  35|         NULL|
|   4|David|NULL|      Seattle|
|   5|  Eve|  30|         NULL|
|NULL| NULL|NULL|         NULL|
+----+-----+----+-------------+



In [33]:
# Default behaviour spelled out: drop every row that has at least one NULL.
df.na.drop(how="any").show()

+---+-----+---+--------+
| id| name|age|    city|
+---+-----+---+--------+
|  1|Alice| 29|New York|
+---+-----+---+--------+



In [35]:
# Only "name" is inspected; rows with NULLs in other columns are kept.
df.na.drop(subset=["name"]).show()

+---+-----+----+-------------+
| id| name| age|         city|
+---+-----+----+-------------+
|  1|Alice|  29|     New York|
|  2|  Bob|NULL|San Francisco|
|  4|David|NULL|      Seattle|
|  5|  Eve|  30|         NULL|
+---+-----+----+-------------+



In [43]:
# Remove only rows in which every column is NULL (here: the last row).
df.dropna(how="all").show()

+---+-----+----+-------------+
| id| name| age|         city|
+---+-----+----+-------------+
|  1|Alice|  29|     New York|
|  2|  Bob|NULL|San Francisco|
|  3| NULL|  35|         NULL|
|  4|David|NULL|      Seattle|
|  5|  Eve|  30|         NULL|
+---+-----+----+-------------+



In [73]:
# Per-column replacement values for NULLs, keyed by column name.
# Note: "age" is given 1.7, yet the displayed result shows 1 for the filled
# rows -- the fractional part does not survive in this integer-typed column.
df.fillna(
    {
        "age": 1.7,
        "name": "NA",
        "city": "NA",
        "id": -1,
    }
).show()

+---+-----+---+-------------+
| id| name|age|         city|
+---+-----+---+-------------+
|  1|Alice| 29|     New York|
|  2|  Bob|  1|San Francisco|
|  3|   NA| 35|           NA|
|  4|David|  1|      Seattle|
|  5|  Eve| 30|           NA|
| -1|   NA|  1|           NA|
+---+-----+---+-------------+



In [79]:
# Two chained fills: in the output the number lands in the numeric columns
# (id, age) and the string lands in the string columns (name, city).
(
    df.fillna(17)
    .fillna("NA")
    .show()
)


+---+-----+---+-------------+
| id| name|age|         city|
+---+-----+---+-------------+
|  1|Alice| 29|     New York|
|  2|  Bob| 17|San Francisco|
|  3|   NA| 35|           NA|
|  4|David| 17|      Seattle|
|  5|  Eve| 30|           NA|
| 17|   NA| 17|           NA|
+---+-----+---+-------------+



In [37]:
# Derive both name lists in this cell so it runs on a fresh kernel; previously
# it relied on `oldcols` / `newcols` being defined by a cell further down.
oldcols = df.columns
newcols = [f"{name}_x" for name in oldcols]

print(oldcols)
print(newcols)

# zip() is lazy: printing it shows only the iterator object, not the pairs.
print(zip(oldcols, newcols))

# Iterating the zip yields one (old, new) name pair per column.
for i, j in zip(oldcols, newcols):
    print(f"old col is {i}  and new col is {j}")

['id', 'name', 'age', 'city']
['id_x', 'name_x', 'age_x', 'city_x']
<zip object at 0x146f7b680>
old col is id  and new col is id_x
old col is name  and new col is name_x
old col is age  and new col is age_x
old col is city  and new col is city_x


In [41]:
# Current column names, and the same names with an "_x" suffix appended.
oldcols = df.columns
newcols = [name + "_x" for name in oldcols]

newcols


['id_x', 'name_x', 'age_x', 'city_x']

In [43]:
# Pair each existing column with its new name and select it under that alias.
# No action is triggered; the cell just displays the resulting schema.
df.select(*[df[old].alias(new) for old, new in zip(oldcols, newcols)])


DataFrame[id_x: bigint, name_x: string, age_x: bigint, city_x: string]

In [23]:

# Same rename in one step: suffix every column with "_x" and show the rows.
df.select(*[df[name].alias(name + "_x") for name in df.columns]).show()


+----+------+-----+-------------+
|id_x|name_x|age_x|       city_x|
+----+------+-----+-------------+
|   1| Alice|   29|     New York|
|   2|   Bob| NULL|San Francisco|
|   3|  NULL|   35|         NULL|
|   4| David| NULL|      Seattle|
|   5|   Eve|   30|         NULL|
+----+------+-----+-------------+



In [7]:
# The aliased Column expressions on their own (nothing is evaluated by Spark).
[df[name].alias(name + "_x") for name in df.columns]

[Column<'id AS id_x'>,
 Column<'name AS name_x'>,
 Column<'age AS age_x'>,
 Column<'city AS city_x'>]