Row-wise slicing of a PySpark DataFrame using four different methods.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Start a Spark session, or reuse the one that is already running.
spark = SparkSession.builder.getOrCreate()

# Explicit schema: one string column and two integer columns, none nullable.
schema = StructType([
    StructField("Product", StringType(), nullable=False),
    StructField("Price", IntegerType(), nullable=False),
    StructField("Quantity", IntegerType(), nullable=False),
])

# Column-oriented sample data; each list holds the values of one column.
data = {
    'Product': ['Apple', 'Banana', 'Orange', 'Grapes'],
    'Price': [210, 110, 250, 300],
    'Quantity': [10, 15, 8, 12],
}

# Transpose the column lists into row tuples, taking the columns in schema
# field order so each tuple lines up with the schema.
rows = list(zip(*(data[name] for name in schema.fieldNames())))
df = spark.createDataFrame(rows, schema=schema)


In [0]:
# Method 1: Using filter()
# Keep only the rows whose Price is at least 200. The condition is a Column
# expression obtained through attribute access on the DataFrame (df.Price).
filtered_df = df.filter(df.Price >= 200)
# NOTE(review): display() is not defined in this notebook; presumably it is
# the hosting environment's built-in table renderer -- confirm.
display(filtered_df)


Product,Price,Quantity
Apple,210,10
Orange,250,8
Grapes,300,12


In [0]:
# Method 2: Using where()
# Same predicate as the filter() call, written with where(); PySpark documents
# where() as an alias for filter(), so the result is the same set of rows.
filtered_df = df.where(df.Price >= 200)
display(filtered_df)

Product,Price,Quantity
Apple,210,10
Orange,250,8
Grapes,300,12


In [0]:
# Method 3: Using a SQL expression string
# The predicate is passed to filter() as a string instead of a Column object;
# the column is referred to by name inside the string.
filtered_df = df.filter("Price >= 200")
display(filtered_df)

Product,Price,Quantity
Apple,210,10
Orange,250,8
Grapes,300,12


In [0]:
# Method 4: Using a Column expression built with col()
# col("Price") refers to the column by name without going through the
# DataFrame object; comparing it with 200 yields the predicate for filter().
from pyspark.sql.functions import col

filtered_df = df.filter(col("Price") >= 200)
display(filtered_df)

Product,Price,Quantity
Apple,210,10
Orange,250,8
Grapes,300,12


In [0]:
# Using random split
# Split the rows into two DataFrames with weights 0.20 and 0.80.
# The weights drive random sampling, so the resulting sizes are only
# approximately 20% / 80% -- on a tiny DataFrame one side may even be empty.
# A fixed seed is passed so that re-running the notebook does not draw a
# different split each time (without it, every run picks a new random seed).
df1, df2 = df.randomSplit([0.20, 0.80], seed=42)

df1.show()

+-------+-----+--------+
|Product|Price|Quantity|
+-------+-----+--------+
| Orange|250.0|       8|
+-------+-----+--------+



In [0]:
# Show the second part of the random split (the one created with weight 0.80).
df2.show()

+-------+-----+--------+
|Product|Price|Quantity|
+-------+-----+--------+
|  Apple|210.0|      10|
| Banana|110.0|      15|
| Grapes|300.0|      12|
+-------+-----+--------+



In [0]:
# Column-wise slicing: project the DataFrame onto the listed columns only.
selected_columns = ["Product", "Price"]
df.select(*selected_columns).show()

+-------+-----+
|Product|Price|
+-------+-----+
|  Apple|  210|
| Banana|  110|
| Orange|  250|
| Grapes|  300|
+-------+-----+



Type casting and type conversion in PySpark

In [0]:
# Replace the Price column with a float-typed version of itself.
# withColumn() with an existing column name overwrites that column.
price_as_float = df["Price"].cast("float")
df = df.withColumn("Price", price_as_float)

df.show()

+-------+-----+--------+
|Product|Price|Quantity|
+-------+-----+--------+
|  Apple|210.0|      10|
| Banana|110.0|      15|
| Orange|250.0|       8|
| Grapes|300.0|      12|
+-------+-----+--------+



In [0]:
# Add a derived column: Amount = Price * Quantity, computed per row.
# The result goes into a new DataFrame; df itself is left unchanged.
amount = df['Price'] * df['Quantity']
new_df = df.withColumn("Amount", amount)
new_df.show()

+-------+-----+--------+------+
|Product|Price|Quantity|Amount|
+-------+-----+--------+------+
|  Apple|210.0|      10|2100.0|
| Banana|110.0|      15|1650.0|
| Orange|250.0|       8|2000.0|
| Grapes|300.0|      12|3600.0|
+-------+-----+--------+------+



In [0]:
# Find and drop duplicates from the attached CSV file.
# Step 1: load the file, treating its first line as the column header.
# No schema and no inferSchema option are supplied here.
csv_path = "dbfs:/FileStore/shared_uploads/pradeepti239@gmail.com/fellowship_data.csv"
df1 = spark.read.option("header", "true").csv(csv_path)

In [0]:
# Report rows that occur more than once: group on the listed columns, count
# each group, keep the groups with a count above 1, and list the most
# frequent ones first.
duplicate_key = ['Duration', 'Date', 'Pulse', 'Maxpulse', 'Calories']
(
    df1
    .groupBy(*duplicate_key)
    .count()
    .filter('count > 1')
    .orderBy('count', ascending=False)
    .show()
)

+--------+------------+-----+--------+--------+-----+
|Duration|        Date|Pulse|Maxpulse|Calories|count|
+--------+------------+-----+--------+--------+-----+
|      60|'2020/12/12'|  100|     120|   250.7|    2|
+--------+------------+-----+--------+--------+-----+



In [0]:
# Remove rows that are exact duplicates (no column subset is given, so all
# columns are compared), then print up to 50 of the remaining rows.
df1 = df1.drop_duplicates()
df1.show(n=50)

+--------+------------+-----+--------+--------+
|Duration|        Date|Pulse|Maxpulse|Calories|
+--------+------------+-----+--------+--------+
|      45|        null|  100|     119|   282.0|
|      60|'2020/12/25'|  102|     126|   334.5|
|      60|'2020/12/15'|   98|     123|   275.0|
|      60|'2020/12/21'|  108|     131|   364.2|
|      60|    20201226|  100|     120|   250.0|
|      60|'2020/12/01'|  110|     130|   409.1|
|      45|'2020/12/24'|  105|     132|   246.0|
|      60|'2020/12/16'|   98|     120|   215.2|
|      60|'2020/12/17'|  100|     120|   300.0|
|      60|'2020/12/31'|   92|     115|   243.0|
|      60|'2020/12/11'|  103|     147|   329.3|
|      60|'2020/12/10'|   98|     124|   269.0|
|      60|'2020/12/02'|  117|     145|   479.0|
|     450|'2020/12/08'|  104|     134|   253.3|
|      60|'2020/12/12'|  100|     120|   250.7|
|      60|'2020/12/19'|  103|     123|   323.0|
|      45|'2020/12/04'|  109|     175|   282.4|
|      45|'2020/12/20'|   97|     125|  