In [None]:
# %pip installs into the running kernel's environment; the version is pinned
# so a fresh runtime reproduces the same PySpark release.
%pip install pyspark==3.5.1

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 4.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488490 sha256=db94203efb757e236fefa3648ada379abb5ea7dd0f2062558832a3d870436350
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, to_date, when, upper

class DataCleaningPipeline:
    """Chainable cleaning steps over a Spark DataFrame loaded from a CSV file.

    The working data lives in ``self.df``. Every cleaning method replaces
    ``self.df`` with a transformed DataFrame and returns ``self`` so calls can
    be chained; ``show_data`` and ``save_data`` are terminal steps that
    return ``None``.
    """

    # Default pattern used by ``validate_email`` (Java regex, anchored at both
    # ends): a non-empty local part, exactly one '@', and a domain containing
    # at least one dot, with no whitespace anywhere.
    EMAIL_PATTERN = r"^[^@\s]+@[^@\s]+\.[^@\s]+$"

    def __init__(self, spark: SparkSession, data_path: str):
        """Read the CSV at ``data_path`` using its first line as the header.

        Args:
            spark: Session used to read the file.
            data_path: Location of the CSV file.

        No schema is supplied and schema inference is not enabled, so the
        columns are loaded with Spark's CSV defaults (string-typed).
        """
        self.spark = spark
        self.df: DataFrame = self.spark.read.option("header", "true").csv(data_path)

    @staticmethod
    def _as_list(columns):
        """Wrap a bare column name in a list; pass any other value through."""
        # Iterating a plain string would yield single characters, not a column name.
        return [columns] if isinstance(columns, str) else columns

    def handle_missing_values(self, column: str) -> "DataCleaningPipeline":
        """Replace empty strings in ``column`` with null.

        Only the exact empty string ``''`` is treated as missing; every other
        value is kept unchanged.

        Args:
            column: Name of the column to clean.

        Returns:
            This pipeline, for chaining.
        """
        value = col(column)
        self.df = self.df.withColumn(column, when(value == '', None).otherwise(value))
        return self

    def standardize_gender(self, column: str = "Gender") -> "DataCleaningPipeline":
        """Standardize a gender column to 'M' and 'F'.

        Matching is case-insensitive: "MALE"/"M" become 'M' and "FEMALE"/"F"
        become 'F'. Any other value (including null) becomes null.

        Args:
            column: Name of the column to standardize.

        Returns:
            This pipeline, for chaining.
        """
        normalized = upper(col(column))
        self.df = self.df.withColumn(
            column,
            when(normalized.isin("MALE", "M"), "M")
            .when(normalized.isin("FEMALE", "F"), "F")
            .otherwise(None),
        )
        return self

    def standardize_dates(self, columns: list, date_format: str = "yyyy-MM-dd") -> "DataCleaningPipeline":
        """Convert the given columns to date type.

        Args:
            columns: Column names to convert (a single name is also accepted).
            date_format: Pattern passed to ``to_date`` for parsing the values.

        Returns:
            This pipeline, for chaining.
        """
        for column in self._as_list(columns):
            self.df = self.df.withColumn(column, to_date(col(column), date_format))
        return self

    def remove_duplicates(self, columns: list) -> "DataCleaningPipeline":
        """Remove rows that are duplicates with respect to ``columns``.

        One row is kept per distinct combination of the given columns; which
        of the duplicate rows survives is decided by Spark, not by this class.

        Args:
            columns: Column names forming the uniqueness key (a single name is
                also accepted).

        Returns:
            This pipeline, for chaining.
        """
        self.df = self.df.dropDuplicates(self._as_list(columns))
        return self

    def validate_email(self, column: str = "Email", pattern: str = EMAIL_PATTERN) -> "DataCleaningPipeline":
        """Replace values in ``column`` that do not look like an email with null.

        A value is kept only if it matches ``pattern``. Merely containing an
        '@' is not sufficient with the default pattern (e.g. "@" or "a@" are
        rejected). Null values stay null.

        Args:
            column: Name of the column holding email addresses.
            pattern: Java regular expression a valid address must match.

        Returns:
            This pipeline, for chaining.
        """
        email = col(column)
        self.df = self.df.withColumn(column, when(email.rlike(pattern), email).otherwise(None))
        return self

    def drop_nulls(self, columns: list = None) -> "DataCleaningPipeline":
        """Drop rows containing nulls.

        Args:
            columns: Columns to check. When omitted (or empty), a row is
                dropped if it has a null in any column.

        Returns:
            This pipeline, for chaining.
        """
        if columns:
            self.df = self.df.dropna(subset=self._as_list(columns))
        else:
            self.df = self.df.dropna()
        return self

    def show_data(self, num_rows: int = 20) -> None:
        """Print the first ``num_rows`` rows of the current data."""
        self.df.show(num_rows)

    def save_data(self, output_path: str) -> None:
        """Write the current data to ``output_path`` as Parquet.

        Any existing data at ``output_path`` is overwritten.
        """
        self.df.write.mode("overwrite").parquet(output_path)

# Input CSV and output location for the cleaned data
INPUT_CSV_PATH = "/content/Assignment Task _ Dataset - Sheet1.csv"
OUTPUT_PATH = "/content/data"

# Initialize Spark session
spark = SparkSession.builder.appName("DataCleaning").getOrCreate()

# Initialize the pipeline with the data path
pipeline = DataCleaningPipeline(spark, INPUT_CSV_PATH)

# Run the data cleaning steps.
# De-duplication runs last on purpose: if it ran before validation and null
# dropping, the single row kept for an ID could be one that is then dropped,
# losing that ID even though another of its duplicates was valid.
(
    pipeline
    .handle_missing_values("Name")
    .handle_missing_values("Email")
    .standardize_gender()
    .standardize_dates(["Join_Date", "Last_Login"])
    .validate_email()
    .drop_nulls()
    .remove_duplicates(["ID"])
)

# Show cleaned data
pipeline.show_data()

# Save the cleaned data
pipeline.save_data(OUTPUT_PATH)
