In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd

In [0]:
def analyze_dataframe(df):
    """Print a profiling summary of a DataFrame to stdout.

    The report contains, in this order: row count, column count, column
    names, distinct-value count per column, data type per column, null count
    and null percentage per column, and the duplicate-row summary.

    Args:
        df: The DataFrame to profile. Only DataFrame methods are used
            (``count``, ``columns``, ``select``, ``distinct``, ``schema``,
            ``filter``, ``groupBy``).

    Returns:
        None. Everything is printed.

    Notes:
        * Each statistic calls ``count()`` separately (one per column for the
          distinct counts and one per column for the null counts), so the
          cost grows with the number of columns.
        * For an empty DataFrame every null percentage is reported as 0.00%
          instead of dividing by zero.
    """
    # ANSI escape sequences: bold black for the section labels, then reset.
    bold_black = "\033[1;30m"
    reset = "\033[0m"

    column_names = df.columns

    # Number of rows
    row_count = df.count()
    print(f"{bold_black}\nNumber of rows:{reset} {row_count}")

    # Number of columns
    column_count = len(column_names)
    print(f"{bold_black}Number of columns:{reset} {column_count}")

    # Column names
    print(f"{bold_black}\nColumn names:{reset} {column_names}")

    # Distinct count in each column.
    # `name` (not `col`) is used as the loop variable so the imported
    # pyspark `col` function is not shadowed inside this function.
    distinct_counts = {
        name: df.select(name).distinct().count() for name in column_names
    }
    print(f"{bold_black}\nDistinct counts for each column:{reset}")
    for name, distinct_count in distinct_counts.items():
        print(f"{name}: {distinct_count}")

    # Data type of each column
    data_types = {name: df.schema[name].dataType for name in column_names}
    print(f"{bold_black}\nData types:{reset}")
    for name, dtype in data_types.items():
        print(f"{name}: {dtype}")

    # Null count and percentage for each column.
    # The count is computed once per column and reused for the percentage;
    # the percentage is defined as 0.0 when the DataFrame has no rows.
    null_info = {}
    for name in column_names:
        null_count = df.filter(df[name].isNull()).count()
        null_percentage = (null_count / row_count) * 100 if row_count else 0.0
        null_info[name] = (null_count, null_percentage)
    print(f"{bold_black}\nNull values count and % Null values:{reset}")
    for name, (null_count, null_percentage) in null_info.items():
        print(f"{name}: {null_count} ({null_percentage:.2f}%)")

    # Duplicate rows.
    # NOTE: this is the number of distinct row values that occur more than
    # once (groups with count > 1), not the number of surplus rows.
    print(f"{bold_black}\nDuplicate Data Details:{reset}")
    duplicate_count = df.groupBy(column_names).count().where('count > 1').count()
    if duplicate_count > 0:
        print(f"Duplicate rows count: {duplicate_count}")
    else:
        print("No duplicate rows found.")

In [0]:
def typecast_column(df, column_name, target_type):
    """Cast one column of ``df`` to the Spark type named by ``target_type``.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column to cast; the result replaces the
            column under the same name.
        target_type: One of 'int', 'long', 'float', 'double', 'short',
            'decimal', 'str', 'bool', 'date', 'timestamp', 'binary'.
            'decimal' always means ``DecimalType(10, 2)``.

    Returns:
        The DataFrame returned by ``withColumn`` with the cast applied.

    Raises:
        ValueError: If ``target_type`` is not one of the supported names.
    """
    # Short type names accepted by this helper -> Spark data type instances.
    supported_types = {
        'int': IntegerType(),
        'long': LongType(),
        'float': FloatType(),
        'double': DoubleType(),
        'short': ShortType(),
        'decimal': DecimalType(10, 2),
        'str': StringType(),
        'bool': BooleanType(),
        'date': DateType(),
        'timestamp': TimestampType(),
        'binary': BinaryType(),
    }

    spark_type = supported_types.get(target_type)
    if spark_type is None:
        raise ValueError(f"Unsupported target type: {target_type}")

    casted_column = col(column_name).cast(spark_type)
    return df.withColumn(column_name, casted_column)

In [0]:
def handling_null_values_drop(df):
    """Return ``df.dropna()``: the DataFrame without its null-containing rows.

    ``dropna`` is called with its default arguments, so every column is
    considered when deciding which rows to drop.
    """
    rows_without_nulls = df.dropna()
    return rows_without_nulls

In [0]:
def handling_null_values_mean(df):
    # Get the mean of each column
    mean_values = {col: df.select(mean(col)).collect()[0][0] for col, dtype in df.dtypes if dtype in ['int', 'double']}
    
    # Fill null values with the mean
    df_filled = df.fillna(mean_values)
    return df_filled

In [0]:
def strip_empty_space(df, columns):
    """Apply ``trim`` to each of the given columns.

    Args:
        df: DataFrame to process.
        columns: Iterable of column names; each column is replaced, under the
            same name, by ``trim(col(name))``.

    Returns:
        The DataFrame after all listed columns have been trimmed. ``df`` is
        returned as-is when ``columns`` is empty.
    """
    trimmed_df = df
    for name in columns:
        trimmed_expr = trim(col(name))
        trimmed_df = trimmed_df.withColumn(name, trimmed_expr)
    return trimmed_df

In [0]:
def remove_special_char(df, columns):
    """Remove special characters from the given columns.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted (replaced by the empty string) with ``regexp_replace``.

    Args:
        df: DataFrame to process.
        columns: Iterable of column names to clean; each column is replaced
            under the same name.

    Returns:
        The DataFrame after all listed columns have been cleaned.
    """
    # Raw string: "\s" is not a valid Python escape sequence, so the non-raw
    # form triggers an invalid-escape warning. The pattern text is unchanged.
    special_char_pattern = r'[^a-zA-Z0-9\s]'
    for column in columns:
        df = df.withColumn(column, regexp_replace(col(column), special_char_pattern, ''))
    return df

In [0]:
def handling_duplicates(df):
    """Return ``df.dropDuplicates()``: the DataFrame with duplicate rows removed.

    ``dropDuplicates`` is called without a column subset, so rows are
    compared on all columns.
    """
    deduplicated = df.dropDuplicates()
    return deduplicated


In [0]:
def change_case(df, column_names, operation):
    """Convert the given columns to upper or lower case.

    Args:
        df: DataFrame to process.
        column_names: Iterable of column names to convert; each column is
            replaced under the same name.
        operation: Either 'upper' or 'lower'.

    Returns:
        The DataFrame after all listed columns have been converted.

    Raises:
        ValueError: If ``operation`` is neither 'upper' nor 'lower'. The
            check runs before any column is touched, so an invalid operation
            is rejected even when ``column_names`` is empty.
    """
    # Pick the case function once, up front, instead of re-checking the
    # operation for every column.
    if operation == 'upper':
        convert_case = upper
    elif operation == 'lower':
        convert_case = lower
    else:
        raise ValueError("Operation must be 'upper' or 'lower'")

    for column_name in column_names:
        df = df.withColumn(column_name, convert_case(df[column_name]))

    return df


# Note: all functions are imported in this notebook.