#### 2. Programmatically find a couple of data patterns by applying the EDA steps below (File: logistics_source1)
1. Apply inferSchema and toDF to create a DF and analyse the actual data.
2. Analyse the schema, datatypes, columns etc.,
3. Analyse the duplicate records count and summary of the dataframe.

In [0]:
# ============================================================
# PYSPARK EDA - Logistics Data Analysis
# ============================================================

# 1. Read the CSV and build the DataFrame; toDF renames the five columns
#    positionally to the working names used in the rest of this cell.
df = spark.read.csv(
    "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1.txt",
    header=True,           # First row is column names
    inferSchema=True       # Automatically detect data types
).toDF("Ship_id", "fname", "lname", "age", "role")

# ============================================================
# 2. Display the actual data
# ============================================================
display(df)

# ============================================================
# 3. View the Schema (column names and their data types)
# ============================================================
print("SCHEMA DETAILS:")
df.printSchema()

# ============================================================
# 4. Show Data Types of each column
# ============================================================
print("\nDATA TYPES:")
print(df.dtypes)

# ============================================================
# 5. Show all column names
# ============================================================
print("\nCOLUMN NAMES:")
print(df.columns)

# ============================================================
# 6. Check for Duplicate Records
# ============================================================
# 6a. Ship_id values that occur on more than one row (key-level repeats).
print("\nDUPLICATE CHECK - Ship_ids appearing more than once:")
df.groupBy("Ship_id").count().filter("count > 1").show()

# 6b. The grouping above only lists repeated keys; it does not say how many
#     whole records are duplicated. Report that as total rows minus
#     distinct rows, i.e. the number of rows that are exact copies.
duplicate_record_count = df.count() - df.distinct().count()
print(f"Duplicate record count (fully identical rows): {duplicate_record_count}")

# ============================================================
# 7. Show Summary Statistics (count, mean, stddev, min, max)
# ============================================================
print("\nDATAFRAME SUMMARY:")
df.summary().show()



# ============================================================
# BONUS: Additional useful checks
# ============================================================

from pyspark.sql.functions import sum, col

# Total number of rows in the DataFrame
print("Total Rows:", df.count())

# Per-column null tally: each isNull() flag is cast to 0/1 and added up.
# `sum` here is the pyspark.sql.functions aggregate imported above,
# not Python's builtin.
print("\nNull Value Count by Column:")
df.select(
    *[
        sum(col(column_name).isNull().cast("int")).alias(column_name)
        for column_name in df.columns
    ]
).show()

In [0]:
# ============================================================
# 1. LOAD BOTH FILES
# ============================================================

# master_v1 <- logistics_source1: header row consumed, types inferred,
# then the five columns are renamed positionally.
master_v1 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1.txt")
    .toDF("Ship_id", "fname", "lname", "age", "role")
)

# master_v2 <- logistics_source2: same reader settings, seven columns.
# NOTE(review): "vhicle_type" looks like a typo for "vehicle_type"; it is
# left unchanged here because it is a runtime column name.
master_v2 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source2.txt")
    .toDF("Ship_id", "fname", "lname", "age", "role", "location", "vhicle_type")
)

# Preview five rows of each frame under a banner.
print("=" * 60, "MASTER_V1 - First 5 rows:", "=" * 60, sep="\n")
master_v1.show(5)

# The leading empty string reproduces the blank line before the banner.
print("", "=" * 60, "MASTER_V2 - First 5 rows:", "=" * 60, sep="\n")
master_v2.show(5)


In [0]:

# ============================================================
# FIND COMMON SHIPMENT IDs BETWEEN TWO DATASETS
# ============================================================

# De-duplicated Ship_id column from each dataset
v1_ids = master_v1.select("Ship_id").dropDuplicates()
v2_ids = master_v2.select("Ship_id").dropDuplicates()

# Ship_ids present in both datasets (set intersection)
common_ids = v1_ids.intersect(v2_ids)

# Report how many IDs overlap, then list them
print("Common Shipment IDs:", common_ids.count())
common_ids.show()



In [0]:
from pyspark.sql.functions import col

# Rows of master_v1 whose Ship_id is NOT made up entirely of digits.
# The pattern is anchored at both ends (^...$) and negated with ~ so that
# the rows shown here are the same ones the count cell below tallies;
# an un-negated "^[0-9]" would instead keep every ID that merely starts
# with a digit.
non_numeric = master_v1.filter(~col("Ship_id").rlike("^[0-9]+$"))
non_numeric.show()

If you want the COUNT (interview-friendly)

In [0]:
# Count the rows in each dataset whose Ship_id is not purely digits
# (pattern anchored at both ends, then negated).
non_numeric_count = master_v1.filter(
    ~col("Ship_id").rlike("^[0-9]+$")
).count()

non_numeric_count2 = master_v2.filter(
    ~col("Ship_id").rlike("^[0-9]+$")
).count()


# Label each result with its dataset so the two lines of output can be
# told apart.
print("Non-numeric Ship_id count (master_v1):", non_numeric_count)
print("Non-numeric Ship_id count (master_v2):", non_numeric_count2)


Check for rows where age is not an integer

In [0]:
# Number of master_v1 rows whose age, viewed as text, is anything other
# than one or more digits.
invalid_age_count = master_v1.where(
    ~col("age").cast("string").rlike("^[0-9]+$")
).count()

print(f"Non-integer age count: {invalid_age_count}")


In [0]:
# Number of master_v2 rows whose age, viewed as text, is anything other
# than one or more digits.
invalid_age_count2 = master_v2.where(
    ~col("age").cast("string").rlike("^[0-9]+$")
).count()

print(f"Non-integer age count: {invalid_age_count2}")

### **b. Active Data Munging** File: logistics_source1 and logistics_source2

##### 1. Combining Data + Schema Merging (Structuring)
1. Read both files without enforcing schema
2. Align them into a single canonical schema: shipment_id,
first_name,
last_name,
age,
role,
hub_location,
vehicle_type,
data_source
3. Add data_source column with values as: system1, system2 in the respective dataframes

In [0]:
# ============================================================
# DATA INTEGRATION - Combining Two Sources with Schema Alignment
# ============================================================

from pyspark.sql.functions import lit, col

# ============================================================
# STEP 1: Read both files without enforcing schema
# ============================================================
# Only header=True is passed: no schema and no inferSchema, so column
# names come from each file's first row.

df_s1 = spark.read.csv(
    "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1.txt",
    header=True
)

df_s2 = spark.read.csv(
    "/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source2.txt",
    header=True
)

print("SOURCE 1 - Columns:")
print(df_s1.columns)
df_s1.show()

print("\nSOURCE 2 - Columns:")
print(df_s2.columns)
df_s2.show()

# ============================================================
# STEP 2: Align columns to canonical schema
# ============================================================
# Canonical order: shipment_id, first_name, last_name, age, role,
# hub_location, vehicle_type, data_source.
# NOTE(review): this assumes both file headers already use the names
# shipment_id, first_name, last_name, age, role - confirm against the
# column lists printed in STEP 1.
#
# Source 1 is selected with only the five shared columns, so hub_location
# and vehicle_type are added here as NULL placeholders. Doing it explicitly
# (rather than leaving it to the union) keeps data_source as the last
# column of the merged result, as the canonical schema requires.

df_s1_aligned = df_s1.select(
    col("shipment_id"),
    col("first_name"),
    col("last_name"),
    col("age"),
    col("role"),
    lit(None).cast("string").alias("hub_location"),
    lit(None).cast("string").alias("vehicle_type"),
)

df_s2_aligned = df_s2.select(
    col("shipment_id"),
    col("first_name"),
    col("last_name"),
    col("age"),
    col("role"),
    col("hub_location"),
    col("vehicle_type")
)

# ============================================================
# STEP 3: Add data_source column
# ============================================================
# Tag every row with the system it came from.

df_s1_with_source = df_s1_aligned.withColumn("data_source", lit("system1"))
df_s2_with_source = df_s2_aligned.withColumn("data_source", lit("system2"))

print("\nSOURCE 1 - With data_source:")
df_s1_with_source.show()

print("\nSOURCE 2 - With data_source:")
df_s2_with_source.show()

# ============================================================
# STEP 4: Merge both dataframes
# ============================================================
# Both frames now carry the same eight columns, so the union matches them
# by name and keeps the left frame's (canonical) column order.
# allowMissingColumns=True is retained only as a safety net.

df_merged = df_s1_with_source.unionByName(
    df_s2_with_source,
    allowMissingColumns=True
)

print("\n" + "="*60)
print("MERGED DATA - Final Schema")
print("="*60)
display(df_merged)

print(f"\nTotal rows: {df_merged.count()}")