#### 2. Programmatically find a couple of data patterns by applying the EDA steps below (File: logistics_source1)
1. Apply inferSchema and toDF to create a DF and analyse the actual data.
2. Analyse the schema, data types, columns, etc.
3. Analyse the duplicate records count and summary of the dataframe.

In [0]:
# ============================================================
# PySpark EDA: logistics data
# ============================================================

# Step 1 - load the source file through the CSV reader.
# The first line is consumed as a header and column types are inferred
# from the data; toDF() then assigns the five column names used by the
# rest of this analysis, replacing whatever names the header supplied.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1.txt")
    .toDF("Ship_id", "fname", "lname", "age", "role")
)

# ============================================================
# 2. Render the DataFrame contents
# ============================================================
# `display` is supplied by the notebook environment (it is not defined in
# this file); how many rows it renders is decided by that environment.
display(df)

# ============================================================
# 3. Schema as a tree: column name, data type, nullability
# ============================================================
print("SCHEMA DETAILS:")
df.printSchema()

# ============================================================
# 4. Data types as a list of (column name, type name) pairs
# ============================================================
print("\nDATA TYPES:", df.dtypes, sep="\n")

# ============================================================
# 5. Column names only
# ============================================================
print("\nCOLUMN NAMES:", df.columns, sep="\n")

# ============================================================
# 6. Check for Duplicate Records
#    (a) Ship_id values carried by more than one row
#    (b) number of rows that are exact copies of another row
# ============================================================
print("\nDUPLICATE CHECK - Ship_ids appearing more than once:")
df.groupBy("Ship_id").count().filter("count > 1").show()

# A repeated Ship_id only proves a key collision; the rows sharing it may
# still differ in other columns. The number of redundant records is the
# total row count minus the number of distinct rows (all columns compared).
duplicate_record_count = df.count() - df.distinct().count()
print(f"Duplicate records (all columns identical): {duplicate_record_count}")

# ============================================================
# 7. Summary statistics per column
# ============================================================
# The statistics are listed explicitly; they are the same set, in the same
# order, that summary() computes when it is called without arguments.
print("\nDATAFRAME SUMMARY:")
df.summary("count", "mean", "stddev", "min", "25%", "50%", "75%", "max").show()



# ============================================================
# BONUS: total row count and per-column null counts
# ============================================================

# NOTE(review): importing `sum` here rebinds the name to Spark's aggregate
# function, hiding Python's builtin `sum` for any code that runs after this
# point. Left unchanged because later code may rely on these names.
from pyspark.sql.functions import sum, col

# Number of rows in the DataFrame
print("Total Rows:", df.count())

# One output row: for every column, isNull() is cast to 0/1 and summed,
# giving the number of null entries under the original column name.
print("\nNull Value Count by Column:")
df.select(
    *(sum(col(name).isNull().cast("int")).alias(name) for name in df.columns)
).show()

In [0]:
# ============================================================
# 1. LOAD BOTH FILES
# ============================================================

# master_v1 <- logistics_source1: header row consumed, types inferred,
# then the five columns are renamed positionally.
master_v1 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source1.txt")
    .toDF("Ship_id", "fname", "lname", "age", "role")
)

# master_v2 <- logistics_source2: same five columns plus two extra ones.
# NOTE(review): "vhicle_type" looks like a misspelling of "vehicle_type";
# kept as-is because the column name is part of the DataFrame's schema and
# other code may reference it.
master_v2 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/lakehouse1/dbread/read_volume/logistics/logistics_source2.txt")
    .toDF("Ship_id", "fname", "lname", "age", "role", "location", "vhicle_type")
)

# Preview of each frame: a three-line banner followed by the first 5 rows.
print("=" * 60, "MASTER_V1 - First 5 rows:", "=" * 60, sep="\n")
master_v1.show(5)

# The leading "\n" puts a blank line between the two previews.
print("\n" + "=" * 60, "MASTER_V2 - First 5 rows:", "=" * 60, sep="\n")
master_v2.show(5)


In [0]:

# ============================================================
# SHIPMENT IDs PRESENT IN BOTH DATASETS
# ============================================================

# De-duplicated Ship_id column of each master frame
v1_ids, v2_ids = (
    frame.select("Ship_id").distinct() for frame in (master_v1, master_v2)
)

# Ship_ids that occur in both frames (set intersection)
common_ids = v1_ids.intersect(v2_ids)

# Report how many ids are shared, then print them with show()
print("Common Shipment IDs:", common_ids.count())
common_ids.show()

