In [0]:
from pyspark.sql.session import SparkSession
# Reuse the already-active SparkSession if there is one, otherwise build a new one.
# NOTE(review): the read cells in this notebook reference `spark`, not `spark1` —
# presumably the notebook runtime predefines `spark`; confirm before relying on either name.
spark1= SparkSession.builder.getOrCreate()
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

1)Create Catalog
2)Create Schema (Database)
3)Create Volume

In [0]:
%sql
-- Create the storage hierarchy from the outside in: the catalog first, then the
-- schema inside that catalog, then the volume inside that schema.
-- IF NOT EXISTS makes each statement a no-op when the object is already present,
-- so the cell can be re-run.
CREATE CATALOG IF NOT EXISTS telecom_catalog_assign;
CREATE SCHEMA IF NOT EXISTS telecom_catalog_assign.landing_zone;
CREATE VOLUME IF NOT EXISTS telecom_catalog_assign.landing_zone.landing_vol;

In [0]:
# Create the landing folders inside the volume with dbutils.fs.mkdirs.
# Tower data is organised per region: this notebook writes to tower/region1/
# and its multi-path read lists both tower/region1/ and tower/region2/.
# Nothing else creates region2, and a read that lists a non-existent path
# fails, so both region folders are created here up front.
# mkdirs creates any missing parent folders and can be re-run safely.
dbutils.fs.mkdirs("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/")
dbutils.fs.mkdirs("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/")
dbutils.fs.mkdirs("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/")
dbutils.fs.mkdirs("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/")
dbutils.fs.mkdirs("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/")


- > a. Volume vs DBFS/FileStore
- > _DBFS/FileStore is for convenience and temporary files, while Volumes are for secure, governed, production-grade data storage._

- **Volumes**
- Governed by Unity Catalog
- Fine-grained access control (catalog/schema/volume)
- Full auditing and lineage
- Secure and compliant (GDPR, HIPAA, SOC2)
- Recommended for production systems

- **DBFS / FileStore**
- Not governed by Unity Catalog
- Limited access control
- Minimal auditing
- Meant for temporary or non-regulated data
- Not suitable for production use

b. Why production teams prefer Volumes for regulated data
- Provide strong security and access control
- Enable auditing and compliance reporting
- Support regulatory standards (GDPR, HIPAA, SOC2)
- Allow environment isolation (dev / qa / prod)
- Backed by secure cloud storage (S3, ADLS, GCS)
- Reduce risk of data leaks and unauthorized access

Data files to use in this use case:

dbutils.fs.put(path, contents, overwrite)

## DATA

In [0]:
# Raw sample payloads, one adjacent string literal per file line.
# Every line, including the last, ends with a newline.

# Customer records: comma-separated, no header row.
# Row 105 has an empty name and row 106 has the non-numeric age "abc".
customer_csv = (
    "101,Arun,31,Chennai,PREPAID\n"
    "102,Meera,45,Bangalore,POSTPAID\n"
    "103,Irfan,29,Hyderabad,PREPAID\n"
    "104,Raj,52,Mumbai,POSTPAID\n"
    "105,,27,Delhi,PREPAID\n"
    "106,Sneha,abc,Pune,PREPAID\n"
)

# Usage records: tab-separated, first line is the header.
usage_tsv = (
    "customer_id\tvoice_mins\tdata_mb\tsms_count\n"
    "101\t320\t1500\t20\n"
    "102\t120\t4000\t5\n"
    "103\t540\t600\t52\n"
    "104\t45\t200\t2\n"
    "105\t0\t0\t0\n"
)

# Tower log events for region1: pipe-separated, first line is the header.
tower_logs_region1 = (
    "event_id|customer_id|tower_id|signal_strength|timestamp\n"
    "5001|101|TWR01|-80|2025-01-10 10:21:54\n"
    "5004|104|TWR05|-75|2025-01-10 11:01:12\n"
)


### STEP 1: Write raw data to Volume

dbutils.fs.put(path, content, overwrite=True)
- A Databricks filesystem utility
- Writes plain text content to storage
- Runs on driver only
- No parallel execution
- No schema
- No data validation
- Fast for small files

In [0]:
# Destination file for each raw payload inside the landing volume.
customer_target = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv"
usage_target = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage.tsv"
tower_target = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/tower_region1.txt"

# Write each payload as plain text; the third argument is the overwrite flag,
# so re-running the cell replaces the existing files.
dbutils.fs.put(customer_target, customer_csv, True)
dbutils.fs.put(usage_target, usage_tsv, True)
dbutils.fs.put(tower_target, tower_logs_region1, True)


### STEP 2: Define Manual Schemas
- StructField(column_name, data_type, nullable)
- True → column can contain NULL values
- False → column must NOT contain NULL

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Manual schemas, built field by field with StructType.add(name, type, nullable).
# Every column is declared nullable.

# Customer file: age is declared as a string because the sample data
# contains the non-numeric value "abc" in that column.
customer_schema = (
    StructType()
    .add("cust_id", IntegerType(), True)
    .add("cust_name", StringType(), True)
    .add("age", StringType(), True)
    .add("city", StringType(), True)
    .add("plan_type", StringType(), True)
)

# Usage file: four integer columns.
usage_schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("voice_mins", IntegerType(), True)
    .add("data_mb", IntegerType(), True)
    .add("sms_count", IntegerType(), True)
)

# Tower log file: the timestamp is kept as a plain string.
tower_schema = (
    StructType()
    .add("event_id", IntegerType(), True)
    .add("customer_id", IntegerType(), True)
    .add("tower_id", StringType(), True)
    .add("signal_strength", IntegerType(), True)
    .add("timestamp", StringType(), True)
)


### STEP 3: Create DataFrames
- This code reads a CSV file from a Databricks Volume using a manually defined schema and delimiter, treating all rows as data.

In [0]:
# Load the header-less, comma-separated customer file using the manual schema.
# The chained calls are wrapped in parentheses so the statement can span
# several lines: a bare line break after `spark.read` ends the statement, and
# the indented `.schema(...)` line that follows is then a syntax error.
customer_df = (
    spark.read
    .schema(customer_schema)
    .option("header", "false")
    .option("sep", ",")
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv")
)

# Usage data: tab-separated file whose first line is a header row.
usage_df = (
    spark.read
    .schema(usage_schema)
    .options(header="true", sep="\t")
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage.tsv")
)

# Region1 tower log: pipe-separated file whose first line is a header row.
tower_df = (
    spark.read
    .schema(tower_schema)
    .options(header="true", sep="|")
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/tower_region1.txt")
)


Validate

In [0]:
# Customer frame: print the rows, then the column names and types.
customer_df.show()
customer_df.printSchema()

# Usage and tower frames: print the rows only, in that order.
for frame in (usage_df, tower_df):
    frame.show()

"/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv"

###  Directory Read Use Cases

### Path Glob Filter (pathGlobFilter)
✔ Reads
- Only .csv files
- Only from one directory

In [0]:
# Read only the files matching *.csv that sit directly in region1.
#
# Two things are set explicitly so the read behaves as intended:
# - sep="|": the tower files are pipe-delimited; with the default comma
#   separator each row would be loaded as a single column.
# - schema=tower_schema: the only file this notebook writes into region1 is
#   tower_region1.txt, so the *.csv filter can match no files at all. Without
#   a schema Spark then has nothing to infer from and the read fails; with a
#   schema it returns an empty DataFrame that has the expected columns.
tower_df_glob = (
    spark.read
    .schema(tower_schema)
    .option("header", "true")
    .option("sep", "|")
    .option("pathGlobFilter", "*.csv")
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/")
)

### Multiple Paths Input

In [0]:
# Read all files from the listed region folders into one DataFrame.
# Every listed path must exist, otherwise the read fails.
# The tower files are pipe-delimited, so the separator is set explicitly;
# with the default comma separator each row would be loaded as a single column.
tower_df_multi = (
    spark.read
    .option("header", "true")
    .option("sep", "|")
    .csv([
        "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/",
        "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/"
    ])
)

tower_df_multi.show()

### Recursive Lookup (recursiveFileLookup) ->  Read all files inside all subfolders automatically.

In [0]:
# Read every file under tower/, descending into all sub-folders.
# The tower files are pipe-delimited, so the separator is set explicitly;
# with the default comma separator each row would be loaded as a single column.
tower_df_recursive = (
    spark.read
    .option("header", "true")
    .option("sep", "|")
    .option("recursiveFileLookup", "true")
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/")
)

tower_df_recursive.show()


Glob filter → pattern-based read

Multiple paths → controlled read

Recursive lookup → full directory scan

### Case 1: header=False, inferSchema=False

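Related option: `samplingRatio` — the fraction of rows Spark samples when inferring the schema (only relevant when `inferSchema=True`).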

In [0]:
# Case 1: the first line is treated as data and no types are inferred.
customer_df1 = (
    spark.read
    .options(header=False, inferSchema=False, sep=",")
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv")
)

# Show the rows, then the resulting column names and types.
customer_df1.show()
customer_df1.printSchema()


### Case 2: header=True, inferSchema=True

In [0]:
# Case 2: the first line is treated as column names and types are inferred.
# The customer.csv written by this notebook has no header row, so with
# header=True its first record is consumed as the column names.
customer_df2 = (
    spark.read
    .options(header=True, inferSchema=True, sep=",")
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv")
)

# Show the rows, then the resulting column names and types.
customer_df2.show()
customer_df2.printSchema()


| Option                              | Behavior                                        |
| ----------------------------------- | ----------------------------------------------- |
| header=False                        | Spark treats **first row as data**              |
| header=True                         | Spark treats **first row as column names**      |
| inferSchema=False                   | All columns are **string**                      |
| inferSchema=True                    | Spark tries to **guess column types**           |
| inferSchema=True + bad data (`abc`) | Column becomes **string** to handle mixed types |


### 3️⃣ How Spark handles "abc" in age
- age column has mostly integers
- One value is "abc"
- inferSchema=True → Spark cannot cast to integer
- Result → Column type is string (to accommodate all values)

✅ Summary Notes
- Use header=True when the CSV contains column names.
- Use inferSchema=True to automatically detect types.
- If a column has mixed types, Spark picks string.
- header=False + inferSchema=False → safe fallback for raw ingestion; all data is string.


### Customer Data – Rename Columns Using toDF
- toDF() allows renaming columns easily.
- All columns are string by default.

In [0]:
# Customer sample payload (comma-separated, no header row), one literal per line.
customer_csv = (
    "101,Arun,31,Chennai,PREPAID\n"
    "102,Meera,45,Bangalore,POSTPAID\n"
    "103,Irfan,29,Hyderabad,PREPAID\n"
    "104,Raj,52,Mumbai,POSTPAID\n"
    "105,,27,Delhi,PREPAID\n"
    "106,Sneha,abc,Pune,PREPAID\n"
)

# Read the customer file from the volume without a header and without type
# inference, then assign the column names in a single toDF call.
customer_columns = ["cust_id", "cust_name", "age", "city", "plan_type"]
customer_df = (
    spark.read
    .options(header=False, inferSchema=False, sep=",")
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv")
    .toDF(*customer_columns)
)

customer_df.show()
customer_df.printSchema()

### Usage Data – Apply Columns & Datatypes Using the schema Option
- Using schema ensures proper datatypes.
- header=True tells Spark the first row is a header, so it is skipped as data; the column names and types come from the supplied schema.

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType

# Usage sample payload (tab-separated, first line is the header), one literal per line.
usage_tsv = (
    "customer_id\tvoice_mins\tdata_mb\tsms_count\n"
    "101\t320\t1500\t20\n"
    "102\t120\t4000\t5\n"
    "103\t540\t600\t52\n"
    "104\t45\t200\t2\n"
    "105\t0\t0\t0\n"
)

# All four usage columns are nullable integers, so the schema is generated
# from the ordered list of column names.
usage_columns = ("customer_id", "voice_mins", "data_mb", "sms_count")
usage_schema = StructType(
    [StructField(column_name, IntegerType(), True) for column_name in usage_columns]
)

# Read the usage TSV from the volume, applying the schema above.
usage_df = (
    spark.read
    .schema(usage_schema)
    .options(sep="\t", header=True)
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage.tsv")
)

usage_df.show()
usage_df.printSchema()


In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

# Tower log schema, built field by field; every column is nullable.
# The timestamp column is declared as a string here; TimestampType() is the
# alternative if a parsed timestamp is wanted.
tower_schema = (
    StructType()
    .add("event_id", IntegerType(), True)
    .add("customer_id", IntegerType(), True)
    .add("tower_id", StringType(), True)
    .add("signal_strength", IntegerType(), True)
    .add("timestamp", StringType(), True)
)

# Read the pipe-delimited region1 file (first row is the header) with the
# manual schema applied.
tower_df = (
    spark.read
    .schema(tower_schema)
    .options(sep="|", header=True)
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/tower_region1.txt")
)

tower_df.show()
tower_df.printSchema()



| Dataset  | Method       | Notes                               |
| -------- | ------------ | ----------------------------------- |
| Customer | `toDF()`     | Rename columns, types remain string |
| Usage    | `.schema()`  | Column names + datatypes explicitly |
| Towers   | `StructType` | Column names + detailed datatypes   |


### PySpark Code to Create DataFrames

In [0]:
# Rebuild the customer DataFrame from the header-less, comma-separated file,
# applying the manual customer schema.
customer_df = (
    spark.read
    .schema(customer_schema)
    .options(header="false", sep=",")
    .csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer.csv")
)

### PySpark .write Operations