## Liquid Clustering Demo – Part 2
### No need to specify the clustering columns; use CLUSTER BY AUTO instead
#### Step 1: Enable it with ALTER TABLE
#### Step 2: Run the OPTIMIZE command

In [0]:
# Show the contents of the NYC yellow-taxi dataset directory on DBFS.
display(
    dbutils.fs.ls("dbfs:/databricks-datasets/nyctaxi/tables/nyctaxi_yellow")
)

In [0]:
# Location of the NYC yellow-taxi sample dataset on DBFS.
base_path = "dbfs:/databricks-datasets/nyctaxi/tables/nyctaxi_yellow"

# Directory listing for that location.
files = dbutils.fs.ls(base_path)

# Keep only the entries whose file name contains 'part-' (the data files).
part_files = []
for entry in files:
    if "part-" in entry.name:
        part_files.append(entry.path)

# Restrict the demo to the first 10 data files in listing order.
first_10_files = part_files[0:10]

print("Files being read:")
for selected_path in first_10_files:
    print(selected_path)

# Load just those files as Parquet into a single DataFrame; `df` is used by
# the cells that follow.
df = spark.read.load(first_10_files, format="parquet")

# Render a preview of the loaded data.
display(df)


In [0]:
%sql
-- Inspect the metadata of the source dataset, addressed directly by its path as a Delta table.
DESCRIBE EXTENDED delta.`dbfs:/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`

In [0]:
%sql

-- Start from a clean slate so the demo can be re-run.
DROP TABLE IF EXISTS demo_taxi_200_files;

-- Empty Delta table with the taxi-trip columns used in this demo.
CREATE TABLE demo_taxi_200_files (
  vendor_id STRING,
  trip_distance DOUBLE,
  dropoff_longitude DOUBLE,
  pickup_latitude DOUBLE,
  tolls_amount DOUBLE,
  tip_amount DOUBLE,
  pickup_longitude DOUBLE,
  passenger_count INT,
  store_and_fwd_flag STRING,
  extra DOUBLE,
  dropoff_datetime TIMESTAMP,
  rate_code_id INT,
  total_amount DOUBLE,
  mta_tax DOUBLE,
  payment_type STRING,
  fare_amount DOUBLE,
  pickup_datetime TIMESTAMP,
  dropoff_latitude DOUBLE
)
USING DELTA
-- Optimized writes and auto-compaction are switched off
-- (presumably so the many small files written later are not merged automatically).
TBLPROPERTIES (
  'delta.autoOptimize.optimizeWrite' = 'false',
  'delta.autoOptimize.autoCompact' = 'false'
);


In [0]:

# Preview the first rows of the DataFrame.
# `df.show()` prints to stdout and returns None, so `display(df.show())`
# handed None to display(). Pass a DataFrame instead; limit(20) mirrors the
# default row count of show().
display(df.limit(20))

In [0]:
# Split the data into 200 partitions and overwrite the demo table with it,
# so the table is written from 200 separate partitions.
(
    df.repartition(200)
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("demo_taxi_200_files")
)

In [0]:
%sql
-- Browse the rows that were just written to the demo table.
SELECT *
FROM demo_taxi_200_files

In [0]:
%sql
-- Total number of rows in the demo table.
SELECT count(*)
FROM demo_taxi_200_files

In [0]:
%sql
-- Table-level details of the demo table before clustering is enabled.
DESCRIBE DETAIL demo_taxi_200_files

In [0]:
%sql
-- Baseline: count trips longer than 100 (trip_distance units) before liquid clustering is enabled.
SELECT count(*)
FROM demo_taxi_200_files
WHERE trip_distance > 100;

### Enable liquid clustering for this table using CLUSTER BY AUTO

In [0]:
%sql
-- Turn on liquid clustering and let Databricks choose the clustering keys.
-- NOTE(review): automatic key selection may have platform prerequisites
-- (e.g. managed tables / predictive optimization) — confirm for this workspace.
ALTER TABLE demo_taxi_200_files CLUSTER BY AUTO


In [0]:
%sql
-- Review the table history after the ALTER TABLE ... CLUSTER BY AUTO statement.
DESCRIBE HISTORY demo_taxi_200_files

In [0]:
%sql
-- Same long-trip count as the baseline, now with clustering enabled but before OPTIMIZE has run.
SELECT count(*)
FROM demo_taxi_200_files
WHERE trip_distance > 100

In [0]:
%sql
-- Per data file: smallest and largest trip_distance it contains, before OPTIMIZE.
-- One output row per underlying file, ordered by the file's minimum value.
SELECT
  min(trip_distance),
  max(trip_distance),
  _metadata.file_name
FROM demo_taxi_200_files
GROUP BY _metadata.file_name
ORDER BY min(trip_distance);

In [0]:
%sql
-- Step 2: run OPTIMIZE on the table now that clustering is enabled.
-- NOTE(review): Databricks documents OPTIMIZE ... FULL for forcing a recluster of
-- previously written records — confirm whether plain OPTIMIZE is sufficient here.
OPTIMIZE demo_taxi_200_files
    


In [0]:
%sql
-- Review the table history again after the OPTIMIZE run.
DESCRIBE HISTORY demo_taxi_200_files

In [0]:
%sql
-- Per data file: smallest and largest trip_distance it contains, after OPTIMIZE.
-- Same query as the one run before OPTIMIZE, so the two results can be compared.
SELECT
  min(trip_distance),
  max(trip_distance),
  _metadata.file_name
FROM demo_taxi_200_files
GROUP BY _metadata.file_name
ORDER BY min(trip_distance);

In [0]:
%sql
-- Same long-trip count as the baseline, re-run after OPTIMIZE for comparison.
SELECT count(*)
FROM demo_taxi_200_files
WHERE trip_distance > 100

## Disabling liquid clustering

In [0]:
%sql
-- Remove the clustering configuration from the table.
ALTER TABLE demo_taxi_200_files CLUSTER BY NONE
    


In [0]:
%sql
-- Table-level details after CLUSTER BY NONE, to check the table's clustering settings.
DESCRIBE DETAIL demo_taxi_200_files