In [24]:
!pip install pandas xlsxwriter

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [25]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (an already-running session is reused).
spark = SparkSession.builder.appName("records").getOrCreate()

In [27]:
# Remove any table left over from a previous run; IF EXISTS makes this a no-op on a clean catalog.
spark.sql("DROP TABLE IF EXISTS demo.nyc.taxis_100M_time")

DataFrame[]

In [28]:
import csv
import os
import re
# On-disk location of the table under test and of its metadata/data sub-directories.
iceberg_table_dir = "../warehouse/nyc/taxis_100M_time"
metadata_dir, data_dir = (
    f"{iceberg_table_dir}/{subdir}" for subdir in ("metadata", "data")
)

# Root directory holding the input files to ingest.
input_data_dir = "../input_data"

# Accumulators for the benchmark: per-operation records and the running row count.
analysis_info = list()
records_before_op = 0

def append_to_file(file_path, msg):
    """Append one record to a CSV log file, emitting a header row when needed.

    Args:
        file_path (str): Path of the CSV file. It is created if it does not exist.
        msg (dict): The record to write. Its keys become the header row (only
            written when the file is new or empty) and its values become the
            data row, both in the dict's iteration order.
    """
    # A header is required when the file is missing *or* exists but has no
    # content yet (e.g. it was touched beforehand).
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

    # newline="" is required by the csv module; without it the writer's own
    # "\r\n" terminator is translated again on Windows, producing blank rows.
    # Mode "a" creates the file when it is missing.
    with open(file_path, "a", newline="") as file:
        writer = csv.writer(file)

        if needs_header:
            writer.writerow(list(msg.keys()))

        writer.writerow(list(msg.values()))

def get_size(metadata_path=None, data_path=None):
    """Measure the on-disk footprint of the table's data and metadata files.

    Args:
        metadata_path (str, optional): Directory holding the metadata files.
            Defaults to the module-level ``metadata_dir``.
        data_path (str, optional): Directory holding the data files. Defaults
            to the module-level ``data_dir``.

    Returns:
        dict: Sizes in KB (bytes / 1024) under the keys ``data_dir_size``,
        ``metadata_size`` (``*.metadata.json``), ``snapshot_size``
        (``snap-*.avro``) and ``manifest_size`` (``*-m<digits>.avro``), in
        that order.

    Raises:
        OSError: If either directory cannot be listed.
    """
    metadata_path = metadata_dir if metadata_path is None else metadata_path
    data_path = data_dir if data_path is None else data_path

    manifest_pattern = re.compile(r".*-m\d+\.avro$")

    def _fail(error):
        # os.walk swallows listing errors by default; surface them instead so
        # a missing directory is not silently reported as size 0.
        raise error

    # Walk recursively: a data directory may contain sub-directories, and
    # os.path.getsize() on a directory returns the size of the directory
    # entry, not of the files inside it.
    data_dir_size = 0
    for root, _subdirs, filenames in os.walk(data_path, onerror=_fail):
        for filename in filenames:
            data_dir_size += os.path.getsize(os.path.join(root, filename)) / 1024  # Convert size to KB

    snap_avro_size = 0
    metadata_json_size = 0
    m_avro_size = 0

    # Classify every metadata file by its naming convention; unknown files are ignored.
    for name in os.listdir(metadata_path):
        file_path = os.path.join(metadata_path, name)
        if not os.path.isfile(file_path):
            continue
        file_size_kb = os.path.getsize(file_path) / 1024  # Convert size to KB

        if name.startswith("snap-") and name.endswith(".avro"):
            snap_avro_size += file_size_kb
        elif name.endswith(".metadata.json"):
            metadata_json_size += file_size_kb
        elif manifest_pattern.match(name):
            m_avro_size += file_size_kb

    return {
        "data_dir_size": data_dir_size,
        "metadata_size": metadata_json_size,
        "snapshot_size": snap_avro_size,
        "manifest_size": m_avro_size,
    }


In [29]:
from pyspark.sql.types import DoubleType, FloatType, LongType, StructType,StructField, StringType
# Table schema: every column is nullable; built from (name, Spark type) pairs.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("vendor_id", LongType()),
        ("trip_id", LongType()),
        ("trip_distance", FloatType()),
        ("fare_amount", DoubleType()),
        ("store_and_fwd_flag", StringType()),
    )
])

# Create the table from a DataFrame with zero rows so that only the schema is registered.
df = spark.createDataFrame([], schema)
df.writeTo("demo.nyc.taxis_100M_time").create()

In [30]:
# Read the newly created table back and display it as a sanity check on the schema.
df = spark.table("demo.nyc.taxis_100M_time")
df.show()

+---------+-------+-------------+-----------+------------------+
|vendor_id|trip_id|trip_distance|fare_amount|store_and_fwd_flag|
+---------+-------+-------------+-----------+------------------+
+---------+-------+-------------+-----------+------------------+



In [31]:
import time, csv
from pyspark.sql.functions import col, when
from pyspark.sql import functions as F
import os

# --- Benchmark configuration -------------------------------------------------
input_data_dir = "../input_data"
output_dir = "../output"
analysis_info = []          # one dict per timed operation (insert or update)
records_before_op = 0
total_insertion_time = 0  # Track total insertion time

file_type = input("Enter input file type csv or parquet? : ")
file_type = file_type.lower().strip()
if file_type not in ("csv", "parquet"):
    # Fail early with a clear message instead of an os.listdir() error below.
    raise ValueError(f"Unsupported input file type {file_type!r}; expected 'csv' or 'parquet'")

input_data_dir = os.path.join(input_data_dir, file_type)
# os.listdir() returns entries in unspecified order; sort for reproducible runs
# and skip hidden/marker entries (names starting with "." or "_").
input_files = sorted(
    name for name in os.listdir(input_data_dir)
    if not name.startswith((".", "_"))
)

# Start every run with a fresh analysis CSV; make sure its directory exists.
os.makedirs(output_dir, exist_ok=True)
analysis_file = os.path.join(output_dir, f"analysis_info_{file_type}.csv")
if os.path.exists(analysis_file):
    os.remove(analysis_file)

df = spark.table("demo.nyc.taxis_100M_time")
records_before_op = df.count()
# Number of decimal digits in the current row count; the update benchmark below
# is triggered each time this grows.
digits = len(str(records_before_op))

for file in input_files:
    print(f"Started with file={file}")
    file_path = os.path.join(input_data_dir, file)

    # --- Timed insert: read + count + append are all inside the measured span ---
    # perf_counter() is monotonic, so the duration cannot be skewed by clock adjustments.
    started = time.perf_counter()
    if file_type == "parquet":
        df = spark.read.parquet(file_path)
    else:
        # CSV columns arrive as strings; cast them to the table's column types.
        df = spark.read.csv(file_path, header=True)
        df = df.select(
            F.col("vendor_id").cast("long").alias("vendor_id"),
            F.col("trip_id").cast("long").alias("trip_id"),
            F.col("trip_distance").cast("float").alias("trip_distance"),
            F.col("fare_amount").cast("double").alias("fare_amount"),
            F.col("store_and_fwd_flag").cast("string").alias("store_and_fwd_flag")
        )

    rows = df.count()

    df.writeTo("demo.nyc.taxis_100M_time").append()
    elapsed = time.perf_counter() - started
    total_insertion_time += elapsed  # Accumulate insertion time

    details = get_size()
    details["time_taken"] = f"{elapsed:.2f} sec"
    details["Operation"] = f"Inserted {rows} records"
    details["records_after_op"] = records_before_op + rows

    records_before_op += rows

    append_to_file(analysis_file, details)
    analysis_info.append(details)

    print(f"Inserted {rows} records..")

    # Only run the update benchmark when the total row count gained a digit.
    current_digit = len(str(records_before_op))
    if current_digit <= digits:
        continue
    digits = current_digit

    # --- Timed updates: add 40 to fare_amount for up to 10 distinct vendors ---
    vendor_rows = (
        spark.table("demo.nyc.taxis_100M_time")
        .select("vendor_id")
        .distinct()
        .collect()[:10]
    )
    for vendor_row in vendor_rows:
        vendor_id = vendor_row[0]
        df = spark.table("demo.nyc.taxis_100M_time")

        started = time.perf_counter()
        updated_df = df.withColumn(
            "fare_amount",
            when(col("vendor_id") == vendor_id, col("fare_amount") + 40)
            .otherwise(col("fare_amount")),
        )

        updated_df.writeTo("demo.nyc.taxis_100M_time").overwritePartitions()

        elapsed = time.perf_counter() - started
        # Counted outside the timed span so it does not inflate the update time.
        rows = updated_df.filter(updated_df["vendor_id"] == vendor_id).count()

        details = get_size()
        details["time_taken"] = f"{elapsed:.2f} sec"
        details["Operation"] = f"Updated {rows} records"
        details["records_after_op"] = records_before_op

        append_to_file(analysis_file, details)
        analysis_info.append(details)

# Print total insertion time at the end
print(f"\nTotal insertion time: {total_insertion_time:.2f} sec")


Enter input file type csv or parquet? :  parquet


Started with file=records_1000000_part_100_1740547783.2989817.parquet


                                                                                

Inserted 1000000 records..


                                                                                

Started with file=records_1000000_part_10_1740547209.7968745.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_11_1740547215.9141135.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_12_1740547221.9808776.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_13_1740547228.0179105.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_14_1740547234.0888622.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_15_1740547240.20878.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_16_1740547246.3656824.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_17_1740547252.5549655.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_18_1740547258.8807647.parquet


                                                                                

Inserted 1000000 records..


                                                                                

Started with file=records_1000000_part_19_1740547265.0029325.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_1_1740547153.5197105.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_20_1740547271.2182305.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_21_1740547277.262875.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_22_1740547283.3299098.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_23_1740547289.5509975.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_24_1740547295.6357532.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_25_1740547301.863174.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_26_1740547307.9797058.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_27_1740547314.0961952.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_28_1740547320.4824963.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_29_1740547326.5237.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_2_1740547159.983474.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_30_1740547332.5923827.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_31_1740547338.623164.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_32_1740547344.6956196.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_33_1740547350.739232.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_34_1740547356.850658.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_35_1740547362.9836311.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_36_1740547369.1775753.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_37_1740547375.3506045.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_38_1740547381.595612.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_39_1740547387.6890054.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_3_1740547166.2537405.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_40_1740547393.8891764.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_41_1740547399.9262688.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_42_1740547406.0923812.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_43_1740547412.3387566.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_44_1740547418.4367056.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_45_1740547424.532087.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_46_1740547430.6064637.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_47_1740547436.929214.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_48_1740547443.5012379.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_49_1740547449.5087569.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_4_1740547172.5165799.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_50_1740547455.553979.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_51_1740547461.7229598.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_52_1740547467.8907857.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_53_1740547474.0810313.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_54_1740547480.1119976.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_55_1740547486.1806648.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_56_1740547492.2651308.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_57_1740547498.717969.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_58_1740547505.986954.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_59_1740547513.2172508.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_5_1740547178.553217.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_60_1740547523.6317384.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_61_1740547530.5293283.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_62_1740547537.3221047.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_63_1740547547.9841735.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_64_1740547554.6004264.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_65_1740547561.5953689.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_66_1740547567.8492243.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_67_1740547573.9968793.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_68_1740547580.025012.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_69_1740547586.1098204.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_6_1740547184.6246915.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_70_1740547592.3176243.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_71_1740547598.4390452.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_72_1740547604.560089.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_73_1740547610.772545.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_74_1740547616.9331264.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_75_1740547623.7795403.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_76_1740547630.7060952.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_77_1740547637.3890622.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_78_1740547644.4125934.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_79_1740547651.155154.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_7_1740547190.8141873.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_80_1740547658.0245886.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_81_1740547664.4655297.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_82_1740547670.8240905.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_83_1740547677.1278582.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_84_1740547683.4724002.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_85_1740547689.5082061.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_86_1740547695.7361305.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_87_1740547702.0348315.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_88_1740547708.0646222.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_89_1740547714.1614273.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_8_1740547197.1843672.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_90_1740547720.3530486.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_91_1740547726.4141378.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_92_1740547732.543917.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_93_1740547738.7075405.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_94_1740547744.8750415.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_95_1740547750.989129.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_96_1740547757.0304103.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_97_1740547763.2530928.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_98_1740547770.060633.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_99_1740547776.6405113.parquet


                                                                                

Inserted 1000000 records..
Started with file=records_1000000_part_9_1740547203.7631633.parquet


                                                                                

Inserted 1000000 records..


                                                                                


Total insertion time: 193.18 sec


## Perform operations

Enter input file type csv or parquet? :  parquet


Started with file=records_1000000_part_10_1740473176.1294348.parquet


                                                                                

inserted 1000000 records..


                                                                                

Started with file=records_1000000_part_1_1740473116.5725977.parquet


                                                                                

inserted 1000000 records..
Started with file=records_1000000_part_2_1740473123.6205893.parquet


                                                                                

inserted 1000000 records..
Started with file=records_1000000_part_3_1740473130.7518847.parquet


                                                                                

inserted 1000000 records..
Started with file=records_1000000_part_4_1740473137.6493435.parquet


                                                                                

inserted 1000000 records..
Started with file=records_1000000_part_5_1740473144.4021842.parquet


                                                                                

inserted 1000000 records..
Started with file=records_1000000_part_6_1740473151.2823489.parquet


                                                                                

inserted 1000000 records..
Started with file=records_1000000_part_7_1740473157.6888306.parquet


                                                                                

inserted 1000000 records..
Started with file=records_1000000_part_8_1740473163.7258036.parquet


                                                                                

inserted 1000000 records..
Started with file=records_1000000_part_9_1740473169.8142922.parquet


                                                                                

inserted 1000000 records..


                                                                                

In [8]:
# Report the path of the per-operation CSV log.
print(f"raw analysis text file created: {analysis_file}")

raw analysis text file created: ../output/analysis_info_parquet.csv


In [9]:
# Display the in-memory list of per-operation measurement dicts.
analysis_info

[{'data_dir_size': 0.5,
  'metadata_size': 1.0,
  'snapshot_size': 1.0,
  'manifest_size': 0.5,
  'time_taken': '5.029073238372803 sec',
  'Operation': 'Inserted 1000000 records',
  'records_after_op': 1000000},
 {'data_dir_size': 1.0,
  'metadata_size': 1.5,
  'snapshot_size': 1.5,
  'manifest_size': 1.5,
  'time_taken': '2.3605992794036865 sec',
  'Operation': 'Updated 9941 records',
  'records_after_op': 1000000},
 {'data_dir_size': 1.5,
  'metadata_size': 2.0,
  'snapshot_size': 2.0,
  'manifest_size': 2.5,
  'time_taken': '2.579869508743286 sec',
  'Operation': 'Updated 9886 records',
  'records_after_op': 1000000},
 {'data_dir_size': 2.0,
  'metadata_size': 2.5,
  'snapshot_size': 2.5,
  'manifest_size': 3.5,
  'time_taken': '2.046767234802246 sec',
  'Operation': 'Updated 10106 records',
  'records_after_op': 1000000},
 {'data_dir_size': 2.5,
  'metadata_size': 3.0,
  'snapshot_size': 3.0,
  'manifest_size': 4.5,
  'time_taken': '2.135364532470703 sec',
  'Operation': 'Updated 1

In [10]:
import pandas as pd

In [70]:
# Fallback export: when the CSV log is missing but measurements are still held
# in memory, write them out (pandas' default index column is included).
if not (os.path.exists(analysis_file) or not analysis_info):
    pd.DataFrame(analysis_info).to_csv(path_or_buf=analysis_file)

## Analysis file

In [71]:
import pandas as pd
file_type = "parquet"

# Load the raw per-operation log and give every measurement column a unit suffix.
# "Unnamed: 0" only exists when the CSV was written with a pandas index column.
df = pd.read_csv(analysis_file).rename(
    columns={
        "Unnamed: 0": "sr_no",
        "data_dir_size": "data_dir_size_kb",
        "metadata_size": "metadata_size_kb",
        "snapshot_size": "snapshot_size_kb",
        "manifest_size": "manifest_size_kb",
        "time_taken": "time_taken_sec",
    }
)

# Serial numbers are 1-based; derive them from the row index when absent.
if "sr_no" not in df.columns:
    df["sr_no"] = df.index
df["sr_no"] = df["sr_no"] + 1

# Total footprint in KB, plus the same figure in MB rounded to 3 decimals.
df['total_size_kb'] = (
    df['data_dir_size_kb']
    + df['metadata_size_kb']
    + df['snapshot_size_kb']
    + df['manifest_size_kb']
)
df['total_size_mb'] = df['total_size_kb'].apply(lambda kb: round(kb / 1024, 3))

# "time_taken" is stored as text such as "5.03 sec"; keep only the number.
df['time_taken_sec'] = df['time_taken_sec'].apply(
    lambda text: round(float(text.split()[0]), 3)
)

# Final column order for the report.
columns = [
    'sr_no', 'records_after_op', 'Operation', 'time_taken_sec',
    'data_dir_size_kb', 'metadata_size_kb', 'snapshot_size_kb',
    'manifest_size_kb', 'total_size_kb', 'total_size_mb',
]

df = df[columns]

# dump to excel file

def save_df_to_excel(df, output_file, record_digitwise_output_file):
    """
    Save the analysis DataFrame to two Excel workbooks.

    The first workbook gets one sheet per distinct ``records_after_op`` value.
    The second groups rows by the number of digits in ``records_after_op``
    (e.g. 20000 -> 5 digits) and gets one sheet per digit count; each sheet
    name also carries the ``records_after_op`` value of the group's last row.

    Args:
        df (pd.DataFrame): The DataFrame to save. Must contain a
            'records_after_op' column. It is not modified.
        output_file (str): Path of the per-record-count workbook.
        record_digitwise_output_file (str): Path of the per-digit-count workbook.
    """
    # Create a Pandas Excel writer using XlsxWriter as the engine
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        # One sheet per distinct record count, in order of first appearance.
        for value in df['records_after_op'].unique():
            subset_df = df[df['records_after_op'] == value]
            sheet_name = f"{value}_records"
            subset_df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"DataFrame saved to {output_file}")

    # Work on a derived frame so the caller's DataFrame does not silently gain
    # a 'digits' column.
    digitwise_df = df.assign(
        digits=df['records_after_op'].apply(lambda x: len(str(x)))
    )

    with pd.ExcelWriter(record_digitwise_output_file, engine='xlsxwriter') as writer:
        for value in digitwise_df['digits'].unique():
            subset_df = digitwise_df[digitwise_df['digits'] == value]
            # Take the last row of *this group*. Indexing the subset with the
            # full frame's length raised IndexError as soon as there was more
            # than one digit group.
            records = subset_df['records_after_op'].iloc[-1]
            sheet_name = f"{value}_digits_{records}_records"
            subset_df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Digitalwise DataFrame saved to {record_digitwise_output_file}")



# Define the output Excel file path
# Both workbook names embed the input type and the record count found in the
# last row of the analysis frame.
output_file = '../output/_analysis_details_{}_{}.xlsx'.format(
    file_type, df.loc[len(df) - 1, "records_after_op"]
)
record_digitwise_output_file = '../output/_digitwise_analysis_details_{}_{}.xlsx'.format(
    file_type, df.loc[len(df) - 1, "records_after_op"]
)

# Write the per-record-count and per-digit-count workbooks.
save_df_to_excel(
    df=df,
    output_file=output_file,
    record_digitwise_output_file=record_digitwise_output_file,
)


DataFrame saved to ../output/_analysis_details_parquet_1000000.xlsx
Digitalwise DataFrame saved to ../output/_digitwise_analysis_details_parquet_1000000.xlsx
