In [6]:
!pip install tqdm pandas xlsxwriter

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
import random
import string
import pandas as pd
import os, time
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa

# Output schema: column names in the same order as the fields of each
# record tuple built by generate_records_chunk.
columns = [
    "vendor_id",
    "trip_id",
    "trip_distance",
    "fare_amount",
    "store_and_fwd_flag",
]

def generate_random_string(length=1):
    """Return ``length`` characters drawn at random from A-Z.

    Each character is picked independently with ``random.choice``, so the
    result is reproducible when the ``random`` module has been seeded.
    A ``length`` of 0 yields the empty string.
    """
    alphabet = string.ascii_uppercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def generate_records_chunk(n):
    """Build ``n`` random trip records.

    Returns a list of 5-tuples laid out as
    (vendor_id, trip_id, trip_distance, fare_amount, store_and_fwd_flag).
    The random draws for each record are made in exactly that field order.
    """
    rows = []
    for _ in range(n):
        vendor_id = random.randint(1, 100)
        trip_id = random.randint(1, 1000000)
        trip_distance = round(random.uniform(0.5, 50.0), 2)
        fare_amount = round(random.uniform(5.0, 500.0), 2)
        store_and_fwd_flag = generate_random_string()  # single uppercase letter
        rows.append(
            (vendor_id, trip_id, trip_distance, fare_amount, store_and_fwd_flag)
        )
    return rows

def save_records_to_file(df, extn, file_path, append=False):
    """Write a DataFrame to ``file_path`` as CSV or Parquet.

    Args:
        df: pandas DataFrame holding the records to write.
        extn: Output format, "csv" or "parquet". Case and surrounding
            whitespace are ignored.
        file_path: Destination file path.
        append: When True, add the rows to the data already stored in
            ``file_path``. If the file does not exist yet (or is empty) it
            is created from scratch, including the CSV header.

    Raises:
        ValueError: If ``extn`` is not "csv" or "parquet".
    """
    fmt = extn.lower().strip()

    # Appending is only meaningful when there is already data on disk;
    # otherwise fall back to a fresh write so CSV output still gets a header.
    has_existing_data = (
        append
        and os.path.isfile(file_path)
        and os.path.getsize(file_path) > 0
    )

    if fmt == "parquet":
        if has_existing_data:
            # A Parquet file cannot be extended in place: read it back,
            # concatenate and rewrite. Read errors are allowed to propagate
            # so a failed read never silently replaces the existing file
            # with just the new chunk.
            existing_df = pd.read_parquet(file_path)
            df = pd.concat([existing_df, df], ignore_index=True)
        df.to_parquet(file_path, index=False, engine='pyarrow')
    elif fmt == "csv":
        # Header is written exactly once: whenever the file is (re)created.
        mode = 'a' if has_existing_data else 'w'
        df.to_csv(file_path, index=False, mode=mode, header=not has_existing_data)
    else:
        raise ValueError(
            f"Unsupported extension {extn!r}: expected 'parquet' or 'csv'"
        )

def main(n, extn, records_per_chunk=1_000_000):
    """Generate ``n`` random records and write them as chunked part files.

    Records are produced ``records_per_chunk`` at a time and every chunk is
    written to its own file under ``../input_data/<extn>/``, named
    ``records_<rows>_part_<k>_<timestamp>.<extn>``.

    Args:
        n: Total number of records to generate. Values <= 0 produce no files.
        extn: Output format, "parquet" or "csv". Case and surrounding
            whitespace are ignored.
        records_per_chunk: Maximum number of records per part file.

    Raises:
        ValueError: If ``extn`` is not supported or ``records_per_chunk`` is
            not positive.
    """
    # Normalise once so the directory name and file suffix are always the
    # canonical lower-case extension, whatever the caller typed.
    extn = extn.lower().strip()
    if extn not in ("parquet", "csv"):
        raise ValueError("Extension should be either 'parquet' or 'csv'")
    if records_per_chunk <= 0:
        raise ValueError("records_per_chunk must be a positive integer")

    # Create output directory if not exists
    output_dir = f'../input_data/{extn}'
    os.makedirs(output_dir, exist_ok=True)

    # Placeholders: row count, 1-based part number, timestamp, extension
    file_path_template = os.path.join(output_dir, "records_{}_part_{}_{}.{}")

    # Ceiling division: the last chunk may be smaller than the others
    total_chunks = (n + records_per_chunk - 1) // records_per_chunk

    for chunk_idx in tqdm(range(total_chunks), desc="Generating and saving records"):
        start_idx = chunk_idx * records_per_chunk
        chunk_size = min(records_per_chunk, n - start_idx)

        # Generate the chunk of records
        records = generate_records_chunk(chunk_size)
        df = pd.DataFrame(records, columns=columns)

        # Each chunk has a unique path (part number + timestamp)
        file_path = file_path_template.format(chunk_size, chunk_idx + 1, time.time(), extn)

        # Every part is a brand-new file, so it is always written fresh;
        # appending here would drop the CSV header on parts 2+.
        save_records_to_file(df, extn, file_path, append=False)

        # Release the chunk before the next one is generated to cap peak memory
        del records, df

# Execution starts here
if __name__ == "__main__":
    number_of_records = int(input("Enter number_of_records: "))
    # Normalise the answer once so the value that passes validation is the
    # exact value handed to main() (and therefore used in output paths).
    extension = input("File extension (parquet or csv): ").lower().strip()

    if extension not in ("parquet", "csv"):
        # ValueError is still an Exception, so existing handlers keep working;
        # the message is carried by the exception itself.
        raise ValueError("Extension should be either 'parquet' or 'csv'")

    main(number_of_records, extension)

Enter number_of_records:  100000000
File extension (parquet or csv):  parquet


Generating and saving records: 100%|██████████| 100/100 [10:10<00:00,  6.10s/it]


In [8]:
# 3599920
# 3627882
# 2979431
# 2463931
# 3214369