In [1]:
!pip install tqdm pandas xlsxwriter

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import random
import string
import pandas as pd
import os, time
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa
from datetime import datetime, timedelta

# Output schema: 50 generic columns named extra_col_0 ... extra_col_49.
# The earlier taxi-style columns (vendor_id, trip_id, trip_distance,
# fare_amount, store_and_fwd_flag) are no longer part of the schema.
columns = ["extra_col_" + str(col_idx) for col_idx in range(50)]

def generate_random_string(length=5):
    """Return a string of `length` random uppercase ASCII letters (A-Z)."""
    alphabet = string.ascii_uppercase
    letters = []
    for _ in range(length):
        # One random.choice call per character, drawn in order.
        letters.append(random.choice(alphabet))
    return ''.join(letters)

def generate_random_date():
    """Return a random date between 3650 days (365 * 10) ago and today, inclusive.

    The result is a `datetime.date` object, not a formatted string.
    """
    window_days = 365 * 10
    earliest = datetime.today() - timedelta(days=window_days)
    # Single inclusive draw: 0 is the oldest day in the window, window_days is today.
    offset = random.randint(0, window_days)
    # .date() drops the time-of-day component.
    return (earliest + timedelta(days=offset)).date()

# def generate_random_date():
#     """Generate a random date within the last 10 years."""
#     start_date = datetime.today() - timedelta(days=365 * 10)
#     random_days = random.randint(0, 365 * 10)
#     return (start_date + timedelta(days=random_days)).strftime('%Y-%m-%d')

def generate_records_chunk(n, num_columns=50):
    """Generate `n` synthetic records with `num_columns` values each.

    The value type cycles with the column position (`position % 4`):
        0 -> 10-character uppercase string   (VARCHAR)
        1 -> integer in [1, 10000]            (INT)
        2 -> 15-character uppercase string   (STRING)
        3 -> date from generate_random_date() (DATE)

    Args:
        n: Number of records to generate.
        num_columns: Number of values per record. Defaults to 50, the width
            that was previously hard-coded; it must match the number of
            column names used when the records are loaded into a DataFrame.

    Returns:
        A list of `n` records. Each record is a list (not a tuple) of
        `num_columns` values.
    """
    def _random_value(position):
        # Pick the generator for this column position.
        kind = position % 4
        if kind == 0:
            return generate_random_string(10)
        if kind == 1:
            return random.randint(1, 10000)
        if kind == 2:
            return generate_random_string(15)
        return generate_random_date()

    return [
        [_random_value(position) for position in range(num_columns)]
        for _ in range(n)
    ]

def save_records_to_file(df, extn, file_path, append=False):
    """Write `df` to `file_path` as Parquet or CSV.

    Args:
        df: DataFrame to persist.
        extn: Output format, "parquet" or "csv". Case and surrounding
            whitespace are ignored.
        file_path: Destination file path.
        append: When True, add the rows to an existing file instead of
            replacing it. If the file does not exist yet (or, for CSV, is
            empty), this behaves like a normal write.

    Raises:
        ValueError: If `extn` is neither "parquet" nor "csv".
    """
    fmt = extn.lower().strip()

    if fmt == "parquet":
        to_write = df
        if append and os.path.exists(file_path):
            # Appending is done by reading the file back, concatenating and
            # rewriting it. A read failure propagates on purpose: falling back
            # to a plain write here would overwrite the rows already on disk.
            existing_df = pd.read_parquet(file_path)
            to_write = pd.concat([existing_df, df], ignore_index=True)
        to_write.to_parquet(file_path, index=False, engine='pyarrow')

    elif fmt == "csv":
        # Only skip the header when there is already content to append to;
        # otherwise the resulting file would have data rows but no header.
        has_content = os.path.exists(file_path) and os.path.getsize(file_path) > 0
        appending = append and has_content
        df.to_csv(
            file_path,
            index=False,
            mode='a' if appending else 'w',
            header=not appending,
        )

    else:
        # Previously an unsupported format was a silent no-op.
        raise ValueError(
            f"Unsupported extension {extn!r}: expected 'parquet' or 'csv'"
        )

def main(n, extn):
    """Generate `n` records and write them in chunks of up to one million rows.

    Each chunk is written to its own file under `../input_data/<extn>/`,
    named `records_<rows>_part_<part>_<timestamp>.<extn>`.

    Args:
        n: Total number of records to generate. No files are written when
            `n` is zero or negative (the output directory is still created).
        extn: Output format, "parquet" or "csv". Case and surrounding
            whitespace are ignored.
    """
    records_per_chunk = 1000000

    # Normalize once so the directory name, file suffix and writer all agree.
    extn = extn.lower().strip()

    output_dir = f'../input_data/{extn}'
    os.makedirs(output_dir, exist_ok=True)

    file_path_template = os.path.join(output_dir, "records_{}_part_{}_{}.{}")

    # Ceiling division: a trailing partial chunk still gets its own file.
    total_chunks = (n + records_per_chunk - 1) // records_per_chunk

    for chunk_idx in tqdm(range(total_chunks), desc="Generating and saving records"):
        start_idx = chunk_idx * records_per_chunk
        chunk_size = min(records_per_chunk, n - start_idx)

        records = generate_records_chunk(chunk_size)
        df = pd.DataFrame(records, columns=columns)

        file_path = file_path_template.format(df.shape[0], chunk_idx + 1, time.time(), extn)

        # Every chunk has its own uniquely named file, so there is never an
        # existing file to append to. Passing append=True for later chunks
        # made CSV parts 2..N come out without a header row.
        save_records_to_file(df, extn, file_path, append=False)

        # Release this chunk before the next one is generated.
        del df, records

if __name__ == "__main__":
    # Interactive entry point: ask for the record count and the output format.
    number_of_records = int(input("Enter number_of_records: "))
    # Normalize once so the value that is validated is also the value that is
    # used for the output directory and file suffix.
    extension = input("File extension (parquet or csv): ").lower().strip()

    if extension not in ("parquet", "csv"):
        msg = "Extension should be either 'parquet' or 'csv'"
        print(msg)
        # ValueError is a subclass of Exception, so existing handlers still catch it.
        raise ValueError(msg)

    main(number_of_records, extension)


Enter number_of_records:  10
File extension (parquet or csv):  parquet


Generating and saving records: 100%|██████████| 1/1 [00:00<00:00,  8.28it/s]


In [10]:
# import random
# import string
# import pandas as pd
# import os, time
# from tqdm import tqdm
# import pyarrow.parquet as pq
# import pyarrow as pa

# # Define the schema with additional columns
# columns = ["vendor_id", "trip_id", "trip_distance", "fare_amount", "store_and_fwd_flag"]
# extra_columns = [f"extra_col_{i}" for i in range(1, 46)]  # 45 additional columns
# columns.extend(extra_columns)

# def generate_random_string(length=1):
#     """Generate a random uppercase string of the given length."""
#     return ''.join(random.choice(string.ascii_uppercase) for _ in range(length))

# def generate_records_chunk(n):
#     """Generate a chunk of records with additional columns."""
#     return [
#         (
#             random.randint(1, 100),  # vendor_id
#             random.randint(1, 1000000),  # trip_id
#             round(random.uniform(0.5, 50.0), 2),  # trip_distance
#             round(random.uniform(5.0, 500.0), 2),  # fare_amount
#             generate_random_string(),  # store_and_fwd_flag
#             *[random.uniform(0, 1000) for _ in range(45)]  # Additional columns with random floats
#         )
#         for _ in range(n)
#     ]

# def save_records_to_file(df, extn, file_path, append=False):
#     """Save the records to the given file (CSV or Parquet) efficiently."""
#     if extn.lower().strip() == "parquet":
#         if append:
#             try:
#                 existing_df = pd.read_parquet(file_path)
#                 combined_df = pd.concat([existing_df, df], ignore_index=True)
#                 combined_df.to_parquet(file_path, index=False, engine='pyarrow')
#             except Exception:
#                 df.to_parquet(file_path, index=False, engine='pyarrow')
#         else:
#             df.to_parquet(file_path, index=False, engine='pyarrow')

#     elif extn.lower().strip() == "csv":
#         mode = 'a' if append else 'w'
#         header = not append  # Write the header only for the first chunk
#         df.to_csv(file_path, index=False, mode=mode, header=header)

# def main(n, extn):
#     """Main function to generate records and save them."""
#     MILLION_RECORDS = 1000000
#     records_per_chunk = MILLION_RECORDS

#     # Change the directory to "parquet_50" inside "input_data"
#     output_dir = f'../input_data/parquet_50/{extn}'
#     os.makedirs(output_dir, exist_ok=True)

#     file_path_template = os.path.join(output_dir, "records_{}_part_{}_{}.{}")

#     total_chunks = (n + MILLION_RECORDS - 1) // MILLION_RECORDS

#     for chunk_idx in tqdm(range(total_chunks), desc="Generating and saving records"):
#         start_idx = chunk_idx * MILLION_RECORDS
#         chunk_size = min(MILLION_RECORDS, n - start_idx)

#         records = generate_records_chunk(chunk_size)
#         df = pd.DataFrame(records, columns=columns)

#         file_path = file_path_template.format(df.shape[0], chunk_idx + 1, time.time(), extn)

#         append = (chunk_idx > 0)
#         save_records_to_file(df, extn, file_path, append)
#         del df

# if __name__ == "__main__":
#     number_of_records = int(input("Enter number_of_records: "))
#     extension = input("File extension (parquet or csv): ")

#     if extension.lower().strip() not in ["parquet", "csv"]:
#         msg = "Extension should be either 'parquet' or 'csv'"
#         print(msg)
#         raise Exception(msg)

#     main(number_of_records, extension)


Enter number_of_records:  10000000
File extension (parquet or csv):  parquet


Generating and saving records: 100%|██████████| 10/10 [07:20<00:00, 44.00s/it]


In [4]:

# import random
# import string
# import pandas as pd
# import os, time
# from tqdm import tqdm
# import pyarrow.parquet as pq
# import pyarrow as pa

# # Define the schema with additional columns
# columns = ["vendor_id", "trip_id", "trip_distance", "fare_amount", "store_and_fwd_flag"]
# extra_columns = [f"extra_col_{i}" for i in range(1, 46)]  # 45 additional columns
# columns.extend(extra_columns)

# def generate_random_string(length=1):
#     """Generate a random uppercase string of the given length."""
#     return ''.join(random.choice(string.ascii_uppercase) for _ in range(length))

# def generate_records_chunk(n):
#     """Generate a chunk of records with additional columns."""
#     return [
#         (
#             random.randint(1, 100),  # vendor_id
#             random.randint(1, 1000000),  # trip_id
#             round(random.uniform(0.5, 50.0), 2),  # trip_distance
#             round(random.uniform(5.0, 500.0), 2),  # fare_amount
#             generate_random_string(),  # store_and_fwd_flag
#             *[random.uniform(0, 1000) for _ in range(45)]  # Additional columns with random floats
#         )
#         for _ in range(n)
#     ]

# def save_records_to_file(df, extn, file_path, append=False):
#     """Save the records to the given file (CSV or Parquet) efficiently."""
#     if extn.lower().strip() == "parquet":
#         if append:
#             try:
#                 existing_df = pd.read_parquet(file_path)
#                 combined_df = pd.concat([existing_df, df], ignore_index=True)
#                 combined_df.to_parquet(file_path, index=False, engine='pyarrow')
#             except Exception:
#                 df.to_parquet(file_path, index=False, engine='pyarrow')
#         else:
#             df.to_parquet(file_path, index=False, engine='pyarrow')

#     elif extn.lower().strip() == "csv":
#         mode = 'a' if append else 'w'
#         header = not append  # Write the header only for the first chunk
#         df.to_csv(file_path, index=False, mode=mode, header=header)

# def main(n, extn):
#     """Main function to generate records and save them."""
#     MILLION_RECORDS = 1000000
#     records_per_chunk = MILLION_RECORDS

#     output_dir = f'../input_data/{extn}'
#     os.makedirs(output_dir, exist_ok=True)

#     file_path_template = os.path.join(output_dir, "records_{}_part_{}_{}.{}")

#     total_chunks = (n + MILLION_RECORDS - 1) // MILLION_RECORDS

#     for chunk_idx in tqdm(range(total_chunks), desc="Generating and saving records"):
#         start_idx = chunk_idx * MILLION_RECORDS
#         chunk_size = min(MILLION_RECORDS, n - start_idx)

#         records = generate_records_chunk(chunk_size)
#         df = pd.DataFrame(records, columns=columns)

#         file_path = file_path_template.format(df.shape[0], chunk_idx + 1, time.time(), extn)

#         append = (chunk_idx > 0)
#         save_records_to_file(df, extn, file_path, append)
#         del df

# if __name__ == "__main__":
#     number_of_records = int(input("Enter number_of_records: "))
#     extension = input("File extension (parquet or csv): ")

#     if extension.lower().strip() not in ["parquet", "csv"]:
#         msg = "Extension should be either 'parquet' or 'csv'"
#         print(msg)
#         raise Exception(msg)

#     main(number_of_records, extension)


Enter number_of_records:  10000000
File extension (parquet or csv):  parquet


Generating and saving records: 100%|██████████| 10/10 [00:59<00:00,  5.98s/it]


In [None]:
# 3599920
# 3627882
# 2979431
# 2463931
# 3214369