In [1]:
import pandas as pd
import numpy as np
import os

# Load the original dataset
def load_and_multiply_dataset(input_path, multipliers=(2, 4, 8)):
    """Write enlarged, partly synthetic copies of a CSV dataset.

    The CSV at *input_path* is loaded and re-saved unchanged as
    ``original_dataset.csv``. Then, for every ``m`` in *multipliers*, a file
    ``dataset_{m}x.csv`` is written containing the original rows followed by
    ``m - 1`` synthetic batches, each as long as the original. All files are
    written to the directory that contains *input_path*.

    A synthetic batch is built by sampling every column independently (with
    replacement) from that column's original values, so a synthetic row mixes
    values taken from different source rows.

    Args:
        input_path: Path of the CSV file to load.
        multipliers: Iterable of integers >= 1; one output file is written
            per entry.

    Raises:
        ValueError: If any multiplier is smaller than 1. Raised before any
            file is read or written.
    """
    # Materialise once so generators can be validated and then iterated.
    multipliers = list(multipliers)
    invalid = [m for m in multipliers if m < 1]
    if invalid:
        raise ValueError(f"multipliers must be >= 1, got {invalid}")

    # Outputs go next to the input file; that directory necessarily exists
    # whenever the input itself can be read, so nothing has to be created.
    output_dir = os.path.dirname(input_path)

    # Load the original dataset
    print(f"Loading dataset from {input_path}...")
    df = pd.read_csv(input_path)
    original_rows = len(df)
    print(f"Original dataset has {original_rows} rows and {len(df.columns)} columns")

    # Save the original as a reference (optional)
    original_output = os.path.join(output_dir, 'original_dataset.csv')
    df.to_csv(original_output, index=False)
    print(f"Saved original dataset to {original_output}")

    # Create and save multiplied versions
    for multiplier in multipliers:
        # The untouched rows come first, followed by (multiplier - 1) batches.
        parts = [df]
        for _ in range(multiplier - 1):
            # Each column is resampled on its own, so cross-column
            # relationships of the original rows are not preserved.
            parts.append(
                pd.DataFrame(
                    {
                        col: np.random.choice(df[col].values, size=original_rows)
                        for col in df.columns
                    }
                )
            )

        # Concatenate once instead of growing the frame batch by batch, which
        # would re-copy all accumulated rows on every iteration.
        multiplied_df = pd.concat(parts, ignore_index=True)

        # Save the multiplied dataset
        output_path = os.path.join(output_dir, f'dataset_{multiplier}x.csv')
        multiplied_df.to_csv(output_path, index=False)
        print(f"Saved {multiplier}x dataset with {len(multiplied_df)} rows to {output_path}")

if __name__ == "__main__":
    # Source CSV; a relative path, so it is resolved against the current
    # working directory
    dataset_path = '../flask_ml/data/Income_dataset.csv'

    # Generate the 2x, 4x and 8x variants of that file
    load_and_multiply_dataset(input_path=dataset_path, multipliers=[2, 4, 8])

    print("All dataset versions have been created successfully!")

Loading dataset from ../flask_ml/data/Income_dataset.csv...
Original dataset has 48842 rows and 14 columns
Saved original dataset to ../flask_ml/data\original_dataset.csv
Saved 2x dataset with 97684 rows to ../flask_ml/data\dataset_2x.csv
Saved 4x dataset with 195368 rows to ../flask_ml/data\dataset_4x.csv
Saved 8x dataset with 390736 rows to ../flask_ml/data\dataset_8x.csv
All dataset versions have been created successfully!
