In [1]:
import pandas as pd
import numpy as np

# Build a synthetic microbiology dataset (Gram-positive / Gram-negative counts
# per sample, grouped by environment type) and write it to 'microbiology.csv'.

# Seed NumPy's global RNG so every run produces the same dataset.
# NOTE: the order of the random draws below (environment choice, then
# Gram-positive counts, then Gram-negative counts) determines the output;
# reordering them changes the generated data.
np.random.seed(123)

# Environment categories each sample can be drawn from
environment_types = ['Soil', 'Water', 'Plant', 'Air']

# Number of samples (rows) to generate
n_samples = 10000

# Raw columns: sequential IDs 'S1'..'S<n_samples>', a uniformly chosen
# environment, and counts drawn from Normal(mean=100, sd=30) rounded to ints.
data = {
    'SampleID': [f'S{i}' for i in range(1, n_samples + 1)],
    'EnvironmentType': np.random.choice(environment_types, n_samples, replace=True),
    'GramPositiveCount': np.round(np.random.normal(100, 30, n_samples)).astype(int),
    'GramNegativeCount': np.round(np.random.normal(100, 30, n_samples)).astype(int),
}

# Create DataFrame
df = pd.DataFrame(data)

# A normal draw can fall below zero, which is meaningless for a count;
# clamp such values to 0 (vectorized, instead of a per-element lambda).
df['GramPositiveCount'] = df['GramPositiveCount'].clip(lower=0)
df['GramNegativeCount'] = df['GramNegativeCount'].clip(lower=0)

# Total count and the share of each Gram type per sample.
# If both counts are 0 the division is 0/0 -> NaN; report that proportion as 0.
df['TotalCount'] = df['GramPositiveCount'] + df['GramNegativeCount']
df['ProportionGramPositive'] = (df['GramPositiveCount'] / df['TotalCount']).fillna(0)
df['ProportionGramNegative'] = (df['GramNegativeCount'] / df['TotalCount']).fillna(0)

# Save the dataset to a CSV file (without the DataFrame index column)
df.to_csv('microbiology.csv', index=False)


: 