In [None]:
import numpy as np

In [2]:
np.random.seed(42)

In [3]:
n_samples = 10

In [4]:
heights = np.random.normal(loc=170, scale=10, size=n_samples)
weights = np.random.normal(loc=65, scale=8, size=n_samples)
ages = np.random.randint(18, 41, size=n_samples)

In [None]:
# data = np.array([heights, weights, ages]).T
# data

array([[159.62170123,  53.64999633,  39.        ],
       [169.24196255,  73.33993294,  19.        ],
       [179.72963534,  72.22825993,  27.        ],
       [177.95595464,  65.15200265,  30.        ],
       [184.95434253,  60.72466837,  38.        ],
       [173.38181252,  53.03956903,  23.        ],
       [203.72296254,  58.68593337,  29.        ],
       [160.79609195,  70.94969027,  29.        ],
       [166.0136161 ,  63.30270863,  37.        ],
       [169.39135913,  61.58504965,  28.        ]])

In [5]:
data = np.column_stack([heights, weights, ages])
print('Random synthetic data: \n', np.round(data, 2))

Random synthetic data: 
 [[174.97  61.29  32.  ]
 [168.62  61.27  36.  ]
 [176.48  66.94  29.  ]
 [185.23  49.69  40.  ]
 [167.66  51.2   37.  ]
 [167.66  60.5   20.  ]
 [185.79  56.9   22.  ]
 [177.67  67.51  36.  ]
 [165.31  57.74  24.  ]
 [175.43  53.7   38.  ]]


Approach: Multivariate Normal (with covariance)

In [7]:
np.random.seed(42)
new_samples = 10
mean = [170, 65, 30]
cov_mat = [
    [100, 40, 0],
    [40, 64, 0],
    [0, 0, 25]
]

In [10]:
new_data = np.random.multivariate_normal(mean, cov_mat, size=new_samples)
new_data

array([[175.13687136,  66.95610196,  24.46832513],
       [178.54506577,  76.50038342,  36.78120014],
       [167.31345819,  70.64284232,  31.80818013],
       [174.86592328,  70.80391789,  37.69018283],
       [165.09110001,  73.33227283,  16.90127448],
       [161.96491618,  60.44476942,  28.50496325],
       [175.80004884,  54.13376385,  28.90164056],
       [161.68007654,  70.48880263,  27.40864891],
       [179.29932506,  67.32291813,  34.57701059],
       [168.67915436,  60.25013501,  32.56633717]])

In [11]:
np.round(new_data, 2)

array([[175.14,  66.96,  24.47],
       [178.55,  76.5 ,  36.78],
       [167.31,  70.64,  31.81],
       [174.87,  70.8 ,  37.69],
       [165.09,  73.33,  16.9 ],
       [161.96,  60.44,  28.5 ],
       [175.8 ,  54.13,  28.9 ],
       [161.68,  70.49,  27.41],
       [179.3 ,  67.32,  34.58],
       [168.68,  60.25,  32.57]])

In [12]:
new_data[:, 2] = np.clip(np.round(data[:, 2]), 18, 70)
new_data

array([[175.13687136,  66.95610196,  32.        ],
       [178.54506577,  76.50038342,  36.        ],
       [167.31345819,  70.64284232,  29.        ],
       [174.86592328,  70.80391789,  40.        ],
       [165.09110001,  73.33227283,  37.        ],
       [161.96491618,  60.44476942,  20.        ],
       [175.80004884,  54.13376385,  22.        ],
       [161.68007654,  70.48880263,  36.        ],
       [179.29932506,  67.32291813,  24.        ],
       [168.67915436,  60.25013501,  38.        ]])

Convert Any of These to Pandas

In [None]:
import pandas as pd
feature_names = ['Height_cm', 'Weight_kg', 'Age']
df = pd.DataFrame(new_data, columns=feature_names)
df

Unnamed: 0,Height_cm,Weight_kg,Age
0,175.136871,66.956102,32.0
1,178.545066,76.500383,36.0
2,167.313458,70.642842,29.0
3,174.865923,70.803918,40.0
4,165.0911,73.332273,37.0
5,161.964916,60.444769,20.0
6,175.800049,54.133764,22.0
7,161.680077,70.488803,36.0
8,179.299325,67.322918,24.0
9,168.679154,60.250135,38.0


In [14]:
# Add ID column
df.insert(0, 'ID', [f'P{i+1:03d}' for i in range(len(df))])
df

Unnamed: 0,ID,Height_cm,Weight_kg,Age
0,P001,175.136871,66.956102,32.0
1,P002,178.545066,76.500383,36.0
2,P003,167.313458,70.642842,29.0
3,P004,174.865923,70.803918,40.0
4,P005,165.0911,73.332273,37.0
5,P006,161.964916,60.444769,20.0
6,P007,175.800049,54.133764,22.0
7,P008,161.680077,70.488803,36.0
8,P009,179.299325,67.322918,24.0
9,P010,168.679154,60.250135,38.0
