create a synthetic multivariate time series dataset of at least 1 GB

In [23]:
pip install --force-reinstall psutil


Collecting psutilNote: you may need to restart the kernel to use updated packages.

  Using cached psutil-6.1.0-cp37-abi3-win_amd64.whl.metadata (23 kB)
Using cached psutil-6.1.0-cp37-abi3-win_amd64.whl (254 kB)
Installing collected packages: psutil
  Attempting uninstall: psutil
    Found existing installation: psutil 6.1.0
    Uninstalling psutil-6.1.0:
      Successfully uninstalled psutil-6.1.0
Successfully installed psutil-6.1.0


  You can safely remove it manually.


In [24]:
import psutil
import os
import csv
import datetime
import time
import wmi


In [25]:
# Parameters
output_file = "synthetic_dataset.csv" 
sampling_rate_hz = 1  
duration_minutes = 300 
wmi_interface = wmi.WMI()  


In [26]:
def fetch_metrics():
    # Timestamp
    timestamp = datetime.datetime.now()

    # CPU temperature
    try:
        temp = wmi_interface.MSAcpi_ThermalZoneTemperature()[0].CurrentTemperature / 10.0 - 273.15
    except:
        temp = None 

    # CPU usage percentage
    cpu_usage = psutil.cpu_percent(interval=None)

    # CPU load
    cpu_load = os.getloadavg()[0] if hasattr(os, "getloadavg") else None

    # Memory usage
    memory = psutil.virtual_memory().percent

    # Battery level
    battery = psutil.sensors_battery().percent if psutil.sensors_battery() else None

    # Disk usage
    try:
        disk_usage = psutil.disk_usage("C:\\").percent  
    except Exception as e:
        print(f"Error fetching disk usage: {e}")
        disk_usage = None

    # Network stats
    net_io = psutil.net_io_counters()
    bytes_sent = net_io.bytes_sent
    bytes_recv = net_io.bytes_recv

    return [timestamp, temp, cpu_usage, cpu_load, memory, battery, disk_usage, bytes_sent, bytes_recv]


In [30]:
header = [
    "timestamp",
    "cpu_temperature",
    "cpu_usage",
    "cpu_load",
    "memory_usage",
    "battery_level",
    "disk_usage",
    "bytes_sent",
    "bytes_recv",
]

# Create and write the header to the CSV file
with open(output_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)


In [31]:
import signal


stop_collecting = False

def handle_interrupt(signal, frame):
    global stop_collecting
    stop_collecting = True
    print("\nData collection interrupted by user.")


signal.signal(signal.SIGINT, handle_interrupt)

# Start data collection
start_time = time.time()
end_time = start_time + duration_minutes * 60  # Duration in seconds

print("Data collection started... (Press Ctrl+C to stop)")

with open(output_file, "a", newline="") as f:
    writer = csv.writer(f)
    
    while time.time() < end_time and not stop_collecting:
        metrics = fetch_metrics()  
        writer.writerow(metrics)  
        
        time.sleep(1 / sampling_rate_hz)
        
        #Print progress every minute
        elapsed_seconds = int(time.time() - start_time)
        if elapsed_seconds % 60 == 0:
            current_size = os.path.getsize(output_file) / (1024 ** 2)  # File size in MB
            print(f"Elapsed time: {elapsed_seconds // 60} min, Current dataset size: {current_size:.2f} MB")

print("Data collection completed or interrupted.")


Data collection started... (Press Ctrl+C to stop)
Elapsed time: 1 min, Current dataset size: 0.00 MB
Elapsed time: 2 min, Current dataset size: 0.00 MB
Elapsed time: 3 min, Current dataset size: 0.01 MB
Elapsed time: 4 min, Current dataset size: 0.01 MB
Elapsed time: 5 min, Current dataset size: 0.02 MB
Elapsed time: 6 min, Current dataset size: 0.02 MB
Elapsed time: 7 min, Current dataset size: 0.02 MB
Elapsed time: 8 min, Current dataset size: 0.02 MB
Elapsed time: 9 min, Current dataset size: 0.03 MB
Elapsed time: 10 min, Current dataset size: 0.03 MB
Elapsed time: 11 min, Current dataset size: 0.04 MB
Elapsed time: 12 min, Current dataset size: 0.04 MB
Elapsed time: 14 min, Current dataset size: 0.05 MB
Elapsed time: 15 min, Current dataset size: 0.05 MB
Elapsed time: 16 min, Current dataset size: 0.06 MB
Elapsed time: 17 min, Current dataset size: 0.06 MB
Elapsed time: 18 min, Current dataset size: 0.06 MB
Elapsed time: 19 min, Current dataset size: 0.06 MB
Elapsed time: 20 min, C

In [None]:
# Load the existing dataset
df = pd.read_csv(output_file)

# Check current file size
current_size = os.path.getsize(output_file) / (1024 ** 3)  
print(f"Initial dataset size: {current_size:.2f} GB")

#Rows until file size reaches ~1 GB
while os.path.getsize(output_file) < (1 * 1024 ** 3): 
    df.to_csv(output_file, mode="a", index=False, header=False)

final_size = os.path.getsize(output_file) / (1024 ** 3)
print(f"Final dataset size: {final_size:.2f} GB")


Initial dataset size: 0.00 GB
Final dataset size: 1.00 GB


In [33]:
import os
file_size = os.path.getsize(output_file) / (1024 ** 3) 
print(f"Final dataset size: {file_size:.2f} GB")


Final dataset size: 1.00 GB


In [None]:
df = pd.read_csv(output_file, nrows=10) 
print(df.head())


                    timestamp  cpu_temperature  cpu_usage  cpu_load  \
0  2024-11-29 00:39:00.063474              NaN       18.1       NaN   
1  2024-11-29 00:39:01.144384              NaN       37.9       NaN   
2  2024-11-29 00:39:02.229738              NaN       37.2       NaN   
3  2024-11-29 00:39:03.256356              NaN       21.6       NaN   
4  2024-11-29 00:39:04.289264              NaN       22.7       NaN   

   memory_usage  battery_level  disk_usage  bytes_sent  bytes_recv  
0          81.2             99        60.7     2168627    84916184  
1          81.4             99        60.7     2168627    84916238  
2          81.1             99        60.7     2173181    84916294  
3          81.1             99        60.7     2173289    84916733  
4          81.0             99        60.7     2173505    84916988  


In [12]:
# Define the correct column names
correct_columns = [
    "timestamp", "cpu_temperature", "cpu_usage", "cpu_load",
    "memory_usage", "battery_level", "disk_usage", "bytes_sent", "bytes_recv"
]

# Assign the correct column names to the DataFrame
df.columns = correct_columns

# Display the updated DataFrame
print(df.head())


                    timestamp  cpu_temperature  cpu_usage  cpu_load  \
0  2024-11-27 22:38:17.267005              NaN       54.0       NaN   
1  2024-11-27 22:38:18.323268              NaN       56.6       NaN   
2  2024-11-27 22:38:19.331899              NaN       54.9       NaN   
3  2024-11-27 22:38:20.340946              NaN       32.9       NaN   
4  2024-11-27 22:38:21.350088              NaN       21.0       NaN   

   memory_usage  battery_level  disk_usage  bytes_sent  bytes_recv  
0          79.6             60         NaN     5582733    77757042  
1          79.9             60         NaN     5594898    77762338  
2          79.8             60         NaN     5595061    77762773  
3          79.9             60         NaN     5595061    77762773  
4          79.8             60         NaN     5603869    77763088  


In [None]:
print(df.head())  
print(df.info())  


                    timestamp  cpu_temperature  cpu_usage  cpu_load  \
0  2024-11-29 00:39:00.063474              NaN       18.1       NaN   
1  2024-11-29 00:39:01.144384              NaN       37.9       NaN   
2  2024-11-29 00:39:02.229738              NaN       37.2       NaN   
3  2024-11-29 00:39:03.256356              NaN       21.6       NaN   
4  2024-11-29 00:39:04.289264              NaN       22.7       NaN   

   memory_usage  battery_level  disk_usage  bytes_sent  bytes_recv  
0          81.2             99        60.7     2168627    84916184  
1          81.4             99        60.7     2168627    84916238  
2          81.1             99        60.7     2173181    84916294  
3          81.1             99        60.7     2173289    84916733  
4          81.0             99        60.7     2173505    84916988  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------ 

In [None]:
# Replace NaNs with a meaningful value or drop them
df = df.fillna(0)  # Replace NaNs with 0
df = df.dropna()  # Drop rows with NaN values

# Verify after cleaning
print(df.describe()) 

       cpu_temperature  cpu_usage  cpu_load  memory_usage  battery_level  \
count             10.0  10.000000      10.0     10.000000           10.0   
mean               0.0  21.890000       0.0     80.990000           99.0   
std                0.0   8.677743       0.0      0.213177            0.0   
min                0.0  14.800000       0.0     80.700000           99.0   
25%                0.0  15.750000       0.0     80.825000           99.0   
50%                0.0  18.400000       0.0     80.950000           99.0   
75%                0.0  22.425000       0.0     81.100000           99.0   
max                0.0  37.900000       0.0     81.400000           99.0   

       disk_usage    bytes_sent    bytes_recv  
count        10.0  1.000000e+01  1.000000e+01  
mean         60.7  2.172508e+06  8.491686e+07  
std           0.0  2.049950e+03  4.602376e+02  
min          60.7  2.168627e+06  8.491618e+07  
25%          60.7  2.173208e+06  8.491640e+07  
50%          60.7  2.173532