In [4]:
# Checking System Details in Python

import platform
import os
import sys
import psutil 
import GPUtil
from datetime import datetime

# Section banner: rule, title, rule (the first section has no leading blank line).
print("=" * 50, "SYSTEM INFORMATION", "=" * 50, sep="\n")

# Host / OS identification. Each entry pairs the printed label with the
# `platform` probe that supplies its value; entries are emitted in order.
system_probes = (
    ("System", platform.system),
    ("Node Name", platform.node),
    ("Release", platform.release),
    ("Version", platform.version),
    ("Machine", platform.machine),
    ("Processor", platform.processor),
    ("Architecture", platform.architecture),
    ("Platform", platform.platform),
)
for label, probe in system_probes:
    print(f"{label}: {probe()}")

# Interpreter details, preceded by a blank line and a banner.
print("", "=" * 50, "PYTHON INFORMATION", "=" * 50, sep="\n")

print("Python Version:", sys.version)


print("\n" + "="*50)
print("HARDWARE INFORMATION")
print("="*50)

# CPU information: logical=False asks psutil for physical cores only,
# logical=True for all logical cores.
print(f"Physical cores: {psutil.cpu_count(logical=False)}")
print(f"Total cores: {psutil.cpu_count(logical=True)}")

# psutil.cpu_freq() returns None when the frequency cannot be determined;
# guard the lookup so the rest of the report still runs instead of failing
# with AttributeError.
cpu_freq = psutil.cpu_freq()
if cpu_freq is None:
    print("CPU frequency: unavailable")
else:
    print(f"CPU frequency: {cpu_freq.current:.2f} MHz")

# Memory information: byte counts are divided by 1024**3 before printing.
memory = psutil.virtual_memory()
print(f"Total Memory: {memory.total / (1024**3):.2f} GB")
print(f"Available Memory: {memory.available / (1024**3):.2f} GB")
print(f"Used Memory: {memory.used / (1024**3):.2f} GB")
print(f"Memory Percentage: {memory.percent}%")

print("\n" + "="*50)
print("GPU INFORMATION")
print("="*50)

# GPU information. GPUtil is the only source queried here; every record is
# tagged with the method that produced it so other sources can be appended
# to gpu_info later without changing the printing code below.
gpu_info = []
gpus = GPUtil.getGPUs()
for i, gpu in enumerate(gpus or []):
    gpu_info.append({
        'method': 'GPUtil',
        'gpu_id': i,
        'name': gpu.name,
        'driver': gpu.driver,
        'memory_total': f"{gpu.memoryTotal} MB",
        'memory_used': f"{gpu.memoryUsed} MB",
        'memory_free': f"{gpu.memoryFree} MB",
        'temperature': f"{gpu.temperature}°C",
        'uuid': gpu.uuid
    })

# Optional per-GPU fields in print order, as (record key, printed label).
# A key that is absent from a record is skipped.
gpu_field_labels = (
    ('driver', 'Driver'),
    ('driver_version', 'Driver Version'),
    ('memory_total', 'Memory Total'),
    ('memory_used', 'Memory Used'),
    ('memory_free', 'Memory Free'),
    ('memory', 'Memory'),
    ('temperature', 'Temperature'),
    ('uuid', 'UUID'),
)

if gpu_info:
    for gpu in gpu_info:
        print(f"GPU {gpu['gpu_id']} ({gpu['method']}):")
        print(f"  Name: {gpu['name']}")
        for key, label in gpu_field_labels:
            if key in gpu:
                print(f"  {label}: {gpu[key]}")
        print()
else:
    # Previously the section was silently empty on machines without a
    # detectable GPU; say so explicitly.
    print("No GPU detected by GPUtil.")



print("", "=" * 50, "DISK INFORMATION", "=" * 50, sep="\n")

# Usage of the filesystem that contains '/'; byte counts are divided by
# 1024**3 before printing.
disk = psutil.disk_usage('/')
for label, n_bytes in (("Total", disk.total), ("Used", disk.used), ("Free", disk.free)):
    print(f"{label} Disk Space: {n_bytes / (1024**3):.2f} GB")


# Where and when the report was produced.
print("\nCurrent Working Directory:", os.getcwd())
print(f"Current Time: {datetime.now():%Y-%m-%d %H:%M:%S}")

SYSTEM INFORMATION
System: Windows
Node Name: INDu4VlGVd3xEbP
Release: 10
Version: 10.0.22631
Machine: AMD64
Processor: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
Architecture: ('64bit', 'WindowsPE')
Platform: Windows-10-10.0.22631-SP0

PYTHON INFORMATION
Python Version: 3.11.5 (tags/v3.11.5:cce6ba9, Aug 24 2023, 14:38:34) [MSC v.1936 64 bit (AMD64)]

HARDWARE INFORMATION
Physical cores: 8
Total cores: 16
CPU frequency: 2496.00 MHz
Total Memory: 31.71 GB
Available Memory: 12.34 GB
Used Memory: 19.37 GB
Memory Percentage: 61.1%

GPU INFORMATION
GPU 0 (GPUtil):
  Name: NVIDIA T1200 Laptop GPU
  Driver: 573.22
  Memory Total: 4096.0 MB
  Memory Used: 1439.0 MB
  Memory Free: 2497.0 MB
  Temperature: 56.0°C
  UUID: GPU-ed354bed-c936-f245-af1e-3be8ef05b7e5


DISK INFORMATION
Total Disk Space: 931.50 GB
Used Disk Space: 328.03 GB
Free Disk Space: 603.47 GB

Current Working Directory: d:\MTech BITS\Study\Sem2\MLSOptimization\Assignment
Current Time: 2025-07-30 11:41:58


In [1]:
# Generate Synthetic Dataset for K-Means Clustering
# 500,000 samples with 5 features

import numpy as np
import pandas as pd
from datetime import datetime
import os

print("=" * 60, "SYNTHETIC DATASET GENERATION FOR K-MEANS CLUSTERING", "=" * 60, sep="\n")

# Seed NumPy's global RNG so repeated runs draw the same samples.
np.random.seed(42)

# Requested dataset size.
n_samples = 500000
n_features = 5

print(f"Generating dataset with {n_samples:,} samples and {n_features} features...")

start_time = datetime.now()

# Hand-picked centroids: one row per cluster, one column per feature.
n_clusters = 6
cluster_centers = np.array([
    [25, 40, 30, 20, 15],     # Cluster 1
    [45, 60, 70, 40, 35],     # Cluster 2
    [35, 80, 50, 60, 55],     # Cluster 3
    [55, 30, 80, 30, 75],     # Cluster 4
    [65, 70, 90, 80, 85],     # Cluster 5
    [20, 50, 20, 70, 25]      # Cluster 6
])

# Split the samples as evenly as possible: the first `remaining_samples`
# clusters each take one extra sample.
samples_per_cluster, remaining_samples = divmod(n_samples, n_clusters)
cluster_sizes = [
    samples_per_cluster + 1 if idx < remaining_samples else samples_per_cluster
    for idx in range(len(cluster_centers))
]

# Diagonal covariance (25 on every axis), shared by all clusters.
spread = np.eye(n_features) * 25

# Draw each cluster in centroid order so the RNG is consumed in a fixed sequence.
all_data = [
    np.random.multivariate_normal(mean=center, cov=spread, size=size)
    for center, size in zip(cluster_centers, cluster_sizes)
]

# Stack the clusters, then shuffle the rows.
shuffle_indices = np.random.permutation(n_samples)
data = np.concatenate(all_data, axis=0)[shuffle_indices]

# Fold negatives to their absolute value, then clamp everything into [1, 100].
data = np.clip(np.abs(data), 1, 100)

# Wrap in a DataFrame with feature_1..feature_N columns, rounded to 2 decimals.
feature_names = [f"feature_{k}" for k in range(1, n_features + 1)]
df = pd.DataFrame(data, columns=feature_names).round(2)

generation_time = datetime.now() - start_time

print(f"✅ Dataset generation completed in: {generation_time.total_seconds():.2f} seconds")
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

print("", "=" * 50, "DATASET OVERVIEW", "=" * 50, sep="\n")

# Each report is a caption line followed by the matching pandas summary.
print("First 10 rows:", df.head(10), sep="\n")

print("\nDataset Statistics:", df.describe(), sep="\n")

print("\nData Types:", df.dtypes, sep="\n")

print("\nNull Values Check:", df.isnull().sum(), sep="\n")

# Per-column extremes, computed once for the whole frame and then reported
# column by column.
print("\nFeature Ranges:")
column_min, column_max = df.min(), df.max()
for name in df.columns:
    print(f"{name}: {column_min[name]:.2f} to {column_max[name]:.2f}")

print("", "=" * 50, "SAVE TO CSV", "=" * 50, sep="\n")

# Destination file, relative to the current working directory.
csv_filename = "synthetic_kmeans_dataset_500k.csv"
save_start_time = datetime.now()

print(f"Saving dataset to '{csv_filename}'...")

# Write the frame without the index column and time the write.
df.to_csv(csv_filename, index=False)

save_time = datetime.now() - save_start_time
file_size = os.stat(csv_filename).st_size / (1024**2)  # bytes divided by 1024**2

# Summary of what was written and where.
print(
    "✅ Dataset saved successfully!",
    f"File: {csv_filename}",
    f"File size: {file_size:.2f} MB",
    f"Save time: {save_time.total_seconds():.2f} seconds",
    f"Full path: {os.path.abspath(csv_filename)}",
    sep="\n",
)

print("\n" + "="*50)
print("VERIFICATION")
print("="*50)

# Verify the saved file. Merely reading a few rows only shows that the file
# opens, so the checks below also compare its content with the in-memory
# frame before reporting success.
print("Verifying saved file...")
df_loaded = pd.read_csv(csv_filename, nrows=5)

# Header and leading rows must match what was written. assert_frame_equal
# compares floats with a tolerance by default, so tiny parsing differences
# do not cause a false alarm; a real mismatch raises AssertionError.
pd.testing.assert_frame_equal(df_loaded, df.head(len(df_loaded)))

# Stream the file once to count data rows without loading it a second time.
with open(csv_filename, "r", encoding="utf-8") as csv_file:
    rows_on_disk = sum(1 for _ in csv_file) - 1  # exclude the header line
if rows_on_disk != len(df):
    raise ValueError(
        f"'{csv_filename}' holds {rows_on_disk:,} data rows, expected {len(df):,}"
    )

print("✅ File verification successful!")
print("\nFirst 5 rows from saved file:")
print(df_loaded)

print(f"\n🎉 Successfully generated and saved {n_samples:,} samples with {n_features} features!")
print("📊 Dataset is ready for K-means clustering analysis")

SYNTHETIC DATASET GENERATION FOR K-MEANS CLUSTERING
Generating dataset with 500,000 samples and 5 features...
✅ Dataset generation completed in: 0.18 seconds
Dataset shape: (500000, 5)
Memory usage: 19.07 MB

DATASET OVERVIEW
First 10 rows:
   feature_1  feature_2  feature_3  feature_4  feature_5
0      53.77      30.65      77.06      30.01      84.48
1      46.20      61.64      69.88      49.38      29.37
2      30.38      41.18      37.53      24.31       7.24
3      48.40      31.26      78.66      34.08      75.95
4      46.84      70.51      70.92      45.65      39.63
5      50.67      59.43      64.67      26.88      38.63
6      22.76      51.25      20.50      64.28      31.59
7      58.35      69.99      96.87      78.82      83.03
8      49.37      29.12      69.83      36.18      76.36
9      48.18      24.97      84.68      33.56      76.27

Dataset Statistics:
           feature_1      feature_2      feature_3      feature_4  \
count  500000.000000  500000.000000  50000