In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [8]:
# Read the labelled train / validation / test splits from CSV.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'training_set _labelled.csv',
        'validation_set _labelled.csv',
        'testing_set _labelled.csv',
    )
)

# Confirm what kind of object each split was loaded as.
for message, frame in (
    ("Type of train_data:", train_data),
    ("Type of val_data:", val_data),
    ("Type of test_data:", test_data),
):
    print(message, type(frame))

# Show every column before any filtering is applied.
for message, frame in (
    ("Columns in train_data:", train_data),
    ("Columns in val_data:", val_data),
    ("Columns in test_data:", test_data),
):
    print(message, frame.columns)

# Keep only the numeric-dtype columns of each split.
train_data, val_data, test_data = (
    frame.select_dtypes(include=[np.number])
    for frame in (train_data, val_data, test_data)
)

# Show which columns survived the numeric filter.
for message, frame in (
    ("Numeric columns in train_data:", train_data),
    ("Numeric columns in val_data:", val_data),
    ("Numeric columns in test_data:", test_data),
):
    print(message, frame.columns)

# Replace each DataFrame with its underlying NumPy array.
# From here on the three names refer to ndarrays, so column labels are gone.
train_data, val_data, test_data = (
    frame.values for frame in (train_data, val_data, test_data)
)

Type of train_data: <class 'pandas.core.frame.DataFrame'>
Type of val_data: <class 'pandas.core.frame.DataFrame'>
Type of test_data: <class 'pandas.core.frame.DataFrame'>
Columns in train_data: Index(['id', 'bearings', 'wpump', 'radiator', 'exvalve', 'acmotor', 'air_flow',
       'noise_db', 'outlet_temp', 'water_inlet_temp', 'water_outlet_temp',
       'water_flow', 'oil_tank_temp'],
      dtype='object')
Columns in val_data: Index(['id', 'bearings', 'wpump', 'radiator', 'exvalve', 'acmotor', 'air_flow',
       'noise_db', 'outlet_temp', 'water_inlet_temp', 'water_outlet_temp',
       'water_flow', 'oil_tank_temp'],
      dtype='object')
Columns in test_data: Index(['id', 'bearings', 'wpump', 'radiator', 'exvalve', 'acmotor', 'air_flow',
       'noise_db', 'outlet_temp', 'water_inlet_temp', 'water_outlet_temp',
       'water_flow', 'oil_tank_temp'],
      dtype='object')
Numeric columns in train_data: Index(['id', 'air_flow', 'noise_db', 'outlet_temp', 'water_inlet_temp',
       'wate

In [9]:
# Thin SVD of the training matrix, so that train_data == U @ diag(S) @ VT.
U, S, VT = np.linalg.svd(train_data, full_matrices=False)

# Keep the leading k singular triplets (slicing silently caps at the number available).
# NOTE(review): the recorded output shows 8 columns; if so, k = 10 keeps every
# component and nothing is actually reduced — confirm the intended rank.
k = 10
U_k, VT_k = U[:, :k], VT[:k, :]
S_k = np.diag(S[:k])

# Rank-k reconstruction of train_data, multiplied right-to-left.
synthetic_data = U_k @ (S_k @ VT_k)

In [10]:
# Fit a min-max scaler on the training matrix. It is deliberately NOT applied
# to synthetic_data: that array was reconstructed from the raw, unscaled
# train_data, so it is already in the original feature units. Passing it to
# scaler.inverse_transform would treat raw values as if they lay in [0, 1] and
# stretch every column by its (max - min) range, corrupting the export.
# NOTE(review): `scaler` is kept only in case later cells use it — confirm.
scaler = MinMaxScaler()
scaler.fit(train_data)

print("Shape of synthetic_data:", synthetic_data.shape)

# Recover column labels for the export. train_data is normally a bare ndarray
# by this point (labels were dropped by `.values`), so re-derive the numeric
# column names from the training CSV with the same dtype filter used at load.
if isinstance(train_data, pd.DataFrame):
    columns = train_data.columns
else:
    numeric_columns = (
        pd.read_csv('training_set _labelled.csv')
        .select_dtypes(include=[np.number])
        .columns
    )
    # Only trust the recovered labels if they line up with the data width;
    # otherwise fall back to pandas' default integer headers.
    if len(numeric_columns) == synthetic_data.shape[1]:
        columns = numeric_columns
    else:
        columns = None

synthetic_df = pd.DataFrame(synthetic_data, columns=columns)

print("Shape of synthetic_data:", synthetic_data.shape)

# Persist the result without the row index.
synthetic_df.to_csv('synthetic_data.csv', index=False)

Shape of synthetic_data: (700, 8)
Shape of synthetic_data: (700, 8)
