# Import the libraries

In [3]:
import pandas as pd
from sdv.lite import SingleTablePreset
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import run_diagnostic

# Load the data and metadata
Metadata is needed to describe the meaning of each column.

In [4]:
# Location of the real (source) dataset, relative to the notebook's working directory.
FILE_PATH = "data/student_interest.csv"

# Parse the CSV exactly once; this frame is what the synthesizer is fitted on
# and what the quality/diagnostic reports compare against.
datas = pd.read_csv(FILE_PATH)

# Detect the column metadata from the already-loaded frame instead of having
# SDV re-read the file: this avoids a second parse of the CSV and guarantees
# the metadata describes exactly the DataFrame used in the cells below.
# NOTE(review): auto-detected column types are a best guess — inspect the
# metadata and correct it if a column is misclassified.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(datas)


# Load the metadata and the data into the model

In [5]:
# Build the 'FAST_ML' preset synthesizer from the detected metadata,
# then train it on the real data.
synthesizer = SingleTablePreset(
    metadata,
    name='FAST_ML',
    locales=['en_US', 'id_ID'],
)
synthesizer.fit(datas)

# Generate the synthetic data

In [8]:
# Draw the synthetic rows from the fitted synthesizer, using a batch size of 5,000.
# NOTE(review): the progress bar saved with this cell reports 100000 rows,
# while the code requests 1,000,000 — the stored output looks stale; re-run to confirm.
synthetic_data = synthesizer.sample(
    num_rows=1_000_000,
    batch_size=5_000,
)

Sampling rows: 100%|██████████| 100000/100000 [00:01<00:00, 69310.02it/s]


# Evaluate the quality of the synthetic data

In [9]:
# Both reports take the same three inputs, so collect them once.
report_kwargs = {
    'real_data': datas,
    'synthetic_data': synthetic_data,
    'metadata': metadata,
}

# Quality report first, diagnostic second: each prints its own summary,
# so the call order determines the order of the cell output.
quality_report = evaluate_quality(**report_kwargs)

diagnostic_report = run_diagnostic(**report_kwargs)

Creating report: 100%|██████████| 4/4 [00:02<00:00,  1.39it/s]



Overall Quality Score: 44.8%

Properties:
Column Shapes: 83.96%
Column Pair Trends: 5.64%


Creating report: 100%|██████████| 4/4 [00:00<00:00,  6.64it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data





# Save the synthetic data to a CSV file

In [36]:
# Write the generated rows to disk; index=False keeps the DataFrame index
# out of the output file.
synthetic_data.to_csv(
    path_or_buf='data/datasets.csv',
    index=False,
)