## Synthetic Data Generation Using SDV (Synthetic Data Vault)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the supplement reviews from the Kaggle input dataset and preview the
# first rows to check that the columns were parsed as expected.
reviews_csv = "/kaggle/input/supplements-review/reviews_supplements.csv"
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,date,time
0,4,B Complex in gel cap form,I bought this along with Vit C in gel cap form...,B00012ND5G,B00012ND5G,AGDVFFLJWAQ3ULNNKF4LXID2RVSQ,11-12-2009 00:37,1,True,11-12-2009,00:37
1,5,Five Stars,great product,B00013Z0ZQ,B00013Z0ZQ,AG3BSKXHDGP6E3EGQD2SXCK6KFQQ,04-01-2015 03:11,0,True,04-01-2015,03:11
2,5,Five Stars,Came as expectedly,B00013Z0ZQ,B00013Z0ZQ,AHG2WKFD4LXPC46WWC6JMQGX52JA,27-09-2015 19:15,0,True,27-09-2015,19:15
3,5,Vitamin Shoppe Dry Vitamin A,Excellent Product ..... Fast Delivery ....... ...,B00013Z1KA,B00013Z1KA,AEOF7RT3AC4ACRX5HGIP2V3BNIHA,33:16.9,0,True,09-02-2019,19:33
4,5,Un producto que compro regularmente,Es muy buena vitamina,B00013Z1KA,B00013Z1KA,AGW2WETWQRL2PKUGTL2LU7IJ2BPQ,11:10.9,0,True,25-07-2022,14:11


In [None]:
!pip install sdv

In [5]:
import sdv
from sdv.metadata import Metadata

# Let SDV infer a first-pass schema for the reviews table straight from the
# loaded DataFrame; the result is registered under the name 'review_supplements'.
metadata = Metadata.detect_from_dataframe(data=df, table_name='review_supplements')

In [6]:
# Show the auto-detected metadata so the inferred sdtype of each column can be
# reviewed before it is corrected.
metadata

{
    "tables": {
        "review_supplements": {
            "columns": {
                "rating": {
                    "sdtype": "categorical"
                },
                "title": {
                    "sdtype": "unknown",
                    "pii": true
                },
                "text": {
                    "sdtype": "unknown",
                    "pii": true
                },
                "asin": {
                    "sdtype": "categorical"
                },
                "parent_asin": {
                    "sdtype": "categorical"
                },
                "user_id": {
                    "sdtype": "unknown",
                    "pii": true
                },
                "timestamp": {
                    "sdtype": "unknown",
                    "pii": true
                },
                "helpful_vote": {
                    "sdtype": "numerical"
                },
                "verified_purchase": {
                    "sdtype": "cat

In [15]:
from pathlib import Path

# Manual corrections to the auto-detected sdtypes, applied in this order.
# - rating and verified_purchase get the concrete types they show in df.head().
# - every other column listed here is set to 'unknown'.
# timestamp, date and time are deliberately not declared as 'datetime':
# df.head() shows timestamp mixing values such as '11-12-2009 00:37' with
# values such as '33:16.9', so a single datetime_format cannot describe it.
# NOTE(review): the date samples (e.g. '27-09-2015') look day-first, so a
# future datetime conversion would presumably need '%d-%m-%Y' - confirm
# against the full dataset before enabling it.
sdtype_overrides = {
    'rating': 'numerical',
    'timestamp': 'unknown',
    'date': 'unknown',
    'time': 'unknown',
    'verified_purchase': 'boolean',
    'asin': 'unknown',
    'parent_asin': 'unknown',
    'user_id': 'unknown',
    'title': 'unknown',
    'text': 'unknown',
}

for column_name, sdtype in sdtype_overrides.items():
    metadata.update_column(column_name=column_name, sdtype=sdtype)

# save_to_json will not write over an existing file, which makes this cell
# fail on every re-run. Remove any stale copy first so the cell is idempotent.
metadata_path = Path('metadata_v2.json')
metadata_path.unlink(missing_ok=True)
metadata.save_to_json(filepath=str(metadata_path))

In [16]:
from sdv.single_table import CTGANSynthesizer

# Tunable CTGAN settings, grouped in one place and passed through unchanged.
ctgan_options = dict(
    enforce_rounding=False,
    epochs=500,
    verbose=True,
)

# The metadata object is the required first argument.
synthesizer = CTGANSynthesizer(metadata, **ctgan_options)

In [17]:
from sdv.single_table import CTGANSynthesizer

# Train the synthesizer that was configured in the previous cell. It is
# deliberately NOT re-created here: a fresh CTGANSynthesizer(metadata) would
# silently discard the enforce_rounding / epochs / verbose settings chosen
# there and train with the library defaults instead.
synthesizer.fit(df)

# Draw 1,000 synthetic rows, requested in batches of 50.
synthetic_data = synthesizer.sample(
    num_rows=1000,
    batch_size=50
)

Sampling rows: 100%|██████████| 1000/1000 [00:01<00:00, 713.74it/s]


In [18]:
# Persist the sampled rows to the working directory; the DataFrame index is
# left out so the file contains only the data columns.
synthetic_data.to_csv(path_or_buf='synthetic_data.csv', index=False)