Run the cell below and, once it finishes, click **Restart runtime** (the button appears at the end of the install logs). It is okay if the install output contains errors.

In [None]:
!pip install sdv
!pip install ydata-profiling

Connect to Google Drive and upload [this folder](https://drive.google.com/drive/folders/1R6-mkarUSqNbSGtPeWYiRXswHoQ8Dfu-?usp=drive_link) to the Drive you are connecting to. The code cells below read the CSV files from `My Drive/temp/`. The link can be accessed with a Rubrik email account.


In [3]:
# Mount Google Drive into the Colab runtime so that files stored on the
# Drive can be opened by path from the cells that follow.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


Run all cells sequentially from this point

In [4]:
# This cell trains a CTGAN synthesizer on fngne.csv and samples synthetic rows.
# The file has 4 columns: Gender, GivenName, Surname and Email.

# necessary imports
from sdv.lite import SingleTablePreset
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdv.datasets.local import load_csvs

# Folder on the mounted Drive that holds the input CSV files.
DATA_DIR = '/content/drive/My Drive/temp/'

# Columns holding PII, mapped to the sdtype the synthesizer should use so that
# it picks the appropriate faker provider while generating new data.
# sdtypes reference: https://docs.sdv.dev/sdv/reference/metadata-spec/sdtypes
PII_SDTYPES = {
    'GivenName': 'first_name',
    'Surname': 'last_name',
    'Email': 'email',
}

# Auto-detect the column types from the CSV, then override the PII columns.
metadata = SingleTableMetadata()
metadata.detect_from_csv(DATA_DIR + 'fngne.csv')
for column_name, sdtype in PII_SDTYPES.items():
    # update_column is the public API for editing a column; it checks the
    # settings and replaces the auto-detected entry for that column.
    metadata.update_column(column_name=column_name, sdtype=sdtype, pii=True)
print(metadata)
metadata.validate()  # validates that the described metadata can be used for data generation

# The folder is expected to contain 'fngne.csv' (fakenamegenerator name/email data);
# the loaded table is available under the file name without its extension.
datasets = load_csvs(folder_name=DATA_DIR)
real_data = datasets['fngne']
print(real_data.head())

# CTGAN model. Choose the number of epochs based on the loss values: stop
# increasing it once the loss flat-lines or is small enough.
synthesizer = CTGANSynthesizer(metadata, enforce_rounding=False, epochs=10, verbose=True)
# The training step
synthesizer.fit(real_data)
# The data generation step
synthetic_data = synthesizer.sample(num_rows=500)
# The generated file can be found by clicking on the folder icon in the left tab.
synthetic_data.to_csv('data_generated_ne', sep=',', index=False, encoding='utf-8')

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "Gender": {
            "sdtype": "categorical"
        },
        "GivenName": {
            "sdtype": "first_name",
            "pii": true
        },
        "Surname": {
            "sdtype": "last_name",
            "pii": true
        },
        "Email": {
            "sdtype": "email",
            "pii": true
        }
    }
}




  Gender GivenName  Surname                       Email
0   male     Brian  Beazley   BrianMBeazley@teleworm.us
1   male   Richard  Russell   RichardJRussell@gustr.com
2   male       Van  Edmunds      VanDEdmunds@einrot.com
3   male     Derek    Miser      DerekEMiser@dayrep.com
4   male      Dale  Crawley  DaleJCrawley@superrito.com


KeyboardInterrupt: ignored

In [None]:
# Profile the generated data (expects `synthetic_data` from a synthesizer cell above).
from ydata_profiling import ProfileReport

profile = ProfileReport(synthetic_data, title="Profiling Report")
# Printing the ProfileReport object does not render the report; embed the
# HTML report in the notebook output instead.
profile.to_notebook_iframe()

In [1]:
# This cell trains a CTGAN synthesizer on the data.csv file in the shared
# Drive folder and samples synthetic rows.

# necessary imports
from sdv.lite import SingleTablePreset
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdv.datasets.local import load_csvs

# Folder on the mounted Drive that holds the input CSV files.
DATA_DIR = '/content/drive/My Drive/temp/'

# Columns holding PII, mapped to the sdtype the synthesizer should use so that
# it picks the appropriate faker provider while generating new data.
# Each column is listed exactly once.
# sdtypes reference: https://docs.sdv.dev/sdv/reference/metadata-spec/sdtypes
PII_SDTYPES = {
    'GivenName': 'first_name',
    'Surname': 'last_name',
    'StreetAddress': 'street_address',
    'City': 'city',
    'EmailAddress': 'email',
    'TelephoneNumber': 'phone_number',
    'CCNumber': 'credit_card_number',
    'NationalID': 'ssn',
}

# Auto-detect the column types from the CSV, then override the PII columns.
metadata = SingleTableMetadata()
metadata.detect_from_csv(DATA_DIR + 'data.csv')
for column_name, sdtype in PII_SDTYPES.items():
    # update_column is the public API for editing a column; it checks the
    # settings and replaces the auto-detected entry for that column.
    metadata.update_column(column_name=column_name, sdtype=sdtype, pii=True)

#print(metadata)
metadata.validate()  # validates that the described metadata can be used for data generation

datasets = load_csvs(folder_name=DATA_DIR)

# the data is available under the file name
real_data = datasets['data']
print(real_data.head())

synthesizer = CTGANSynthesizer(metadata, enforce_rounding=False, epochs=10, verbose=True)
# Optional debugging: inspect the transformers assigned to each column.
#synthesizer.auto_assign_transformers(real_data)
#print(synthesizer.get_transformers())

# Train on the real data, then sample synthetic rows and write them to a file
# in the runtime's working directory.
synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(num_rows=500)
synthetic_data.to_csv('data_generated', sep=',', index=False, encoding='utf-8')

KeyboardInterrupt: ignored

In [None]:
# This cell fits the FAST_ML preset synthesizer on the CardBase.csv file and
# samples synthetic rows.

# necessary imports
from sdv.lite import SingleTablePreset
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdv.datasets.local import load_csvs

# Folder on the mounted Drive that holds the input CSV files.
DATA_DIR = '/content/drive/My Drive/temp/'

# Auto-detect the column types from the CSV, then mark the card number as PII.
metadata = SingleTableMetadata()
metadata.detect_from_csv(DATA_DIR + 'CardBase.csv')
# update_column is the public API for editing a column; it checks the
# settings and replaces the auto-detected entry for that column.
metadata.update_column(column_name='Card_Number', sdtype='credit_card_number', pii=True)
print(metadata)
metadata.validate()  # validates that the described metadata can be used for data generation

# The folder is expected to contain 'CardBase.csv'.
datasets = load_csvs(folder_name=DATA_DIR)

# the data is available under the file name
real_data = datasets['CardBase']

# FAST_ML preset synthesizer: fit on the real data, then sample synthetic rows.
synthesizer = SingleTablePreset(metadata, name='FAST_ML')
synthesizer.fit(data=real_data)
synthetic_data = synthesizer.sample(num_rows=500)
print(synthetic_data.head())

{
    "columns": {
        "Card_Number": {
            "sdtype": "credit_card_number",
            "pii": true
        },
        "Card_Family": {
            "sdtype": "categorical"
        },
        "Credit_Limit": {
            "sdtype": "numerical"
        },
        "Cust_ID": {
            "sdtype": "categorical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}


