In [6]:
from sdv.constraints.utils import (cast_to_datetime64, logit, matches_datetime_format, sigmoid,
                                   get_nan_component_value, compute_nans_column, revert_nans_columns)

data = pd.DataFrame(data={
  'A': [0, 1, np.nan, np.nan],
  'B': [2, np.nan, 3, np.nan],
  'C': [4, 5, 6, np.nan]
})

print(data[['A', 'B']].apply(get_nan_component_value, axis=1))


0    None
1       B
2       A
3    A, B
dtype: object


In [2]:
import pandas as pd
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

data = pd.DataFrame(data={
  'A': [0, 1, np.nan, np.nan, 2],
  'B': [2, np.nan, 3, np.nan, 3]
})

metadata = SingleTableMetadata.load_from_dict({
    'columns': {
        'A': { 'sdtype': 'numerical' },
        'B': { 'sdtype': 'numerical'},
    }
})

synthesizer = GaussianCopulaSynthesizer(metadata)

synthesizer.add_constraints([{
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'A',
        'high_column_name': 'B',
    }
}])

synthesizer.fit(data)
synthetic_data = synthesizer.sample(10000)

# there are no cases where A is null but B is present
synthetic_data[~(pd.isna(synthetic_data['A'])) & (pd.isna(synthetic_data['B']))]

Sampling rows: 100%|██████████| 10000/10000 [00:00<00:00, 177307.77it/s]


Unnamed: 0,A,B
2,0.0,
8,2.0,
9,0.0,
10,0.0,
16,0.0,
...,...,...
9973,0.0,
9978,0.0,
9979,0.0,
9980,0.0,


In [6]:
synthetic_data[~(pd.isna(synthetic_data['A'])) & ~(pd.isna(synthetic_data['B']))]

Unnamed: 0,A,B


In [19]:
out = out.drop('a,b_nans?', axis=1)
out

KeyError: "['a,b_nans?'] not found in axis"

In [21]:
from sdv.constraints.tabular import (
    FixedCombinations, FixedIncrements, Inequality, Negative, OneHotEncoding, Positive, Range,
    ScalarInequality, ScalarRange, Unique, _RecreateCustomConstraint,
    _validate_inputs_custom_constraint, create_custom_constraint_class)

instance = Inequality(low_column_name='a', high_column_name='b')
instance._diff_column_name = 'a#b'

# Run
table_data = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
})
out = instance._transform(table_data)

# Assert
expected_out = pd.DataFrame({
    'a': [1, 2, 3],
    'c': [7, 8, 9],
    'a#b': [np.log(4)] * 3,
})
pd.testing.assert_frame_equal(out, expected_out)

AssertionError: DataFrame are different

DataFrame shape mismatch
[left]:  (3, 4)
[right]: (3, 3)

In [10]:
synthetic_data

Unnamed: 0,A,B
0,,3.0
1,,3.0
2,,
3,,3.0
4,,3.0
...,...,...
9995,,3.0
9996,,
9997,,3.0
9998,,3.0


In [2]:
import pandas as pd
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

col_low = np.random.uniform(0,20, size=10000)
data = pd.DataFrame(data={
  'A': col_low,
  'B': col_low + np.random.uniform(0.1,5, size=10000)
})
nan_probability = 0.2

# Create a mask with the same shape as the DataFrame, with True values indicating where NaNs should be added
mask = np.random.choice([True, False], size=data.shape, p=[nan_probability, 1 - nan_probability])

# Apply the mask to the DataFrame, replacing values with NaN where the mask is True
data = data.mask(mask)

metadata = SingleTableMetadata.load_from_dict({
    'columns': {
        'A': { 'sdtype': 'numerical' },
        'B': { 'sdtype': 'numerical'},
    }
})

synthesizer = GaussianCopulaSynthesizer(metadata)

synthesizer.add_constraints([{
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'A',
        'high_column_name': 'B',
    }
}])

synthesizer.fit(data)
synthetic_data = synthesizer.sample(10000)

# there are no cases where A is null but B is present
synthetic_data[(pd.isna(synthetic_data['A'])) & ~(pd.isna(synthetic_data['B']))]

Sampling rows: 100%|██████████| 10000/10000 [00:00<00:00, 147359.87it/s]


Unnamed: 0,A,B
18,,14.823711
28,,13.419769
31,,7.608372
32,,9.968984
35,,8.767856
...,...,...
9963,,19.332450
9964,,7.036633
9965,,5.023763
9980,,11.511691


In [6]:
synthetic_data

Unnamed: 0,A,B
0,14.883702,17.741714
1,4.168635,
2,12.534272,
3,,
4,7.772598,11.565074
...,...,...
9995,6.790652,11.482641
9996,,
9997,4.504839,
9998,8.234026,9.103778


In [3]:
data.isnull().sum()

A    1984
B    2030
dtype: int64

In [4]:
synthetic_data.isnull().sum()

A    2152
B    3189
dtype: int64

In [5]:
synthetic_data[(pd.isna(synthetic_data['A'])) & (pd.isna(synthetic_data['B']))]

Unnamed: 0,A,B
3,,
24,,
34,,
36,,
41,,
...,...,...
9940,,
9970,,
9974,,
9977,,


In [14]:
synthetic_data

Unnamed: 0,A,B
0,17.378601,18.709923
1,6.826034,11.113109
2,5.841935,8.204392
3,0.866829,5.241111
4,11.676369,15.586211
...,...,...
9995,10.708381,15.347991
9996,0.735426,2.258423
9997,7.347542,11.652349
9998,16.140438,17.545866


In [5]:
synthetic_data.loc[synthetic_data['A']<synthetic_data['B']]

Unnamed: 0,A,B
1,1.9,3.0
5,1.4,2.8
11,2.0,3.0
12,0.7,2.1
15,1.8,3.0
...,...,...
9984,2.0,3.0
9987,1.6,2.9
9988,0.1,1.6
9994,1.7,3.0


In [15]:
import pandas as pd
import numpy as np

# Set the seed for reproducibility
np.random.seed(42)

# Your DataFrame (replace this with your actual DataFrame)
data = {'a': [1, 2, 3, 4, 5], 'b': [6, 7, 8, 9, 10]}
df = pd.DataFrame(data)

# Define the probability of a value being replaced with NaN
nan_probability = 0.2

# Create a mask with the same shape as the DataFrame, with True values indicating where NaNs should be added
mask = np.random.choice([True, False], size=df.shape, p=[nan_probability, 1 - nan_probability])

# Apply the mask to the DataFrame, replacing values with NaN where the mask is True
df = df.mask(mask)

print(df)


     a     b
0  1.0   6.0
1  2.0   7.0
2  NaN   NaN
3  NaN   9.0
4  5.0  10.0


In [26]:
import pandas as pd
import numpy as np

def get_nan_component_value(row):
    """
    Function that checks for NaNs in a pandas row and outputs a concatenated string of the column names with NaNs.

    :param row: A pandas row (Series)
    :return: A concatenated string of the column names with NaNs
    """
    # Initialize an empty string to store column names with NaNs
    columns_with_nans = ""

    # Iterate through the row's columns and their corresponding values
    for column, value in row.items():
        # Check if the value is NaN using pandas' isna() function
        if pd.isna(value):
            # If the value is NaN, add the column name to the columns_with_nans string
            columns_with_nans += f"{column}, "

    # Remove the trailing comma and space from the string
    columns_with_nans = columns_with_nans.rstrip(", ")

    return columns_with_nans

# Example usage:
data = {
    'A': [1, np.nan, 3, 4],
    'B': [np.nan, np.nan, 3, 4],
    'C': [1, 2, np.nan, 4],
    'D': [1, 2, 3, np.nan]
}
df = pd.DataFrame(data)

for index, row in df.iterrows():
    print(f"Row {index}: {get_nan_component_value(row)}")


Row 0: B
Row 1: A, B
Row 2: C
Row 3: D


In [27]:
from itertools import combinations

# Your list of strings
string_list = ["a", "b", "c", "d"]

# Initialize an empty list to store the combinations
all_combinations = []

# Generate all possible combinations of the strings
for i in range(1, len(string_list) + 1):
    for combo in combinations(string_list, i):
        all_combinations.append("".join(combo))

# Print the list of all possible combinations
print(all_combinations)


['a', 'b', 'c', 'd', 'ab', 'ac', 'ad', 'bc', 'bd', 'cd', 'abc', 'abd', 'acd', 'bcd', 'abcd']


In [28]:
# Your string with multiple commas
input_string = "  text1  , text2 , text3 , text4 ,  text5"

# Split the string by commas
split_strings = input_string.split(",")

# Remove any blank spaces from the substrings
clean_strings = [s.strip() for s in split_strings]

# Print the cleaned substrings
print(clean_strings)


['text1', 'text2', 'text3', 'text4', 'text5']


In [7]:
from sdv.multi_table import HMASynthesizer
from sdv.datasets.demo import download_demo

dataset, metadata = download_demo('multi_table', 'fake_hotels')

hmas = HMASynthesizer(metadata)

hmas.fit(dataset)



In [30]:
data = pd.DataFrame(data={
    'A': [0, 1, np.nan, np.nan],
    'B': [2, np.nan, 3, np.nan]
    })

metadata = SingleTableMetadata.load_from_dict({
    'columns': {
        'A': { 'sdtype': 'numerical' },
        'B': { 'sdtype': 'numerical'},
    }
})

synthesizer = GaussianCopulaSynthesizer(metadata)

synthesizer.add_constraints([{
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'A',
        'high_column_name': 'B',
    }
}])

synthesizer.fit(data)
synthetic_data = synthesizer.sample(100000)

# there are no cases where A is null but B is present
print((~(pd.isna(synthetic_data['A'])) & ~(pd.isna(synthetic_data['B']))).any())


Sampling rows: 100%|██████████| 100000/100000 [00:00<00:00, 258204.60it/s]

False





In [31]:
synthetic_data[~(pd.isna(synthetic_data['A'])) & ~(pd.isna(synthetic_data['B']))]

Unnamed: 0,A,B


In [26]:
print(((pd.isna(synthetic_data['A'])) & (pd.isna(synthetic_data['B']))).any())

True


In [25]:
synthetic_data

Unnamed: 0,A,B
0,,3.0
1,,3.0
2,,
3,,3.0
4,,3.0
...,...,...
9995,,3.0
9996,,
9997,,3.0
9998,,3.0


In [None]:
assert ((pd.isna(synthetic_data['A'])) & ~(pd.isna(synthetic_data['B']))).any()
assert (~(pd.isna(synthetic_data['A'])) & (pd.isna(synthetic_data['B']))).any()
assert (~(pd.isna(synthetic_data['A'])) & ~(pd.isna(synthetic_data['B']))).any()

In [9]:
sampled['checkin_date'].min()

'01 Aug 2020'

In [11]:
sampled['checkin_date'].mean()

TypeError: Could not convert 28 Aug 202025 Mar 202005 Jul 202007 Jan 202016 Jul 202023 Apr 202029 May 202014 Jun 202031 Oct 202016 Sep 202007 Jun 202026 Jan 202028 Mar 202004 Mar 202022 Jun 202011 May 202019 Sep 202029 Sep 202006 Jan 202125 Jun 202015 May 202010 May 202005 Feb 202024 Feb 202019 Jan 202008 Oct 202020 Apr 202004 Jan 202122 Apr 202004 Jul 202029 Feb 202018 Oct 202010 Feb 202010 Dec 202015 Sep 202012 Oct 202015 Nov 202016 Sep 202004 Apr 202015 May 202005 Dec 202018 Feb 202017 Apr 202030 Mar 202006 Jun 202019 Dec 202024 Feb 202029 Jun 202030 Oct 202027 Apr 202021 Jul 202020 Sep 202008 Aug 202017 May 202030 Mar 202015 May 202013 Jun 202021 Feb 202030 Nov 202005 Oct 202007 Nov 202004 Jan 202118 Nov 202007 Nov 202024 May 202027 Jun 202013 Sep 202031 Jan 202016 Dec 202031 Mar 202005 Jun 202010 May 202021 Sep 202004 Dec 202010 Aug 202007 Nov 202029 Jun 202006 Dec 202005 Dec 202027 Jun 202026 Oct 202004 Aug 202030 Jul 202025 Nov 202026 Mar 202014 Jan 202016 May 202006 Jul 202001 Sep 202004 Jan 202112 Dec 202030 Mar 202030 Nov 202027 Feb 202008 Feb 202012 May 202020 Jul 202010 Mar 202012 Oct 202012 Jan 202012 Oct 202022 Sep 202006 Aug 202029 Dec 202002 Jun 202016 Jan 202019 Sep 202014 Jun 202002 Nov 202011 Sep 202020 Jun 202023 Aug 202015 Dec 202013 Oct 202011 Aug 202017 Sep 202026 May 202020 Feb 202029 Mar 202020 Oct 202023 Dec 202006 Dec 202019 Jul 202006 Dec 202008 Feb 202021 May 202031 Jul 202012 Nov 202024 Sep 202014 Aug 202012 Apr 202025 Feb 202013 Apr 202030 Aug 202027 Aug 202027 Oct 202021 Jun 202018 Jul 202003 May 202017 Aug 202009 Oct 202021 Jun 202018 Dec 202012 Jul 202028 Jun 202020 Dec 202019 Feb 202013 Jun 202003 Sep 202011 Nov 202001 May 202005 Jan 202030 Aug 202009 Apr 202027 Mar 202020 Jan 202015 Sep 202006 Jan 202027 Sep 202017 Oct 202024 Feb 202003 Jan 202105 May 202031 Jan 202027 Feb 202013 Jun 202025 Mar 202002 Aug 202024 Mar 202029 Jun 202030 Oct 202012 Nov 202029 Dec 202030 May 202026 Dec 202020 Oct 202009 Jul 202026 Jan 202020 Apr 202024 Jul 202005 Nov 202009 Nov 202021 Oct 202029 May 202021 Sep 202017 Aug 202014 Mar 202015 Nov 202018 Apr 202017 Aug 202020 Aug 202012 Sep 202017 Oct 202001 Aug 202001 Nov 202013 Dec 202006 Jan 202128 Jul 202015 Mar 202027 Mar 202002 May 202024 Aug 202002 Jun 202030 Aug 202022 Mar 202024 Apr 202004 Dec 202001 Nov 202027 Apr 202011 Apr 202024 Nov 202006 Jun 202031 Dec 202010 Apr 202013 Jun 202006 Feb 202010 Oct 202006 Oct 202017 Sep 202016 Mar 202023 Feb 202008 Feb 202011 Oct 202021 Jul 202013 Sep 202030 Oct 202024 Aug 202031 Oct 202018 Jun 202022 Sep 202015 Mar 202007 Jun 202003 Dec 202012 Apr 202006 Apr 202016 May 202018 Apr 202005 Jan 202010 Aug 202027 Jun 202009 Jan 202006 Jun 202030 Sep 202027 Sep 202030 May 202004 Jun 202028 Apr 202016 Nov 202001 Jul 202024 Oct 202003 Jan 202121 Jul 202030 May 202011 Sep 202017 Jun 202031 Jul 202008 Mar 202018 Mar 202013 Apr 202020 May 202019 May 202007 Dec 202028 Mar 202005 Apr 202010 Oct 202018 Mar 202030 Nov 202008 Mar 202011 Feb 202014 Aug 202016 Oct 202021 Oct 202026 Sep 202021 Feb 202025 Mar 202002 Apr 202016 Nov 202013 Jan 202003 Sep 202022 Sep 202009 May 202019 May 202008 Jun 202020 Jan 202019 Jan 202019 Apr 202012 Mar 202002 Aug 202010 Nov 202027 Jan 202009 Feb 202029 Dec 202020 Feb 202026 Jul 202024 Dec 202014 Nov 202010 Dec 202026 Sep 202015 Sep 202020 Oct 202005 Jan 202114 Jan 202025 May 202015 Nov 202030 Aug 202028 Aug 202019 May 202014 Oct 202019 May 202020 Oct 202030 Apr 202001 Dec 202025 Oct 202009 Apr 202001 Dec 202010 Dec 202025 Sep 202029 Jan 202022 Nov 202027 Dec 202022 Aug 202004 Aug 202003 Nov 202010 Nov 202023 Mar 202019 Nov 202002 Jan 202129 Apr 202026 Nov 202015 Jul 202017 Oct 202029 Jun 202030 Apr 202023 Apr 202011 Jun 202022 Nov 202028 May 202023 Mar 202008 Aug 202005 Apr 202024 Jul 202020 Oct 202022 Nov 202017 Sep 202014 Dec 202019 Nov 202021 Jan 202015 Feb 202025 Jun 202013 Nov 202021 Mar 202014 Nov 202004 Mar 202022 Jun 202026 Sep 202010 Mar 202025 Oct 202025 Jul 202016 Jan 202031 Aug 202010 Apr 202006 Apr 202001 Dec 202019 Oct 202008 Oct 202024 May 202020 Oct 202024 Sep 202030 Mar 202012 Aug 202028 Dec 202027 Dec 202021 Jul 202005 Sep 202012 Aug 202003 Aug 202015 Dec 202004 May 202030 Jan 202014 Sep 202010 Aug 202006 Apr 202026 Apr 202008 Aug 202005 Mar 202001 Jun 202023 Aug 202003 May 202001 Jun 202004 Mar 202017 Aug 202015 Sep 202025 Sep 202016 Apr 202012 Apr 202019 Jul 202004 Aug 202002 Jun 202015 Oct 202009 Aug 202006 Jun 202026 Apr 202022 Sep 202017 Sep 202003 Feb 202017 Nov 202026 Apr 202002 May 202020 Oct 202021 Aug 202014 Oct 202029 Oct 202028 Sep 202004 Mar 202005 Jan 202104 Mar 202019 Aug 202030 Jan 202025 Aug 202015 Feb 202018 Nov 202006 Oct 202011 Apr 202016 Mar 202023 Oct 202005 Sep 202024 May 202016 Dec 202021 May 202025 Oct 202030 Apr 202016 Apr 202009 Nov 202030 Dec 202021 Mar 202006 Apr 202014 Jun 202025 Dec 202008 May 202029 Sep 202026 Aug 202004 Jun 202029 Nov 202026 Oct 202007 Jul 202003 Nov 202006 Jan 202001 Aug 202027 Dec 202024 Sep 202005 Jan 202125 Mar 202007 Feb 202025 Aug 202006 Nov 202015 Oct 202030 Dec 202008 Aug 202027 Nov 202012 Dec 202004 Dec 202006 Oct 202007 Aug 202002 Nov 202016 Jan 202006 Apr 202010 Nov 202012 Jul 202017 Dec 202026 Sep 202023 Feb 202021 Apr 202025 Jul 202027 Dec 202006 Nov 202017 Sep 202019 Dec 202002 Nov 202017 Jun 202004 Jun 202016 Sep 202011 Aug 202010 Aug 202017 May 202015 Jun 202005 Dec 202010 Oct 202010 Jul 202017 Aug 202014 Sep 202008 Feb 202018 Jan 202017 Aug 202026 Feb 202006 Dec 202031 Mar 202029 Jun 202017 Dec 202001 Jul 202020 Nov 2020 to numeric

In [10]:
sampled['checkin_date'].max()

'31 Oct 2020'

In [3]:
from sdv.datasets.demo import download_demo

real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)
synthesizer = GaussianCopulaSynthesizer(metadata)
checkin_lessthan_checkout = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'checkin_date',
        'high_column_name': 'checkout_date'
    }
}

synthesizer.add_constraints([checkin_lessthan_checkout])
synthesizer.fit(real_data)

# Run and Assert
sampled = synthesizer.sample(num_rows=500)
synthesizer.validate(sampled)
_sampled = sampled[~sampled['checkout_date'].isna()]
assert all(
    pd.to_datetime(_sampled['checkin_date']) < pd.to_datetime(_sampled['checkout_date'])
)

Sampling rows: 100%|██████████| 500/500 [00:00<00:00, 2814.22it/s]


In [1]:
import pandas as pd
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

data = pd.DataFrame(data={
    'low':    [1, 4, np.nan, 0,      4,      np.nan, np.nan, 5,      np.nan],
    'middle': [2, 5, 3,      np.nan, 5,      np.nan, 5,      np.nan, np.nan],
    'high':   [3, 7, 8,      4,      np.nan, 9,      np.nan, np.nan, np.nan]
})

metadata_dict = {
    'columns': {
        'low': { 'sdtype': 'numerical' },
        'middle': { 'sdtype': 'numerical'},
        'high': { 'sdtype': 'numerical'}
    }
}

metadata = SingleTableMetadata.load_from_dict(metadata_dict)
synth = GaussianCopulaSynthesizer(metadata)

my_constraint = {
    'constraint_class': 'Range',
    'constraint_parameters': {
        'low_column_name': 'low',
        'middle_column_name': 'middle',
        'high_column_name': 'high'
    }
}

synth.add_constraints(constraints=[my_constraint])
synth.fit(data)

synth_data = synth.sample(2000)
synth_data[synth_data['low'] > synth_data['high']]

Sampling rows: 100%|██████████| 2000/2000 [00:00<00:00, 43737.82it/s]


Unnamed: 0,low,middle,high


In [2]:
synth_data

Unnamed: 0,low,middle,high
0,,5.0,
1,,5.0,
2,,4.0,6.0
3,3.0,,
4,,4.0,
...,...,...,...
1995,5.0,,
1996,2.0,4.0,6.0
1997,,4.0,
1998,3.0,4.0,


In [9]:
synth_data[~(pd.isna(synth_data['low'])) & ~(pd.isna(synth_data['middle'])) & ~(pd.isna(synth_data['high']))]

Unnamed: 0,low,middle,high
0,,5.0,
1,,5.0,
4,,4.0,
5,,2.0,
10,,2.0,
...,...,...,...
1968,,5.0,
1970,,4.0,
1974,,5.0,
1984,,5.0,


In [7]:
from sdv.constraints.utils import compute_nans_column
compute_nans_column(data, ['low', 'middle', 'high'])
print(data)

   low  middle  high low#middle#high.nan_component
0  1.0     2.0   3.0                          None
1  4.0     5.0   7.0                          None
2  NaN     3.0   8.0                           low
3  0.0     NaN   4.0                        middle
4  4.0     5.0   NaN                          high
5  NaN     NaN   9.0                   low, middle
6  NaN     5.0   NaN                     low, high
7  5.0     NaN   NaN                  middle, high
8  NaN     NaN   NaN             low, middle, high
