# Synthetic Distribution Logic
Set by a senior data scientist and researcher to test the logic of single and joint distributions. This applies to both the preparation of features for machine learning and distribution techniques for synthetic data. These logic tests were posed as challenges to the data science team in the preparation of their data for consumption into their models.


In [1]:
import pandas as pd
from ds_discovery import Wrangle

In [2]:
# Build a Wrangle component and keep a handle on its tools object; every
# select2dict / correlate_selection call in this notebook goes through `tools`.
# NOTE(review): from_memory() presumably creates a non-persisted instance —
# confirm against the ds_discovery documentation.
wr = Wrangle.from_memory()
tools = wr.tools

## Logic Tests
1. (A AND B) OR C
2. !A AND B
3. !(A AND B)
4. A AND !B
5. (A OR B) AND (C OR D)


In [3]:
# Start from an empty frame; the sample columns and result flags are added to it below.
df = pd.DataFrame()

In [4]:
# 16-row sample data: 's1' x 's2' covers every pairing of the letters A-D,
# while 's3' runs 1..8 twice.
df['s1'] = pd.Series([letter for letter in 'ABCD' for _ in range(4)])
df['s2'] = pd.Series(list('ABCD' * 4))
df['s3'] = pd.Series(list(range(1, 9)) * 2)

-------------------
### (A AND B) OR C

### Single column

In [5]:
# (A AND B) OR C on the single column 's3':
#   A: s3 > 2, B: s3 < 5 (joined with AND), C: s3 == 8 (joined with OR).
A = tools.select2dict(column='s3', condition="(@ > 2)", logic='AND')
B = tools.select2dict(column='s3', condition="(@ < 5)", logic='AND')
C = tools.select2dict(column='s3', condition="@ == 8", logic='OR')

# The inner list groups A and B into one sub-expression; C is combined with that group.
selection = [[A, B], C]

# Store the result as a flag column; the next cell filters on 'l1' == 1 to show the matches.
df['l1'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [6]:
# Show the 's3' values of the rows flagged with 1 in 'l1'.
df.loc[df['l1'] == 1, ['s3']]

Unnamed: 0,s3
2,3
3,4
7,8
10,3
11,4
15,8


### Multi column

In [7]:
# (A AND B) OR C across two columns:
#   A: s1 == 'A', B: s2 == 'B' (joined with AND), C: s1 == 'C' (joined with OR).
# A is given no explicit `logic` here, so it uses select2dict's default.
A = tools.select2dict(column='s1', condition="@ == 'A'")
B = tools.select2dict(column='s2', condition="@ == 'B'", logic='AND')
C = tools.select2dict(column='s1', condition="@ == 'C'", logic='OR')

# The inner list groups A and B into one sub-expression; C is combined with that group.
selection = [[A, B], C]

# Overwrites the 'l1' flag column written by the single-column cell above.
df['l1'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [8]:
# Show the 's1'/'s2' pairs of the rows flagged with 1 in 'l1'.
df.loc[df['l1'] == 1, ['s1', 's2']]

Unnamed: 0,s1,s2
1,A,B
8,C,A
9,C,B
10,C,C
11,C,D


-------------------
### !A AND B

### Single column

In [9]:
# !A AND B on the single column 's3':
#   A: s3 == 7, marked with logic='NOT'; B: s3 > 4, joined with AND.
A = tools.select2dict(column='s3', condition="@ == 7", logic='NOT')
B = tools.select2dict(column='s3', condition="@ > 4", logic='AND')

# Flat list: no grouping is needed for this expression.
selection = [A, B]

# Store the result as a flag column; the next cell filters on 'l2' == 1 to show the matches.
df['l2'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [10]:
# Show the 's3' values of the rows flagged with 1 in 'l2'.
df.loc[df['l2'] == 1, ['s3']]

Unnamed: 0,s3
4,5
5,6
7,8
12,5
13,6
15,8


### Multi column

In [11]:
# !A AND B across two columns:
#   A: s1 == 'A', marked with logic='NOT'; B: s2 == 'B', joined with AND.
A = tools.select2dict(column='s1', condition="@ == 'A'", logic='NOT')
B = tools.select2dict(column='s2', condition="@ == 'B'", logic='AND')

# Flat list: no grouping is needed for this expression.
selection = [A, B]

# Overwrites the 'l2' flag column written by the single-column cell above.
df['l2'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [12]:
# Show the 's1'/'s2' pairs of the rows flagged with 1 in 'l2'.
df.loc[df['l2'] == 1, ['s1', 's2']]

Unnamed: 0,s1,s2
5,B,B
9,C,B
13,D,B


-------------------
### !(A AND B)

### Single column

In [13]:
# !(A AND B) on the single column 's3':
#   A: s3 < 8, B: s3 > 3 (joined with AND); the group as a whole is then negated.
A = tools.select2dict(column='s3', condition="@ < 8")
B = tools.select2dict(column='s3', condition="@ > 3", logic='AND')

# The string 'NOT' after the [A, B] group applies the negation to the grouped result.
selection = [[A, B], 'NOT']

# NOTE(review): this is logic test 3 but the flag goes into 'l1', overwriting the
# result of test 1; the next cell reads 'l1', so the two cells must stay in step.
df['l1'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [14]:
# Show the 's3' values of the rows flagged with 1 in 'l1'.
df.loc[df['l1'] == 1, ['s3']]

Unnamed: 0,s3
0,1
1,2
2,3
7,8
8,1
9,2
10,3
15,8


### Multi column

In [15]:
# !(A AND B) across two columns:
#   A: s1 == 'A', B: s2 == 'B' (joined with AND); the group as a whole is then negated.
A = tools.select2dict(column='s1', condition="@ == 'A'")
B = tools.select2dict(column='s2', condition="@ == 'B'", logic='AND')

# The string 'NOT' after the [A, B] group applies the negation to the grouped result.
# (The original line had a redundant chained assignment `selection = selection = ...`.)
selection = [[A, B], 'NOT']

# Store the result as a flag column; the next cell filters on 'l3' == 1 to show the matches.
df['l3'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [16]:
# Show the 's1'/'s2' pairs of the rows flagged with 1 in 'l3'.
df.loc[df['l3'] == 1, ['s1', 's2']]

Unnamed: 0,s1,s2
0,A,A
2,A,C
3,A,D
4,B,A
5,B,B
6,B,C
7,B,D
8,C,A
9,C,B
10,C,C


-------------------
### A AND !B

### Single column

In [17]:
# A AND !B on the single column 's3':
#   A: s3 > 5; B: s3 == 7, marked with logic='NOT'.
A = tools.select2dict(column='s3', condition="@ > 5")
B = tools.select2dict(column='s3', condition="@ == 7", logic='NOT')

# Flat list: no grouping is needed for this expression.
selection = [A, B]

# NOTE(review): this is logic test 4 but the flag goes into 'l1', overwriting earlier
# results; the next cell reads 'l1', so the two cells must stay in step.
df['l1'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [18]:
# Show the 's3' values of the rows flagged with 1 in 'l1'.
df.loc[df['l1'] == 1, ['s3']]

Unnamed: 0,s3
5,6
7,8
13,6
15,8


### Multi column

In [19]:
# A AND !B across two columns:
#   A: s1 == 'A'; B: s2 == 'B', marked with logic='NOT'.
A = tools.select2dict(column='s1', condition="@ == 'A'")
B = tools.select2dict(column='s2', condition="@ == 'B'", logic='NOT')

# Flat list: no grouping is needed for this expression.
selection = [A, B]

# Store the result as a flag column; the next cell filters on 'l4' == 1 to show the matches.
df['l4'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [20]:
# Show the 's1'/'s2' pairs of the rows flagged with 1 in 'l4'.
df.loc[df['l4'] == 1, ['s1', 's2']]

Unnamed: 0,s1,s2
0,A,A
2,A,C
3,A,D


-------------------
### (A OR B) AND (C OR D)

### Single column

In [21]:
# (A OR B) AND (C OR D) on the single column 's3':
#   A: s3 < 3, B: s3 > 5 (joined with OR); C: s3 == 2, D: s3 > 7 (joined with OR).
A = tools.select2dict(column='s3', condition="(@ < 3)")
B = tools.select2dict(column='s3', condition="(@ > 5)", logic='OR')
C = tools.select2dict(column='s3', condition="@ == 2")
D = tools.select2dict(column='s3', condition="@ > 7", logic='OR')

# Two grouped sub-expressions joined by an explicit 'AND' between them.
selection = [[A, B], 'AND', [C, D]]

# NOTE(review): this is logic test 5 but the flag goes into 'l1', overwriting earlier
# results; the next cell reads 'l1', so the two cells must stay in step.
df['l1'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [22]:
# Show the 's3' values of the rows flagged with 1 in 'l1'.
df.loc[df['l1'] == 1, ['s3']]

Unnamed: 0,s3
1,2
7,8
9,2
15,8


### Multi column

In [23]:
# (A OR B) AND (C OR D) across two columns:
#   A: s1 == 'A', B: s2 == 'B' (joined with OR); C: s1 == 'C', D: s2 == 'D' (joined with OR).
A = tools.select2dict(column='s1', condition="@ == 'A'")
B = tools.select2dict(column='s2', condition="@ == 'B'", logic='OR')
C = tools.select2dict(column='s1', condition="@ == 'C'")
D = tools.select2dict(column='s2', condition="@ == 'D'", logic='OR')

# Two grouped sub-expressions joined by an explicit 'AND' between them.
selection = [[A, B], 'AND', [C, D]]

# NOTE(review): reuses 'l4', overwriting the result of the A AND !B multi-column test;
# the next cell reads 'l4', so the two cells must stay in step.
df['l4'] = tools.correlate_selection(df, selection=selection, action=1, default_action=0)

In [24]:
# Show the 's1'/'s2' pairs of the rows flagged with 1 in 'l4'.
df.loc[df['l4'] == 1, ['s1', 's2']]

Unnamed: 0,s1,s2
3,A,D
9,C,B
