In [1]:
# !pip install psycopg2

In [25]:
import os
from collections import defaultdict

import numpy as np
import psycopg2
from scipy.special import kl_div
from scipy.special import rel_entr
# Open the PostgreSQL connection for this notebook.
# The password is taken from the standard PGPASSWORD environment variable when
# it is set, so a real credential does not have to be saved in the notebook;
# the fallback is the original local-development value.
conn = psycopg2.connect(
    database='SeeDB',
    user='postgres',
    password=os.environ.get('PGPASSWORD', 'password'),
)

cur = conn.cursor()

# Print a small sample of rows. There is no ORDER BY, so which five rows come
# back is up to the database.
cur.execute('SELECT * FROM census LIMIT 5')
head_5 = cur.fetchall()

print("First 5 rows of census table:")
for row in head_5:
    print(row)

# total_rows is kept as a module-level value (the partitioning cell reads it).
cur.execute('SELECT COUNT(*) FROM census')
total_rows = cur.fetchone()[0]
print("Total rows in census table:", total_rows)

# NOTE: conn and cur are deliberately left open here (presumably for reuse by
# later cells); call conn.close() once the notebook is done with the database.


First 5 rows of census table:
(1, 39, ' State-gov', 77516, ' Bachelors', 13, ' Never-married', ' Adm-clerical', ' Not-in-family', ' White', ' Male', 2174, 0, 40, ' United-States', ' <=50K')
(2, 50, ' Self-emp-not-inc', 83311, ' Bachelors', 13, ' Married-civ-spouse', ' Exec-managerial', ' Husband', ' White', ' Male', 0, 0, 13, ' United-States', ' <=50K')
(3, 38, ' Private', 215646, ' HS-grad', 9, ' Divorced', ' Handlers-cleaners', ' Not-in-family', ' White', ' Male', 0, 0, 40, ' United-States', ' <=50K')
(4, 53, ' Private', 234721, ' 11th', 7, ' Married-civ-spouse', ' Handlers-cleaners', ' Husband', ' Black', ' Male', 0, 0, 40, ' United-States', ' <=50K')
(5, 28, ' Private', 338409, ' Bachelors', 13, ' Married-civ-spouse', ' Prof-specialty', ' Wife', ' Black', ' Female', 0, 0, 40, ' Cuba', ' <=50K')
Total rows in census table: 32561


In [26]:
def kl_divergence(prob1, prob2):
    """Return the Kullback-Leibler divergence KL(prob1 || prob2) in nats.

    Both inputs are rescaled so that each sums to 1, which means raw counts or
    unnormalized weights are accepted as well as probabilities.

    Args:
        prob1: array-like of non-negative weights (the "true" distribution).
        prob2: array-like of non-negative weights, broadcast-compatible with
            prob1 (the reference distribution).

    Returns:
        numpy float: sum over entries of p * log(p / q), where entries with
        p == 0 contribute 0 (the usual 0 * log 0 = 0 convention) and q is
        clipped below at machine epsilon so that q == 0 gives a large finite
        value instead of a division by zero. If an input sums to 0 the
        normalization yields NaN and the result is NaN.
    """
    p = np.asarray(prob1, dtype=float)
    q = np.asarray(prob2, dtype=float)

    # Normalize both inputs into probability distributions.
    p = p / np.sum(p)
    q = q / np.sum(q)

    # Clip q to [eps, +inf) to avoid dividing by 0.
    q = np.clip(q, np.finfo(float).eps, None)

    # Entries with no mass in p must contribute exactly 0; evaluating
    # 0 * log(0 / q) directly would give NaN. The placeholder 1.0 keeps log()
    # finite for those entries, and their term is then replaced by 0.
    empty = p == 0
    log_ratio = np.log(np.where(empty, 1.0, p) / q)
    return np.sum(np.where(empty, 0.0, p * log_ratio))
# Smoke test: KL divergence between two hand-written 4-bin distributions.
# Swapping the arguments, or passing the same distribution twice, are other
# quick checks worth running here.
prob1, prob2 = [0.25, 0.33, 0.23, 0.19], [0.21, 0.21, 0.32, 0.26]
kl_divergence(prob1, prob2)

0.057192913458712795

In [27]:
# SQL predicates selecting the two subsets being compared. The leading space
# inside every quoted value is intentional: the census rows printed above store
# their text fields that way (e.g. ' Never-married').
query_ds_cond = (
    "marital_status in ("
    "' Married-civ-spouse', ' Married-spouse-absent', ' Married-AF-spouse'"
    ")"
)
ref_ds_cond = (
    "marital_status in ("
    "' Divorced', ' Never-married', ' Separated', ' Widowed'"
    ")"
)

# Numeric columns to aggregate.
measures = [
    'age',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

# Categorical columns to group by.
dimensions = [
    'workclass',
    'education',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
    'income',
]

# Aggregate functions applied to each measure.
agg_functions = [
    'avg',
    'sum',
    'min',
    'max',
    'count',
]

In [30]:
# views[dimension][measure] -> set of aggregate functions for that pair.
views = defaultdict(lambda: defaultdict(set))
for dim in dimensions:
    for measure in measures:
        # Indexing creates the empty inner set on first access; fill it with
        # every aggregate function so each pair owns an independent set.
        views[dim][measure].update(agg_functions)
print(views)

defaultdict(<function <lambda> at 0x00000235917A63E0>, {'workclass': defaultdict(<class 'set'>, {'age': {'max', 'count', 'sum', 'avg', 'min'}, 'capital_gain': {'max', 'count', 'sum', 'avg', 'min'}, 'capital_loss': {'max', 'count', 'sum', 'avg', 'min'}, 'hours_per_week': {'max', 'count', 'sum', 'avg', 'min'}}), 'education': defaultdict(<class 'set'>, {'age': {'max', 'count', 'sum', 'avg', 'min'}, 'capital_gain': {'max', 'count', 'sum', 'avg', 'min'}, 'capital_loss': {'max', 'count', 'sum', 'avg', 'min'}, 'hours_per_week': {'max', 'count', 'sum', 'avg', 'min'}}), 'occupation': defaultdict(<class 'set'>, {'age': {'max', 'count', 'sum', 'avg', 'min'}, 'capital_gain': {'max', 'count', 'sum', 'avg', 'min'}, 'capital_loss': {'max', 'count', 'sum', 'avg', 'min'}, 'hours_per_week': {'max', 'count', 'sum', 'avg', 'min'}}), 'relationship': defaultdict(<class 'set'>, {'age': {'max', 'count', 'sum', 'avg', 'min'}, 'capital_gain': {'max', 'count', 'sum', 'avg', 'min'}, 'capital_loss': {'max', 'count

In [42]:
# Split the 1-based row range [1, total_rows] into 20 contiguous partitions.
# Each partition holds total_rows // 20 rows, except the last one, which also
# absorbs whatever remainder the integer division leaves over.
partition_size = total_rows // 20
for i in range(20):
    start_idx = partition_size * i + 1
    if i < 20 - 1:
        end_idx = start_idx + partition_size - 1
    else:
        end_idx = total_rows
    print(start_idx, end_idx)

    # TODO: implement the sharing optimization by grouping multiple
    # aggregations in the same query.

1 1628
1629 3256
3257 4884
4885 6512
6513 8140
8141 9768
9769 11396
11397 13024
13025 14652
14653 16280
16281 17908
17909 19536
19537 21164
21165 22792
22793 24420
24421 26048
26049 27676
27677 29304
29305 30932
30933 32561
