### Install

In [48]:
#pip install ucimlrepo

In [49]:
#pip install psycopg2

### Imports

In [68]:
# Imports
from ucimlrepo import fetch_ucirepo
import pandas as pd
import psycopg2
from collections import defaultdict

### Fetching data

In [51]:
# fetch dataset
census_income = fetch_ucirepo(id=20)

# data (as pandas dataframes)
X = census_income.data.features
y = census_income.data.targets

# metadata
print(census_income.metadata)

# variable information
print(census_income.variables)


{'uci_id': 20, 'name': 'Census Income', 'repository_url': 'https://archive.ics.uci.edu/dataset/20/census+income', 'data_url': 'https://archive.ics.uci.edu/static/public/20/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data.  Also known as Adult dataset.', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5GP7S', 'creators': ['Ron Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && 

In [52]:
print(X)

       age         workclass  fnlwgt  education  education-num  \
0       39         State-gov   77516  Bachelors             13   
1       50  Self-emp-not-inc   83311  Bachelors             13   
2       38           Private  215646    HS-grad              9   
3       53           Private  234721       11th              7   
4       28           Private  338409  Bachelors             13   
...    ...               ...     ...        ...            ...   
48837   39           Private  215419  Bachelors             13   
48838   64               NaN  321403    HS-grad              9   
48839   38           Private  374983  Bachelors             13   
48840   44           Private   83891  Bachelors             13   
48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation    relationship  \
0           Never-married       Adm-clerical   Not-in-family   
1      Married-civ-spouse    Exec-managerial         Husband   
2              

### Preprocess data

In [53]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [54]:
X.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [55]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.2+ MB


In [56]:
X.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
dtype: int64

In [57]:
X=X.dropna()

In [58]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47621 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             47621 non-null  int64 
 1   workclass       47621 non-null  object
 2   fnlwgt          47621 non-null  int64 
 3   education       47621 non-null  object
 4   education-num   47621 non-null  int64 
 5   marital-status  47621 non-null  object
 6   occupation      47621 non-null  object
 7   relationship    47621 non-null  object
 8   race            47621 non-null  object
 9   sex             47621 non-null  object
 10  capital-gain    47621 non-null  int64 
 11  capital-loss    47621 non-null  int64 
 12  hours-per-week  47621 non-null  int64 
 13  native-country  47621 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.4+ MB


In [59]:
X['workclass'].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [60]:
for col in X.columns:
  X=X[X[col]!='?']
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45222 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             45222 non-null  int64 
 1   workclass       45222 non-null  object
 2   fnlwgt          45222 non-null  int64 
 3   education       45222 non-null  object
 4   education-num   45222 non-null  int64 
 5   marital-status  45222 non-null  object
 6   occupation      45222 non-null  object
 7   relationship    45222 non-null  object
 8   race            45222 non-null  object
 9   sex             45222 non-null  object
 10  capital-gain    45222 non-null  int64 
 11  capital-loss    45222 non-null  int64 
 12  hours-per-week  45222 non-null  int64 
 13  native-country  45222 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.2+ MB


In [61]:
col_names = (X.columns).values
mask = col_names != 'marital-status'
col_names = col_names[mask]

In [62]:
X['marital-status'].unique()

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)

In [63]:
m = ['Married-civ-spouse', ' Widowed', 'Married-spouse-absent', 'Married-AF-spouse', ' Separated']
u = ['Divorced','Never-married']
for married in m:
    X['marital-status'] = X['marital-status'].replace(married,"Married")
for unmarried in u:
    X['marital-status'] = X['marital-status'].replace(unmarried,"Unmarried")
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Unmarried,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Unmarried,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [64]:
X.to_csv('cleandata.csv', encoding='utf-8', index=False)

In [65]:
connection = psycopg2.connect(database="Project645", user="vaishnavishah", password="10jan2001", host="localhost", port=5432)

cursor = connection.cursor()


In [66]:
cursor.execute("SELECT * from unmarried;")
record = cursor.fetchall()
unmarried_df = pd.DataFrame(record, columns=col_names)

In [67]:
cursor.execute("SELECT * from married;")
record = cursor.fetchall()
married_df = pd.DataFrame(record, columns=col_names)

In [71]:
results_kl = defaultdict(list)

In [72]:
views = []
aggregate = ["COUNT", "AVG", "SUM", "MIN", "MAX"]  
discrete = ["age", "workclass"]  
continuous = ["hours_per_week", "capital_gain"]  

In [73]:
for f in aggregate:
    for a in discrete:
        for m in continuous:
            views.append((a, m, f))

In [76]:
for a, m, f in views:
    q1 = f'select {a}, {f}({m}) from married where {a} is not null group by {a} order by {a};'
    q2 = f'select {a}, {f}({m}) from unmarried where {a} is not null group by {a} order by {a};'

    cursor.execute(q1)
    res_1 = cursor.fetchall()

    cursor.execute(q2)
    res_2 = cursor.fetchall()

    dict_res = defaultdict(list)
    f_list_1 = []
    f_list_2 = []

    for k, v in res_1:
        if v != 0:
            dict_res[k].append(v)
        else:
            dict_res[k].append(1e-10)

    for k, v in res_2:
        if k not in dict_res:
            dict_res[k].append(1e-10)
        if v != 0:
            dict_res[k].append(v)
        else:
            dict_res[k].append(1e-10)

    for k, v in dict_res.items():
        if len(v) != 2:
            dict_res[k].append(1e-10)

    for i in dict_res.values():
        f_list_1.append(i[0])
        f_list_2.append(i[1])


    sum_1 = sum(f_list_1)
    sum_2 = sum(f_list_2)
    nf_1 = [i / sum_1 for i in f_list_1]
    nf_2 = [i / sum_2 for i in f_list_2]

    
    kl_divergence = pd.Series(nf_1).entropy(pd.Series(nf_2))
    

    results_kl[(a, m, f)].append(kl_divergence)


InFailedSqlTransaction: current transaction is aborted, commands ignored until end of transaction block


In [None]:
k = 5
top_k_views = sorted(dict_kl.items(), key=lambda item: item[1], reverse=True)[:k]


In [None]:
for view, utility in top_k_views:
    print("View:", view)
    print("Utility:", utility)

In [None]:
connection.close()