In [2]:
#load necessary packages and functions
import scipy.stats as stats
import numpy as np

def get_cramer_v(data):
    """Return Cramér's V for a two-way contingency table.

    V = sqrt((chi2 / n) / (min(rows, cols) - 1)), where chi2 is the Pearson
    chi-squared statistic computed without Yates' continuity correction and
    n is the total count in the table.

    Parameters
    ----------
    data : array-like
        Table of observed counts (a NumPy array or anything ``np.asarray``
        accepts, e.g. a list of lists).

    Returns
    -------
    float
        Cramér's V. NaN is returned when the statistic is undefined, i.e.
        the table holds no observations or fewer than two non-empty rows
        or columns remain.

    Notes
    -----
    For 2-D input, rows and columns that contain only zeros are removed
    before the test. ``chi2_contingency`` raises ``ValueError`` when an
    expected frequency is zero, which is exactly what an empty category
    produces; such a category carries no observations, so dropping it does
    not change chi2 or n for the remaining cells.
    """
    table = np.asarray(data)

    if table.ndim == 2:
        # Compute both masks on the untouched table, then slice.
        row_mask = (table != 0).any(axis=1)
        col_mask = (table != 0).any(axis=0)
        table = table[row_mask][:, col_mask]

    n = table.sum()
    min_dim = min(table.shape) - 1

    # Degenerate table: avoid the ValueError / division by zero and report
    # the statistic as undefined instead.
    if n == 0 or min_dim < 1:
        return float('nan')

    # Chi-squared test statistic (first element of the result).
    x2 = stats.chi2_contingency(table, correction=False)[0]

    return np.sqrt((x2 / n) / min_dim)

def prep_data(fields, protecteds, protected_type):
    """Build a count table of protected-attribute group versus field value.

    Parameters
    ----------
    fields : sequence of int
        One value per sample, used as the column index. A value of -1 makes
        the sample be skipped. The table has ``max(fields) + 1`` columns.
    protecteds : iterable
        One protected-attribute value per sample, aligned with ``fields``
        by position.
    protected_type : str
        'gender' or 'region': the protected value is used directly as the
        row index of a two-row table.
        'age', 'german-age' or 'nba-age': the protected value is placed
        into one of the age buckets listed below (6, 5 and 5 rows); a
        protected value of 0 makes the sample be skipped.
        Any other string: a two-row table of zeros is returned.

    Returns
    -------
    numpy.ndarray
        Float array of counts with shape (rows, max(fields) + 1).
    """
    # Exclusive upper limit of every bucket except the last, open-ended one.
    bucket_limits = {
        'age': (18, 25, 33, 41, 61),
        'german-age': (25, 33, 41, 61),
        'nba-age': (22, 26, 30, 33),
    }
    limits = bucket_limits.get(protected_type)
    row_count = 2 if limits is None else len(limits) + 1
    table = np.zeros((row_count, max(fields) + 1))

    use_value_as_row = protected_type in ('gender', 'region')

    for position, value in enumerate(protecteds):
        field = fields[position]
        if field == -1:
            continue

        if use_value_as_row:
            table[value][field] += 1
        elif limits is not None and value != 0:
            # First bucket whose limit exceeds the value; otherwise the last one.
            row = next(
                (k for k, limit in enumerate(limits) if value < limit),
                len(limits),
            )
            table[row][field] += 1

    return table

In [3]:
import numpy as np
import os

pokec_dir = '../../dataset/pokec/'

print(os.listdir(pokec_dir))

# Zero-based positions of the columns read from each CSV row.
gender_idx = 3
region_idx = 4
age_idx = 5
working_idx = 6


def load_pokec_columns(csv_path):
    """Read the gender, region, age and working columns from a Pokec CSV.

    The first line of the file is treated as a header and skipped, and
    blank lines are ignored. Gender and age go through ``float`` before
    ``int`` (presumably because they are stored with a decimal point —
    TODO confirm); region and working are parsed with ``int`` directly.

    Returns
    -------
    tuple of four lists of int
        (genders, regions, ages, fields), aligned by row; ``fields`` holds
        the working column.
    """
    genders, regions, ages, fields = [], [], [], []
    with open(csv_path, 'r') as csv_file:
        next(csv_file, None)  # skip the header row
        for line in csv_file:
            if not line.strip():
                continue
            cells = line.split(',')  # split once per row
            genders.append(int(float(cells[gender_idx])))
            regions.append(int(cells[region_idx]))
            ages.append(int(float(cells[age_idx])))
            fields.append(int(cells[working_idx]))
    return genders, regions, ages, fields


# For each file, print Cramér's V between the working field and
# gender, region and age (in that order).
for csv_name in ('region_job.csv', 'region_job_2.csv'):
    genders, regions, ages, fields = load_pokec_columns(pokec_dir + csv_name)

    for protecteds, protected_type in (
        (genders, 'gender'),
        (regions, 'region'),
        (ages, 'age'),
    ):
        data = prep_data(fields, protecteds, protected_type)
        v = get_cramer_v(data)
        print(v)

['region_job.csv', 'region_job.embedding', 'region_job_2.embedding', 'region_job_2.csv', 'region_job_relationship.txt', 'region_job_2_relationship.txt']
0.3133047527108656
0.09906574942421885
0.36227034843369893
0.08110308625543301
0.06519458565784995
0.40000653186331864


In [4]:
# NOTE: absolute, machine-specific location; adjust when running elsewhere.
nifty_root = '/fs/class-projects/fall2021/cmsc742/c742g002/NiftyGNN/dataset'


def _age_div_12(raw):
    """Parse an integer and divide it by 12, truncating toward zero.

    Presumably converts an age in months to years — TODO confirm the unit.
    """
    return int(int(raw) / 12)


def _is_male(raw):
    """Return 1 for the exact string 'Male', 0 for anything else."""
    return 1 if raw == 'Male' else 0


def _german_label(raw):
    """Parse the German label column, recoding '-1' as 0."""
    return 0 if raw == '-1' else int(raw)


# Per dataset: the label column as (column index, converter) and the
# protected attributes as (column index, converter, protected_type passed
# to prep_data), in the order their Cramér's V values are printed.
nifty_specs = {
    'bail': {
        'label': (16, int),                  # is_rec
        'protected': [
            (0, int, 'gender'),              # is_white
            (9, int, 'gender'),              # is_male
            (13, _age_div_12, 'age'),        # age
        ],
    },
    'credit': {
        'label': (0, int),                   # no_default
        'protected': [
            (1, int, 'gender'),              # is_married
            (2, int, 'region'),              # is_single
            # age is tabulated as a two-row attribute here.
            # NOTE(review): this only works if the column is binary — confirm.
            (3, int, 'gender'),
        ],
    },
    'german': {
        'label': (0, _german_label),         # no_default
        'protected': [
            (1, _is_male, 'gender'),         # gender
            (2, int, 'gender'),              # foreign
            (3, int, 'gender'),              # single
            (4, int, 'german-age'),          # age
        ],
    },
}

for dataset in ['bail', 'credit', 'german']:
    spec = nifty_specs[dataset]
    label_idx, label_conv = spec['label']

    labels = []
    protected_columns = [[] for _ in spec['protected']]

    with open(nifty_root + '/' + dataset + '/' + dataset + '.csv', 'r') as csv_file:
        next(csv_file, None)  # skip the header row
        for line in csv_file:
            line = line.strip()
            if not line:
                continue
            cells = line.split(',')
            labels.append(label_conv(cells[label_idx]))
            for column, (idx, conv, _) in zip(protected_columns, spec['protected']):
                column.append(conv(cells[idx]))

    # One Cramér's V per protected attribute, against the dataset's label.
    for column, (_, _, protected_type) in zip(protected_columns, spec['protected']):
        data = prep_data(labels, column, protected_type)
        v = get_cramer_v(data)
        print(v)
        

0.07732603732141628
0.07593668480808283
0.11847959577164041
0.02977461041246698
0.030619384748382845
0.038283323868946235
0.07549269735627623
0.08207949878149655
0.08067680281606497
0.1210009979120393


In [5]:
import numpy as np
import os

nba_dir = '../../dataset/NBA/'

print(os.listdir(nba_dir))

with open(nba_dir + 'nba.csv', 'r') as csv_file:
    lines = csv_file.readlines()

# Show the header row and the first data row.
for shown in (0, 1):
    print(lines[shown].strip())

# Zero-based positions of the columns read from each row.
salary_idx, age_idx, country_idx = 1, 2, 37

salaries, ages, countries = [], [], []
for record in lines[1:]:
    cells = record.split(',')
    raw_salary, raw_age, raw_country = cells[salary_idx], cells[age_idx], cells[country_idx]

    # A salary of '-1' is recoded to 0 before tabulation.
    # NOTE(review): prep_data skips samples whose field is -1, so this
    # recoding keeps those rows in the table — confirm that is intended.
    salaries.append(0 if raw_salary == '-1' else int(raw_salary))
    ages.append(int(float(raw_age)))
    countries.append(int(raw_country))

# Cramér's V of salary against country, then against bucketed age.
for protecteds, protected_type in ((countries, 'gender'), (ages, 'nba-age')):
    data = prep_data(salaries, protecteds, protected_type)
    v = get_cramer_v(data)
    print(v)

['nba.csv', 'nba_relationship.txt', 'nba.embedding']
user_id,SALARY,AGE,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF_x,POINTS,GP,MPG,ORPM,DRPM,RPM,WINS_RPM,PIE,PACE,W,player_height,player_weight,country,C,PF_y,PF-C,PG,SF,SG,ATL,ATL/CLE,ATL/LAL,BKN,BKN/WSH,BOS,CHA,CHI,CHI/OKC,CLE,CLE/DAL,CLE/MIA,DAL,DAL/BKN,DAL/PHI,DEN,DEN/CHA,DEN/POR,DET,GS,GS/CHA,GS/SAC,HOU,HOU/LAL,HOU/MEM,IND,LAC,LAL,MEM,MIA,MIL,MIL/CHA,MIN,NO,NO/DAL,NO/MEM,NO/MIL,NO/MIN/SAC,NO/ORL,NO/SAC,NY,NY/PHI,OKC,ORL,ORL/TOR,PHI,PHI/OKC,PHX,POR,SA,SAC,TOR,UTAH,WSH
105305397,-1,25,14.8,2.7,5.6,0.48700000000000004,0.8,2.1,0.374,1.9,3.5,0.556,0.5579999999999999,0.6,0.8,0.759,0.3,1.3,1.6,1.2,0.5,0.1,0.7,1.0,6.8,77,14.8,-1.08,-1.82,-2.9,0.17,9.2,100.62,63,190.5,79.3786,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0.02547535468070778
0.4221452841914473
