# Gráficos

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Read the five CSV exports (k1, k2, k4, k8, k16) in order; the k1 frame is
# the one the charts below label as "original".
df, df2, df4, df8, df16 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Dataset_Covid_CE_Anon_Gen_k1.csv',
        'Dataset_Covid_CE_Anon_Gen_k2.csv',
        'Dataset_Covid_CE_Anon_Gen_k4.csv',
        'Dataset_Covid_CE_Anon_Gen_k8.csv',
        'Dataset_Covid_CE_Anon_Gen_k16.csv',
    )
)

# Columns treated as semi-identifiers; used to select columns and as x-axis
# tick labels in the unique-values chart.
semi_id = ["municipioCaso", "sexoCaso", "dataNascimento",
           "resultadoFinalExame", "racaCor"]


In [None]:
def autolabel(rects, ax=None):
    """Attach a text label above each bar in *rects*, displaying its height.

    Parameters
    ----------
    rects : iterable
        Bar artists (for example the container returned by ``Axes.bar``).
        Each item must provide ``get_height()``, ``get_x()`` and
        ``get_width()``.
    ax : optional
        Axes-like object whose ``annotate`` method is called. When omitted,
        every bar is annotated on the axes it belongs to (``rect.axes``), so
        no notebook-level ``ax`` variable is required.

    Returns
    -------
    None
    """
    for rect in rects:
        # Prefer the explicit axes; otherwise use the axes that owns the bar.
        target = rect.axes if ax is None else ax
        height = rect.get_height()
        target.annotate(
            '{}'.format(height),
            # Anchor at the horizontal centre of the bar, at its top edge.
            xy=(rect.get_x() + rect.get_width() / 2, height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center', va='bottom',
        )

In [None]:
# Grouped bar chart: number of distinct values in each semi-identifier column,
# one bar per dataset within each column group.
labels = semi_id

# (legend label, frame) pairs in left-to-right bar order within each group.
datasets = [
    ('original', df),
    ('k=2', df2),
    ('k=4', df4),
    ('k=8', df8),
    ('k=16', df16),
]

x = np.arange(len(labels))
width = 0.15

fig, ax = plt.subplots(figsize=(20, 10))
for position, (legend_label, frame) in enumerate(datasets):
    # Explicit list of per-column distinct counts, in `semi_id` order.
    unique_counts = frame[semi_id].nunique().tolist()
    # Centre the bars of one group on its tick: offsets run from -2*width
    # to +2*width for the five datasets.
    offset = (position - (len(datasets) - 1) / 2) * width
    rects = ax.bar(x + offset, unique_counts, width, label=legend_label)
    autolabel(rects)

ax.set_title('Nº de valores únicos por semi-identificador')
ax.set_ylabel('Nº de valores únicos')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

fig.tight_layout()
fig.savefig('uniq_semi_id')
plt.show()


In [None]:
# Grouped bar chart: occurrences of each `sexoCaso` value per dataset.
dfs = [df, df2, df4, df8, df16]

# Count once per frame; `.get(..., 0)` yields 0 instead of raising KeyError
# when a category does not occur in a given frame.
sex_counts = [t['sexoCaso'].value_counts() for t in dfs]
m = [counts.get('MASCULINO', 0) for counts in sex_counts]
f = [counts.get('FEMININO', 0) for counts in sex_counts]
na = [counts.get('*', 0) for counts in sex_counts]

labels = ['original', 'k=2', 'k=4', 'k=8', 'k=16']
x = np.arange(len(labels))
width = 0.25

fig, ax = plt.subplots(figsize=(18, 6))
# One bar per category, placed left of, on, and right of each tick.
for offset, heights, sex_label in (
    (-width, m, 'MASCULINO'),
    (0, f, 'FEMININO'),
    (width, na, '*'),
):
    autolabel(ax.bar(x + offset, heights, width, label=sex_label))

ax.set_title('Frequência dos sexos em sexoCaso por valor de k')
ax.set_ylabel('Nº de ocorrências')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

fig.tight_layout()
fig.savefig('freq_sexoCaso')
plt.show()


In [None]:
# Bar chart: how many cells hold the value "*" in each dataset.
# Relies on `dfs` (the list of the five frames) defined in an earlier cell.
labels = ['original', 'k=2', 'k=4', 'k=8', 'k=16']
# Flatten every frame to a single Series and count its values; `.get('*', 0)`
# reports 0 instead of raising KeyError when a frame contains no "*".
y = [t.stack().value_counts().get('*', 0) for t in dfs]
x = np.arange(len(labels))
width = 0.25

print(y)

fig, ax = plt.subplots()
rects = ax.bar(x, y, width)

ax.set_title('Ocorrência de "*" por valor de k')
ax.set_ylabel('Nº de ocorrências')
ax.set_xticks(x)
ax.set_xticklabels(labels)

autolabel(rects)

fig.tight_layout()
fig.savefig('freq_supressed_or_null')
plt.show()


In [None]:
# Bar chart: for each of df2/df4/df8/df16, the number of rows of `df` whose
# full tuple of values does not appear as a row of that dataset.

# Row tuples of `df` are the same for every comparison, so build them once.
original_rows = df.apply(tuple, 1)

dfx2, dfx4, dfx8, dfx16 = (
    # Summing the boolean mask counts the rows of `df` with no match.
    int((~original_rows.isin(other.apply(tuple, 1))).sum())
    for other in (df2, df4, df8, df16)
)

labels = ['k=2', 'k=4', 'k=8', 'k=16']
y = [dfx2, dfx4, dfx8, dfx16]
x = np.arange(len(labels))
width = .5

fig, ax = plt.subplots(figsize=(8, 8))
rects = ax.bar(x, y, width)

ax.set_title(
    f'Quantidade de registros diferentes do original (qnt. total de registros: {df.shape[0]})')
ax.set_ylabel('Nº de registros')
ax.set_xticks(x)
ax.set_xticklabels(labels)

autolabel(rects)

fig.tight_layout()
fig.savefig('difference')
plt.show()
