# Bayesian Networks

## Imports

In [146]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

from pomegranate import HiddenMarkovModel, State, DiscreteDistribution

## Tratamento do data set

In [147]:
data = pd.read_csv("dataset_thyroid_sick.csv")
#remoção de linhas com ?, substituição de valores boolean para int, assim como as da string das labels
data = data.drop(['TSH_measured','T3_measured','TT4_measured','T4U_measured','FTI_measured','TBG_measured','TBG','referral_source','query_on_thyroxine'], axis=1)
data = data[data != '?']
data = data.dropna()
data = data.replace({'negative': 0, 'sick': 1})
data = data.replace({'t': 1, 'f': 0})
data = data.replace({'M': 1, 'F': 0})
data.reset_index(drop=True)

data['age'] = data['age'].astype('int')
data['T3'] = data['T3'].astype('float')
data['TT4'] = data['TT4'].astype('float')
data['T4U'] = data['T4U'].astype('float')
data['Class'] = data['Class'].astype('int')

## Undersampling

In [148]:
number_sick = len(data[data.Class == 1])
number_normal = len(data[data.Class == 0])

sick_indices = np.array(data[data.Class == 1].index)
normal_indices = np.array(data[data.Class == 0].index)

random_normal_indices = np.random.choice(normal_indices, number_sick, replace = False)
under_sample_indices = np.concatenate([sick_indices, random_normal_indices])

under_sample_data = data.loc[under_sample_indices,:]

sizes = [number_sick, len(random_normal_indices)]

X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']

X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample, y_undersample, test_size = 0.3, random_state = 0)

print(X_train_undersample.shape[0],X_test_undersample.shape[0])

296 128


## Separação dos dados em treino e teste

In [149]:
bn_features = ["age","T3", "TT4","T4U"]
bn_data = X_train_undersample[bn_features].copy()
bn_data["Class"] = y_train_undersample

test_data = X_test_undersample[bn_features].copy()
test_data["Class"] = y_test_undersample

print(bn_data)

      age   T3    TT4   T4U  Class
3760   77  1.3  102.0  0.95      0
1793   55  1.0  102.0  0.82      1
2302   71  1.9  133.0  1.09      0
1035   25  0.7  122.0  1.13      1
1453   58  2.0  151.0  1.05      0
...   ...  ...    ...   ...    ...
159    58  1.6  103.0  0.90      0
3462   72  1.0   97.0  0.65      1
1973   74  1.0  102.0  0.90      1
663    73  1.4   48.0  1.05      1
3167   55  1.0   86.0  0.84      1

[296 rows x 5 columns]


## Função para classificar as features

In [150]:
def categoryClassification(bn_data):
    #age
    Q1 = 20
    Q3 = 50
    dataframe = bn_data.copy()
    L_idx = dataframe.loc[dataframe['age'] <= Q1, 'age'].index
    M_idx = dataframe.loc[(dataframe['age'] > Q1) & (dataframe['age'] <= Q3), 'age'].index
    H_idx = dataframe.loc[dataframe['age'] > Q3, 'age'].index

    dataframe.loc[L_idx, 'age'] = "L"
    dataframe.loc[M_idx, 'age'] = "M"
    dataframe.loc[H_idx, 'age'] = "H"


    #T3
    Q1 = 0.9
    Q3 = 2.8

    G_idx = dataframe.loc[(dataframe['T3'] >= Q1) & (dataframe['T3'] <= Q3), 'T3'].index
    B_idx = dataframe.loc[(dataframe['T3'] < Q1) | (dataframe['T3'] > Q3), 'T3'].index

    dataframe.loc[G_idx, 'T3'] = "G"
    dataframe.loc[B_idx, 'T3'] = "B"


    #TT4
    Q1 = 50.0
    Q3 = 120.0

    G_idx = dataframe.loc[(dataframe['TT4'] >= Q1) & (dataframe['TT4'] <= Q3), 'TT4'].index
    B_idx = dataframe.loc[(dataframe['TT4'] < Q1) | (dataframe['TT4'] > Q3), 'TT4'].index

    dataframe.loc[G_idx, 'TT4'] = "G"
    dataframe.loc[B_idx, 'TT4'] = "B"


    #T4U
    Q1 = 0.8
    Q3 = 1.8

    G_idx = dataframe.loc[(dataframe['T4U'] >= Q1) & (dataframe['T4U'] <= Q3), 'T4U'].index
    B_idx = dataframe.loc[(dataframe['T4U'] < Q1) | (dataframe['T4U'] > Q3), 'T4U'].index

    dataframe.loc[G_idx, 'T4U'] = "G"
    dataframe.loc[B_idx, 'T4U'] = "B"

    dataframe.loc[dataframe["Class"] == 0, "Class"] = "N"
    dataframe.loc[dataframe["Class"] == 1, "Class"] = "P"

    return dataframe.copy()

In [151]:
bn_data = categoryClassification(bn_data)
test_data = categoryClassification(test_data)
print(bn_data)
print(test_data)

     age T3 TT4 T4U Class
3760   H  G   G   G     N
1793   H  G   G   G     P
2302   H  G   B   G     N
1035   M  B   B   G     P
1453   H  G   B   G     N
...   .. ..  ..  ..   ...
159    H  G   G   G     N
3462   H  G   G   B     P
1973   H  G   G   G     P
663    H  G   B   G     P
3167   H  G   G   G     P

[296 rows x 5 columns]
     age T3 TT4 T4U Class
2856   H  B   G   G     P
309    H  G   G   G     P
2477   H  G   G   G     N
158    H  G   B   G     P
1621   H  B   G   G     P
...   .. ..  ..  ..   ...
1611   H  G   G   G     N
2464   M  G   B   G     N
1612   M  G   B   G     N
1176   H  G   B   G     N
2251   H  G   G   G     N

[128 rows x 5 columns]


## Função para calcular a probabilidade da feature

In [152]:
def featureProbFunction(bn_data):
    feature_prob = {}
    for feature in bn_data.columns:
        if (feature == "Class"):
            continue
        if (feature == "age"):
            continue
        prob = {}
        prob["B"] = bn_data[bn_data[feature] == 'B'].shape[0]/bn_data.shape[0]
        prob["G"] = bn_data[bn_data[feature] == 'G'].shape[0]/bn_data.shape[0]
        feature_prob[feature] = prob

    prob = {}
    feature = "age" 
    prob["L"] = bn_data[bn_data[feature] == 'L'].shape[0]/bn_data.shape[0]
    prob["M"] = bn_data[bn_data[feature] == 'M'].shape[0]/bn_data.shape[0]
    prob["H"] = bn_data[bn_data[feature] == 'H'].shape[0]/bn_data.shape[0]
    feature_prob[feature] = prob
    
    return feature_prob

In [153]:
feature_prob = featureProbFunction(bn_data)
print(feature_prob)

{'T3': {'B': 0.28378378378378377, 'G': 0.7162162162162162}, 'TT4': {'B': 0.24662162162162163, 'G': 0.7533783783783784}, 'T4U': {'B': 0.22972972972972974, 'G': 0.7702702702702703}, 'age': {'L': 0.016891891891891893, 'M': 0.2702702702702703, 'H': 0.7128378378378378}}


## Função para calcular as probabilidades das features em função das classes

In [154]:
def veracityProbFunction(bn_data):
    veracity_prob = {}
    for feature in bn_data.columns:
        count_P = 0
        count_N = 0
        count_GN = 0
        count_BN = 0
        count_GP = 0
        count_BP = 0
        if (feature == "Class"):
            continue
        elif (feature == "age"):
            continue
        else:
            for index, row in bn_data.iterrows():
                if row["Class"] == "P":
                    count_P += 1
                    if row[feature] == "G":
                        count_GP += 1
                    if row[feature] == "B":
                        count_BP += 1
                if row["Class"] == "N":
                    count_N += 1
                    if row[feature] == "G":
                        count_GN += 1
                    if row[feature] == "B":
                        count_BN += 1
            veracity_prob[feature] = {"GP": count_GP/count_P, "BP": count_BP/count_P, "GN": count_GN/count_N, "BN": count_BN/count_N}


    count_LP = 0
    count_MP = 0
    count_HP = 0
    count_LN = 0
    count_MN = 0
    count_HN = 0

    for index, row in bn_data.iterrows():
        if row["Class"] == "P":
            count_P += 1
            if row["age"] == "L":
                count_LP += 1
            elif row["age"] == "M":
                count_MP += 1
            else:
                count_HP += 1
        if row["Class"] == "N":
            count_N += 1
            if row["age"] == "L":
                count_LN += 1
            elif row["age"] == "M":
                count_MN += 1
            else:
                count_HN += 1
    veracity_prob["age"] = {"LP": count_LP/count_P, "MP": count_MP/count_P, "HP": count_HP/count_P, "LN": count_LN/count_N, "MN": count_MN/count_N, "HN": count_HN/count_N}

    veracity_prob["Class"] = {"P": bn_data[bn_data.Class == 'P'].shape[0]/bn_data.shape[0],"N": bn_data[bn_data.Class == 'N'].shape[0]/bn_data.shape[0]}

    return veracity_prob

In [155]:
veracity_prob = veracityProbFunction(bn_data)
print(veracity_prob)

{'T3': {'GP': 0.5384615384615384, 'BP': 0.46153846153846156, 'GN': 0.8823529411764706, 'BN': 0.11764705882352941}, 'TT4': {'GP': 0.8041958041958042, 'BP': 0.1958041958041958, 'GN': 0.7058823529411765, 'BN': 0.29411764705882354}, 'T4U': {'GP': 0.6083916083916084, 'BP': 0.3916083916083916, 'GN': 0.9215686274509803, 'BN': 0.0784313725490196}, 'age': {'LP': 0.013986013986013986, 'MP': 0.16783216783216784, 'HP': 0.8181818181818182, 'LN': 0.0196078431372549, 'MN': 0.3660130718954248, 'HN': 0.6143790849673203}, 'Class': {'P': 0.4831081081081081, 'N': 0.5168918918918919}}


## Função para calcular as probabilidades das combinações

In [156]:
def classProbFunction(bn_data):
    class_prob = {}
    for index, row in bn_data.iterrows():
        combination = row['age'] + row['T3'] + row['TT4'] + row['T4U']
        if combination not in class_prob:
            class_prob[combination] = 1;
        else:
            class_prob[combination] = class_prob[combination] +1

    for key in class_prob:
        class_prob[key] = class_prob[key]/bn_data.shape[0]
    return class_prob

In [157]:
class_prob = classProbFunction(bn_data)
print(class_prob)

{'HGGG': 0.30067567567567566, 'HGBG': 0.11148648648648649, 'MBBG': 0.02027027027027027, 'MBGG': 0.010135135135135136, 'MGGG': 0.14189189189189189, 'HBGB': 0.10135135135135136, 'HBGG': 0.08445945945945946, 'HGGB': 0.07094594594594594, 'HBBG': 0.037162162162162164, 'MGGB': 0.02702702702702703, 'MGBG': 0.05067567567567568, 'HBBB': 0.0033783783783783786, 'MBBB': 0.006756756756756757, 'HGBB': 0.0033783783783783786, 'MBGB': 0.013513513513513514, 'LGBG': 0.006756756756756757, 'LBBB': 0.0033783783783783786, 'LGGG': 0.0033783783783783786, 'LBBG': 0.0033783783783783786}


In [158]:
sum(class_prob.values())

1.0

## Função para testar

In [159]:
def testFunction(bn_data):
    test_class = []
    for index, row in bn_data.iterrows():
        prob_P = 1
        prob_N = 1
        for feature in bn_data.columns:
            if (feature != "Class"):
                prob_P = veracity_prob["age"][row["age"]+"P"]*veracity_prob["T3"][row["T3"]+"P"]*veracity_prob["TT4"][row["TT4"]+"P"]*veracity_prob["T4U"][row["T4U"]+"P"]*veracity_prob["Class"]["P"]
                prob_N = veracity_prob["age"][row["age"]+"N"]*veracity_prob["T3"][row["T3"]+"N"]*veracity_prob["TT4"][row["TT4"]+"N"]*veracity_prob["T4U"][row["T4U"]+"N"]*veracity_prob["Class"]["N"]
        if (prob_P > prob_N):
            test_class.append("P")
        else:
            test_class.append("N")
              
    return test_class

In [160]:
test_class = testFunction(test_data)
print(test_class)

['P', 'N', 'N', 'N', 'P', 'N', 'N', 'P', 'P', 'N', 'N', 'P', 'N', 'N', 'P', 'N', 'N', 'N', 'N', 'N', 'P', 'P', 'N', 'N', 'N', 'N', 'P', 'N', 'N', 'P', 'N', 'P', 'P', 'N', 'P', 'P', 'N', 'P', 'P', 'N', 'P', 'N', 'N', 'N', 'P', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'P', 'N', 'N', 'P', 'N', 'P', 'N', 'P', 'P', 'P', 'P', 'N', 'N', 'P', 'N', 'P', 'P', 'P', 'N', 'P', 'N', 'P', 'P', 'N', 'N', 'N', 'N', 'P', 'N', 'P', 'N', 'P', 'N', 'P', 'P', 'P', 'N', 'P', 'N', 'N', 'P', 'N', 'N', 'P', 'N', 'N', 'P', 'N', 'N', 'P', 'P', 'P', 'N', 'N', 'N', 'N', 'N', 'P', 'N', 'N', 'N', 'N', 'N', 'P', 'N', 'N', 'N', 'N', 'N', 'N', 'N']


In [161]:
count_P = 0
count_N = 0
for x in test_class:
    if x == "P":
        count_P += 1
    else:
        count_N += 1

print(count_P,count_N,count_P + count_N)

47 81 128


In [162]:
test_data["prediction class"] = np.array(test_class)

## Accuracy

In [164]:
print("Accuracy de BN: ",test_data[test_data["Class"] == test_data["prediction class"]].shape[0] / test_data.shape[0])

Accuracy de BN:  0.765625
