In [26]:
import pandas as pd
import numpy as np
from pgmpy.factors.discrete import TabularCPD
from pgmpy.models import BayesianModel
import matplotlib.pyplot as plt
import networkx as nx
from pgmpy.inference import VariableElimination
from tqdm import tqdm
from pgmpy.readwrite import BIFWriter
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import precision_recall_fscore_support
import pickle

In [3]:
feature_data = pd.read_csv("../../dataset/15features.csv")

In [4]:
for idx,columns in enumerate(feature_data.columns):
    if columns != "imagename":
        feature_data[str(columns)] = feature_data[str(columns)] - 1

In [5]:
feature_data.head()

Unnamed: 0,imagename,pen_pressure,letter_spacing,size,dimension,is_lowercase,is_continuous,slantness,tilt,entry_stroke_a,staff_of_a,formation_n,staff_of_d,exit_stroke_d,word_formation,constancy
0,0968c_num1.png,1,1,1,0,1,1,2,1,0,1,1,2,1,1,0
1,0809c_num2.png,1,1,1,1,1,1,2,0,0,1,1,2,0,1,1
2,0237b_num6.png,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1
3,0069b_num2.png,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0
4,0966c_num4.png,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1


In [6]:
seen_train = pd.read_csv("../../dataset/unseen-dataset/dataset_seen_training_siamese.csv")
val_data = pd.read_csv("../../dataset/unseen-dataset/dataset_seen_validation_siamese.csv")
val_data.head()

Unnamed: 0.1,Unnamed: 0,left,right,label
0,0,1469b_num3.png,1469b_num2.png,1
1,1,1469b_num3.png,1469c_num2.png,1
2,2,1469b_num3.png,1469a_num1.png,1
3,3,1469b_num3.png,1469a_num3.png,1
4,4,1469b_num3.png,1469c_num1.png,1


In [7]:
trainData = pd.merge(seen_train,feature_data.add_suffix('1'),left_on="left",right_on="imagename1",how="inner")
trainData = pd.merge(trainData,feature_data.add_suffix('2'),left_on="right",right_on="imagename2",how="inner")
trainData = trainData.drop(["Unnamed: 0","imagename1","imagename2"],axis=1)
val_data = pd.merge(val_data,feature_data.add_suffix('1'),left_on="left",right_on="imagename1",how="inner")
val_data = pd.merge(val_data,feature_data.add_suffix('2'),left_on="right",right_on="imagename2",how="inner")
val_data = val_data.drop(["Unnamed: 0","imagename1","imagename2"],axis=1)

In [8]:
val_data.head()

Unnamed: 0,left,right,label,pen_pressure1,letter_spacing1,size1,dimension1,is_lowercase1,is_continuous1,slantness1,...,is_continuous2,slantness2,tilt2,entry_stroke_a2,staff_of_a2,formation_n2,staff_of_d2,exit_stroke_d2,word_formation2,constancy2
0,1469b_num3.png,1469b_num2.png,1,1,1,1,1,1,1,1,...,0,0,0,0,0,1,1,1,1,1
1,1453b_num4.png,1469b_num2.png,0,0,1,1,1,1,0,0,...,0,0,0,0,0,1,1,1,1,1
2,1513b_num3.png,1469b_num2.png,0,0,1,1,0,1,0,1,...,0,0,0,0,0,1,1,1,1,1
3,1515c_num2.png,1469b_num2.png,0,1,2,1,1,1,1,3,...,0,0,0,0,0,1,1,1,1,1
4,1517b_num1.png,1469b_num2.png,0,1,2,0,0,1,1,2,...,0,0,0,0,0,1,1,1,1,1


In [9]:
combined_model = BayesianModel([('pen_pressure1','is_pen_pressure_sim'),
                                ('pen_pressure2','is_pen_pressure_sim'),
                                ('slantness1','is_slantness_sim'),
                                ('slantness2','is_slantness_sim'),
                                ('tilt1','is_tilt_sim'),
                                ('tilt2','is_tilt_sim'),
                                ('is_slantness_sim','is_tilt_sim'),
                                ('staff_of_a1','is_staff_of_a_sim'),
                                ('staff_of_a2','is_staff_of_a_sim'),
                                ('staff_of_d1','is_staff_of_d_sim'),
                                ('staff_of_d2','is_staff_of_d_sim'),
                                ('is_staff_of_a_sim','is_staff_of_d_sim'),
                                ('entry_stroke_a1','entry_stroke_a_sim'),
                                ('entry_stroke_a2','entry_stroke_a_sim'),
                                ('exit_stroke_d1','is_exit_stroke_d_sim'),
                                ('exit_stroke_d2','is_exit_stroke_d_sim'),
                                ('entry_stroke_a_sim','is_exit_stroke_d_sim'),
                                ('is_lowercase1','is_lowercase_sim'),
                                ('is_lowercase2','is_lowercase_sim'),
                                ('is_continuous1','is_continuous_sim'),
                                ('is_continuous2','is_continuous_sim'),
                                ('is_lowercase_sim','is_continuous_sim'),
                                ('dimension1','dimension_sim'),
                                ('dimension2','dimension_sim'),
                                ('letter_spacing1','letter_spacing_sim'),
                                ('letter_spacing2','letter_spacing_sim'),
                                ('size1','size_sim'),
                                ('size2','size_sim'),
                                ('dimension_sim','size_sim'),
                                ('letter_spacing_sim','size_sim'),
                                ('constancy1','constancy_sim'),
                                ('constancy2','constancy_sim'),
                                ('size_sim','constancy_sim'),
                                ('word_formation1','word_formation_sim'),
                                ('word_formation2','word_formation_sim'),
                                ('constancy_sim','word_formation_sim'),
                                ('formation_n1','formation_n_sim'),
                                ('formation_n2','formation_n_sim'),
                                ('word_formation_sim','formation_n_sim')
                               ])

cpd_pen_pressure1 = TabularCPD('pen_pressure1',2,[[0.5],
                                                [0.5]],
                                                evidence=[], evidence_card=[])
cpd_pen_pressure2 = TabularCPD('pen_pressure2',2,[[0.5],
                                                [0.5]],
                                                evidence=[], evidence_card=[])
cpd_is_pen_pressure_sim = TabularCPD('is_pen_pressure_sim',2,[[0.1,0.9,0.9,0.1],
                                                            [0.9,0.1,0.1,0.9]],
                                                            evidence=['pen_pressure1','pen_pressure2'], 
                                                            evidence_card=[2,2])
cpd_slantness1 = TabularCPD('slantness1',4,[[0.25],[0.25],[0.25],[0.25]],
                                                evidence=[], evidence_card=[])
cpd_slantness2 = TabularCPD('slantness2',4,[[0.25],[0.25],[0.25],[0.25]],
                                                evidence=[], evidence_card=[])
cpd_is_slantness_sim = TabularCPD('is_slantness_sim',2,[[0.1,0.2,0.3,0.4,0.2,0.1,0.3,0.4,0.3,0.2,0.1,0.4,0.4,0.3,0.2,0.1],
                                                            [0.9,0.8,0.7,0.6,0.8,0.9,0.7,0.6,0.7,0.8,0.9,0.6,0.6,0.7,0.8,0.9]],
                                                            evidence=['slantness1','slantness2'], 
                                                            evidence_card=[4,4])
cpd_tilt1 = TabularCPD('tilt1',2,[[0.5],
                                                [0.5]],
                                                evidence=[], evidence_card=[])
cpd_tilt2 = TabularCPD('tilt2',2,[[0.5],
                                                [0.5]],
                                                evidence=[], evidence_card=[])
cpd_is_tilt_sim = TabularCPD('is_tilt_sim',2,[[0.4,0.1,0.9,0.6,0.9,0.6,0.4,0.1],
                                                            [0.6,0.9,0.1,0.4,0.1,0.4,0.6,0.9]],
                                                            evidence=['tilt1','tilt2','is_slantness_sim'], 
                                                            evidence_card=[2,2,2])
cpd_staff_of_a1 = TabularCPD('staff_of_a1',4,[[0.25],[0.25],[0.25],[0.25]],
                                                evidence=[], evidence_card=[])
cpd_staff_of_a2 = TabularCPD('staff_of_a2',4,[[0.25],[0.25],[0.25],[0.25]],
                                                evidence=[], evidence_card=[])
cpd_is_staff_of_a_sim = TabularCPD('is_staff_of_a_sim',2,[[0.1,0.2,0.3,0.4,0.2,0.1,0.3,0.4,0.3,0.2,0.1,0.4,0.4,0.3,0.2,0.1],
                                                            [0.9,0.8,0.7,0.6,0.8,0.9,0.7,0.6,0.7,0.8,0.9,0.6,0.6,0.7,0.8,0.9]],
                                                            evidence=['staff_of_a1','staff_of_a2'], 
                                                            evidence_card=[4,4])
cpd_staff_of_d1 = TabularCPD('staff_of_d1',3,[[0.33],
                                    [0.34],[0.33]],
                                    evidence=[], evidence_card=[])
cpd_staff_of_d2 = TabularCPD('staff_of_d2',3,[[0.33],
                                    [0.34],[0.33]],
                                    evidence=[], evidence_card=[])
cpd_is_staff_of_d_sim = TabularCPD('is_staff_of_d_sim',2,[[0.4,0.1,0.9,0.6,0.9,0.6,0.1,0.6,0.4,0.1,0.9,0.6,0.9,0.6,0.9,0.6,0.4,0.9],
                                              [0.6,0.9,0.1,0.4,0.1,0.4,0.9,0.4,0.6,0.9,0.1,0.4,0.1,0.4,0.1,0.4,0.6,0.1]],
                             evidence=['staff_of_d1','staff_of_d2','is_staff_of_a_sim'], 
                             evidence_card=[3,3,2])
cpd_exit_stroke_d1 = TabularCPD('exit_stroke_d1',4,[[0.25],[0.25],[0.25],[0.25]],
                                                evidence=[], evidence_card=[])
cpd_exit_stroke_d2 = TabularCPD('exit_stroke_d2',4,[[0.25],[0.25],[0.25],[0.25]],
                                                evidence=[], evidence_card=[])
cpd_is_exit_stroke_d_sim = TabularCPD('is_exit_stroke_d_sim',2,[[0.9,0.1,0.9,0.6,0.9,0.6,0.9,0.6,0.9,0.6,0.4,0.1,0.9,0.6,0.9,0.6,0.9,0.6,0.9,0.6,0.4,0.1,0.9,0.6,0.9,0.6,0.9,0.6,0.9,0.6,0.4,0.1],
                                                            [0.1,0.9,0.1,0.4,0.1,0.4,0.1,0.4,0.1,0.4,0.6,0.9,0.1,0.4,0.1,0.4,0.1,0.4,0.1,0.4,0.6,0.9,0.1,0.4,0.1,0.4,0.1,0.4,0.1,0.4,0.6,0.9]],
                                                            evidence=['exit_stroke_d1','exit_stroke_d2','entry_stroke_a_sim'], 
                                                            evidence_card=[4,4,2])

cpd_is_lowercase1 = TabularCPD('is_lowercase1',2,[[0.5],
                                                [0.5]],
                                                evidence=[], evidence_card=[])
cpd_is_lowercase2 = TabularCPD('is_lowercase2',2,[[0.5],
                                                [0.5]],
                                                evidence=[], evidence_card=[])
cpd_is_continuous1 = TabularCPD('is_continuous1',2,[[0.5],
                                                [0.5]],
                                                evidence=[], evidence_card=[])
cpd_is_continuous2 = TabularCPD('is_continuous2',2,[[0.5],
                                                [0.5]],
                                                evidence=[], evidence_card=[])
cpd_dimension1 = TabularCPD('dimension1',3,[[0.33],
                                    [0.34],[0.33]],
                                                evidence=[], evidence_card=[])
cpd_dimension2 = TabularCPD('dimension2',3,[[0.33],
                                    [0.34],[0.33]],
                                                evidence=[], evidence_card=[])
cpd_letter_spacing1 = TabularCPD('letter_spacing1',3,[[0.33],
                                    [0.34],[0.33]],
                                    evidence=[], evidence_card=[])
cpd_letter_spacing2 = TabularCPD('letter_spacing2',3,[[0.33],
                                    [0.34],[0.33]],
                                    evidence=[], evidence_card=[])
cpd_size1 = TabularCPD('size1',3,[[0.33],
                                    [0.34],[0.33]],
                                    evidence=[], evidence_card=[])
cpd_size2 = TabularCPD('size2',3,[[0.33],
                                    [0.34],[0.33]],
                                    evidence=[], evidence_card=[])
cpd_constancy1 = TabularCPD('constancy1',2,[[0.5],
                                    [0.5]],
                                    evidence=[], evidence_card=[])
cpd_constancy2 = TabularCPD('constancy2',2,[[0.5],
                                    [0.5]],
                                    evidence=[], evidence_card=[])
cpd_word_formation1 = TabularCPD('word_formation1',2,[[0.5],
                                    [0.5]],
                                    evidence=[], evidence_card=[])
cpd_word_formation2 = TabularCPD('word_formation2',2,[[0.5],
                                    [0.5]],
                                    evidence=[], evidence_card=[])
cpd_formation_n1 = TabularCPD('formation_n1',2,[[0.5],
                                    [0.5]],
                                    evidence=[], evidence_card=[])
cpd_formation_n2 = TabularCPD('formation_n2',2,[[0.5],
                                    [0.5]],
                                    evidence=[], evidence_card=[])
cpd_entry_stroke_a1 = TabularCPD('entry_stroke_a1',2,[[0.5],
                                    [0.5]],
                                    evidence=[], evidence_card=[])
cpd_entry_stroke_a2 = TabularCPD('entry_stroke_a2',2,[[0.5],
                                    [0.5]],
                                    evidence=[], evidence_card=[])
cpd_is_lowercase_sim = TabularCPD('is_lowercase_sim',2,[[0.1,0.9,0.9,0.1],
                                                            [0.9,0.1,0.1,0.9]],
                                                            evidence=['is_lowercase1','is_lowercase2'], 
                                                            evidence_card=[2,2])
cpd_is_continuous_sim = TabularCPD('is_continuous_sim',2,[[0.9,0.1,0.9,0.6,0.9,0.6,0.9,0.1],
                                                            [0.1,0.9,0.1,0.4,0.1,0.4,0.1,0.9]],
                                                            evidence=['is_continuous1','is_continuous2','is_lowercase_sim'], 
                                                            evidence_card=[2,2,2])
cpd_dimension_sim = TabularCPD('dimension_sim',2,[[0.1,0.8,0.9,0.8,0.1,0.8,0.9,0.8,0.1],
                                                [0.9,0.2,0.1,0.2,0.9,0.2,0.1,0.2,0.9]],
                                                evidence=['dimension1','dimension2'], evidence_card=[3,3])
cpd_letter_spacing_sim = TabularCPD('letter_spacing_sim',2,[[0.1,0.8,0.9,0.8,0.1,0.8,0.9,0.8,0.1],
                                                [0.9,0.2,0.1,0.2,0.9,0.2,0.1,0.2,0.9]],
                                                evidence=['letter_spacing1','letter_spacing2'], evidence_card=[3,3])
cpd_size_sim = TabularCPD('size_sim',2,[[0.6,0.3,0.3,0.1,0.8,0.7,0.7,0.3,0.9,0.8,0.7,0.4,0.7,0.6,0.6,0.3,0.6,0.3,0.3,0.1,0.8,0.4,0.4,0.85,0.9,0.8,0.8,0.3,0.8,0.4,0.4,0.85,0.6,0.3,0.3,0.1],
                                        [0.4,0.7,0.7,0.9,0.2,0.3,0.3,0.7,0.1,0.2,0.3,0.6,0.3,0.4,0.4,0.7,0.4,0.7,0.7,0.9,0.2,0.6,0.6,0.15,0.1,0.2,0.2,0.7,0.2,0.6,0.6,0.15,0.4,0.7,0.7,0.9]],
                                        evidence=['size1','size2','dimension_sim','letter_spacing_sim'], evidence_card=[3,3,2,2])
cpd_constancy_sim = TabularCPD('constancy_sim',2,[[0.9,0.1,0.9,0.6,0.9,0.6,0.7,0.1],
                                        [0.1,0.9,0.1,0.4,0.1,0.4,0.3,0.9]],
                                        evidence=['constancy1','constancy2','size_sim'], evidence_card=[2,2,2])
cpd_word_formation_sim = TabularCPD('word_formation_sim',2,[[0.9,0.1,0.9,0.7,0.9,0.7,0.9,0.1],
                                        [0.1,0.9,0.1,0.3,0.1,0.3,0.1,0.9]],
                                        evidence=['word_formation1','word_formation2','constancy_sim'], evidence_card=[2,2,2])
cpd_formation_n_sim = TabularCPD('formation_n_sim',2,[[0.7,0.1,0.9,0.4,0.9,0.4,0.6,0.1],
                                        [0.3,0.9,0.1,0.6,0.1,0.6,0.4,0.9]],
                                        evidence=['formation_n1','formation_n2','word_formation_sim'], evidence_card=[2,2,2])
cpd_entry_stroke_a_sim = TabularCPD('entry_stroke_a_sim',2,[[0.1,0.9,0.9,0.1],
                                                            [0.9,0.1,0.1,0.9]],
                                        evidence=['entry_stroke_a1','entry_stroke_a2'], evidence_card=[2,2])

combined_model.add_cpds(cpd_pen_pressure1,
                        cpd_pen_pressure2,
                        cpd_is_pen_pressure_sim,
                        cpd_slantness1,
                        cpd_slantness2,
                        cpd_is_slantness_sim,
                        cpd_tilt1,
                        cpd_tilt2,
                        cpd_is_tilt_sim,
                        cpd_staff_of_a1,
                        cpd_staff_of_a2,
                        cpd_is_staff_of_a_sim,
                        cpd_staff_of_d1,
                        cpd_staff_of_d2,
                        cpd_is_staff_of_d_sim,
                        cpd_exit_stroke_d1,
                        cpd_exit_stroke_d2,
                        cpd_is_exit_stroke_d_sim,
                        cpd_is_lowercase1,
                        cpd_is_lowercase2,
                        cpd_is_lowercase_sim,
                        cpd_is_continuous1,
                        cpd_is_continuous2,
                        cpd_is_continuous_sim,
                        cpd_dimension1,
                        cpd_dimension2,
                        cpd_dimension_sim,
                        cpd_letter_spacing1,
                        cpd_letter_spacing2,
                        cpd_letter_spacing_sim,
                        cpd_size1,
                        cpd_size2,
                        cpd_size_sim,
                        cpd_constancy1,
                        cpd_constancy2,
                        cpd_constancy_sim,
                        cpd_word_formation1,
                        cpd_word_formation2,
                        cpd_word_formation_sim,
                        cpd_formation_n1,
                        cpd_formation_n2,
                        cpd_formation_n_sim,
                        cpd_entry_stroke_a1,
                        cpd_entry_stroke_a2,
                        cpd_entry_stroke_a_sim
                       )
combined_model.check_model()

True

In [37]:
model_data = BIFWriter(combined_model)
model_data.write_bif(filename='weights/pgmModel.bif')

In [10]:
mle = VariableElimination(combined_model)

In [11]:
for idx,columns in enumerate(feature_data.columns):
    if idx != 0:
        print(str(np.unique(feature_data[columns]))+columns)

[0 1]pen_pressure
[0 1 2]letter_spacing
[0 1 2]size
[0 1 2]dimension
[0 1]is_lowercase
[0 1]is_continuous
[0 1 2 3]slantness
[0 1]tilt
[0 1]entry_stroke_a
[0 1 2 3]staff_of_a
[0 1]formation_n
[0 1 2]staff_of_d
[0 1 2 3]exit_stroke_d
[0 1]word_formation
[0 1]constancy


## Learning the weights in Structured CPD

### Training

In [None]:
'''
Used c4.8xlarge to run inferences for 3hours, around 10-11 iterations per second
On my pc i3 estimate was 5 hours
Using instances > c4.8xlarge doesn't affect much in inference
Pgmpy Why are you not parallelizable?
'''

'''
simFeatures = [[] for _ in range(100)]
var = {'is_pen_pressure_sim',
       'is_slantness_sim',
       'is_tilt_sim',
       'is_staff_of_a_sim',
       'is_staff_of_d_sim',
       'entry_stroke_a_sim',
       'is_exit_stroke_d_sim',
      'is_lowercase_sim',
      'is_continuous_sim',
      'dimension_sim',
      'letter_spacing_sim',
       'size_sim',
       'constancy_sim',
       'word_formation_sim',
       'formation_n_sim'
      }
evidence_labels = trainData.columns[3:]
for idx in tqdm(range(100)):
    inf = mle.query(variables=var,evidence=dict(zip(evidence_labels,trainData.iloc[idx,3:].tolist())))
    for simfeature in var:
        simFeatures[idx].append(np.argmax(inf[simfeature].values))

simDf = pd.DataFrame(data=simFeatures,columns=var)

simDf = pd.concat([simDf,trainData.label],axis=1)

simDf.to_csv("./sigTrainData.csv")

'''

In [16]:
var = {'is_pen_pressure_sim',
       'is_slantness_sim',
       'is_tilt_sim',
       'is_staff_of_a_sim',
       'is_staff_of_d_sim',
       'entry_stroke_a_sim',
       'is_exit_stroke_d_sim',
      'is_lowercase_sim',
      'is_continuous_sim',
      'dimension_sim',
      'letter_spacing_sim',
       'size_sim',
       'constancy_sim',
       'word_formation_sim',
       'formation_n_sim'
      }
evidence_labels = trainData.columns[3:]

In [13]:
'''simDf = pd.read_csv("./sigTrainData.csv")
simDf = simDf.iloc[0:,1:]
simDf.head()'''

'simDf = pd.read_csv("./sigTrainData.csv")\nsimDf = simDf.iloc[0:,1:]\nsimDf.head()'

In [13]:
'''xTrain = simDf.iloc[0:,0:15].values.tolist()
xTrain = np.array(xTrain)
yTrain = np.array(simDf.iloc[0:,15:].values.tolist())
yTrain = yTrain.ravel()'''

In [17]:
'''model = LogisticRegressionCV(cv=100, random_state=0,
                            fit_intercept=True,max_iter=10000).fit(xTrain, yTrain)

pred = model.predict(xTrain)

precision,recall,f1,_ = precision_recall_fscore_support(yTrain, pred, average='binary')
print(precision,recall,f1)'''

0.7442358016832874 0.7581095441364529 0.7511086128286804


In [18]:
'''weights = model.coef_'''

In [20]:
'''with open('weights/sigmoid_cpd_weight', 'wb') as fp:
    pickle.dump(weights, fp)'''

In [19]:
b = model.intercept_

In [21]:
'''with open('weights/sigmoid_cpd_bias', 'wb') as fp:
    pickle.dump(b, fp)'''

In [14]:
def deterministic_sigmoid_node(biasNodeAccumulatorNode):
    return 1/(1+np.exp(biasNodeAccumulatorNode*-1))

def deterministic_verification_node(nodeOutputs,weights,b):
    weightFeaturesAccumulatorNode = np.dot(weights,nodeOutputs)
    biasNodeAccumulatorNode = b[0]+weightFeaturesAccumulatorNode[0]
    return deterministic_sigmoid_node(biasNodeAccumulatorNode)

### Validation

In [17]:
simFeatures_val = [[] for _ in range(len(val_data))]
evidence_labels = val_data.columns[3:]
for idx in tqdm(range(len(val_data))):
    inf = mle.query(variables=var,evidence=dict(zip(evidence_labels,val_data.iloc[idx,3:].tolist())))
    for simfeature in var:
        simFeatures_val[idx].append(np.argmax(inf[simfeature].values))

  phi.values = phi.values[slice_]
  phi1.values = phi1.values[slice_]
100%|██████████| 7221/7221 [21:07<00:00,  5.84it/s]


In [20]:
'''deterministicNodePred = []
for nodeOutputs in simFeatures_val:
    deterministicNodePred.append(deterministic_verification_node(nodeOutputs,weights,b))

precision,recall,f1,_ = precision_recall_fscore_support(list(val_data.label), np.round(deterministicNodePred), average='binary')
print(precision,recall,f1)'''

"deterministicNodePred = []\nfor nodeOutputs in simFeatures_val:\n    deterministicNodePred.append(deterministic_verification_node(nodeOutputs,weights,b))\n\nprecision,recall,f1,_ = precision_recall_fscore_support(list(val_data.label), np.round(deterministicNodePred), average='binary')\nprint(precision,recall,f1)"

### Entropy based Approach

In [21]:
entropyDict={}
for feature in trainData.columns[3:]:
    feature_states = np.unique(trainData[feature])
    temp = []
    for state in feature_states:
        query = str(feature+"=="+str(state))
        prob = len(trainData.query(query))/len(trainData[feature])
        temp.append(prob)
    entropyDict[feature] = temp 

In [22]:
entropyRows_pos = []
entropyRows_neg = []
for idx_r in tqdm(range(len(trainData))):
    entropy = 0
    for idx_c,cols in enumerate(trainData.columns[3:]):
        prob = entropyDict[cols][int(trainData.iloc[idx_r,idx_c+3])]
        entropy -= prob * np.log2(prob)
    if(trainData.iloc[idx_r,2] == 1):
        entropyRows_pos.append(entropy)
    if(trainData.iloc[idx_r,2] == 0):
        entropyRows_neg.append(entropy)
entropyRows_pos = np.array(entropyRows_pos)
entropyRows_neg = np.array(entropyRows_neg)

100%|██████████| 127273/127273 [01:40<00:00, 1272.72it/s]


In [23]:
top100_pos = entropyRows_pos.argsort()[-500:][::-1]
top100_neg = entropyRows_neg.argsort()[-100:][::-1]
entropyData = np.append(top100_pos,top100_neg)

In [24]:
entropyTrain = [[] for _ in range(len(entropyData))]
entropyLabels = []
for i,idx in enumerate(tqdm(entropyData)):
    inf = mle.query(variables=var,evidence=dict(zip(evidence_labels,trainData.iloc[idx,3:].tolist())))
    entropyLabels.append(trainData.iloc[idx,32])
    for simfeature in var:
        entropyTrain[i].append(np.argmax(inf[simfeature].values))
entropyLabels = np.array(entropyLabels)
entropyTrain = np.array(entropyTrain)

100%|██████████| 600/600 [01:48<00:00,  5.89it/s]


In [27]:
model = LogisticRegressionCV(cv=10,random_state=0,
                            fit_intercept=True,max_iter=500,penalty='l2').fit(entropyTrain, entropyLabels)

pred = model.predict(entropyTrain)

precision,recall,f1,_ = precision_recall_fscore_support(entropyLabels, pred, average='binary')
print(precision,recall,f1)

0.6678082191780822 0.9848484848484849 0.7959183673469388


In [28]:
weights = model.coef_
b = model.intercept_

In [29]:
with open('../weights/sigmoid_cpd_weight_entropy_unseen', 'wb') as fp:
    pickle.dump(weights, fp)
with open('../weights/sigmoid_cpd_bias_entropy_unseen', 'wb') as fp:
    pickle.dump(b, fp)

In [30]:
deterministicNodePred = []
for nodeOutputs in simFeatures_val:
    deterministicNodePred.append(deterministic_verification_node(nodeOutputs,weights,b))
    
precision,recall,f1,_ = precision_recall_fscore_support(list(val_data.label), np.round(deterministicNodePred), average='binary')
print(precision,recall,f1)

0.48645179691956647 0.9584152851924698 0.6453504871819128


In [None]:
'''
Entropy based approach needs more research, there is an overfitting due to only high entropy values provided. Maybe
a different classifier could be used but then it wouldn't be sigmoid cpd. Regularization doesn't help.
'''