In [1]:
import pandas as pd
import numpy as np
import random
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination
from pgmpy.factors.discrete import TabularCPD
from collections import defaultdict
from pgmpy.estimators import K2Score
import time
from os import listdir
from os.path import isfile, join

## Preprocessing

In [2]:
data = pd.read_csv('../15features.csv')
data = data.set_index('imagename')


data[data.columns]-=1
data.head()

Unnamed: 0_level_0,pen_pressure,letter_spacing,size,dimension,is_lowercase,is_continuous,slantness,tilt,entry_stroke_a,staff_of_a,formation_n,staff_of_d,exit_stroke_d,word_formation,constancy
imagename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0968c_num1.png,1,1,1,0,1,1,2,1,0,1,1,2,1,1,0
0809c_num2.png,1,1,1,1,1,1,2,0,0,1,1,2,0,1,1
0237b_num6.png,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1
0069b_num2.png,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0
0966c_num4.png,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1


In [0]:
def data_csv(data_dir):
    train_data = pd.read_csv(join(data_dir,"dataset_seen_training_siamese.csv"),index_col=0)
    val_data = pd.read_csv(join(data_dir,"dataset_seen_validation_siamese.csv"),index_col=0)
    train_data = train_data.values.tolist()
    val_data = val_data.values.tolist()
    tmp = []
    for i in train_data:
        try:
            tmp.append(list(data.loc[i[0]])+list(data.loc[i[1]])+[i[2]])
        except:
            continue
    train_data = tmp
    tmp = []
    for i in val_data:
        try:
            tmp.append(list(data.loc[i[0]])+list(data.loc[i[1]])+[i[2]])
        except:
            continue
    val_data = tmp
    train_data = pd.DataFrame(train_data)
    val_data = pd.DataFrame(val_data)
    train_data.columns = val_data.columns = ["f%d"%i for i in range(1,16)]+["g%d"%i for i in range(1,16)]+["op"]
    return train_data,val_data

In [0]:
seen_train, seen_val = data_csv('../seen-dataset')
unseen_train, unseen_val = data_csv('../unseen-dataset')
shuffled_train, shuffled_val = data_csv('../shuffled-dataset')

## Check correlation of edges

In [0]:
corr = []
col = list(data.columns)
for i in col:
    tmp = []
    for j in col:
        tmp.append(np.correlate(data[i],data[j]))
    corr.append(tmp)

In [0]:
data.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15
imagename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0968c_num1.png,1,1,1,0,1,1,2,1,0,1,1,2,1,1,0
0809c_num2.png,1,1,1,1,1,1,2,0,0,1,1,2,0,1,1
0237b_num6.png,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1
0069b_num2.png,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0
0966c_num4.png,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1


In [0]:
corr =np.array(corr)

In [0]:
# Min max normalised correlation
corr = (corr-np.mean(corr))/(np.max(corr)-np.min(corr))

In [0]:
corr_dic = defaultdict(lambda: 0)

In [0]:
for i in range(15):
    for j in range(i,15):
        print("%.3f"%corr[i][j],end= "\t ")
        if i!=j:
            corr_dic["f%d f%d"%(i,j)] =corr[i][j]
    print()

-0.020	 0.002	 -0.054	 -0.081	 -0.021	 -0.088	 -0.062	 -0.207	 -0.232	 0.028	 -0.078	 0.062	 0.060	 -0.155	 -0.115	 
0.347	 0.146	 0.092	 0.142	 0.001	 0.038	 -0.172	 -0.229	 0.245	 0.057	 0.265	 0.246	 -0.076	 -0.008	 
0.332	 0.222	 0.134	 -0.022	 -0.003	 -0.177	 -0.234	 0.252	 0.085	 0.233	 0.177	 -0.042	 0.020	 
0.231	 0.083	 -0.054	 -0.039	 -0.186	 -0.236	 0.188	 0.046	 0.169	 0.122	 -0.066	 -0.012	 
0.131	 0.008	 0.035	 -0.177	 -0.226	 0.220	 0.043	 0.253	 0.231	 -0.086	 -0.020	 
0.008	 -0.041	 -0.199	 -0.228	 0.058	 -0.067	 0.124	 0.129	 -0.154	 -0.106	 
0.297	 -0.178	 -0.227	 0.087	 -0.037	 0.139	 0.123	 -0.141	 -0.082	 
-0.176	 -0.244	 -0.161	 -0.195	 -0.153	 -0.160	 -0.221	 -0.208	 
-0.226	 -0.223	 -0.232	 -0.219	 -0.214	 -0.239	 -0.235	 
0.656	 0.130	 0.372	 0.339	 -0.027	 0.056	 
0.048	 0.127	 0.111	 -0.088	 -0.032	 
0.566	 0.440	 -0.041	 0.049	 
0.756	 -0.044	 0.042	 
-0.082	 -0.104	 
-0.016	 


In [0]:
sorted(corr_dic.items(), key=lambda x: x[1],reverse = True)[:30]
# print top 30 candidates

[('f11 f12', array([0.43984617])),
 ('f9 f11', array([0.37178676])),
 ('f9 f12', array([0.3391035])),
 ('f1 f11', array([0.26472287])),
 ('f4 f11', array([0.25256231])),
 ('f2 f9', array([0.25233554])),
 ('f1 f12', array([0.24615606])),
 ('f1 f9', array([0.2450222])),
 ('f2 f11', array([0.23337188])),
 ('f4 f12', array([0.23050891])),
 ('f2 f3', array([0.22243022])),
 ('f4 f9', array([0.22024756])),
 ('f3 f9', array([0.18849973])),
 ('f2 f12', array([0.17659429])),
 ('f3 f11', array([0.16916756])),
 ('f1 f2', array([0.14592362])),
 ('f1 f4', array([0.14221025])),
 ('f6 f11', array([0.13883705])),
 ('f2 f4', array([0.1338481])),
 ('f9 f10', array([0.12996466])),
 ('f5 f12', array([0.12871743])),
 ('f10 f11', array([0.12678988])),
 ('f5 f11', array([0.12378517])),
 ('f6 f12', array([0.12299148])),
 ('f3 f12', array([0.12177259])),
 ('f10 f12', array([0.11057581])),
 ('f1 f3', array([0.09178222])),
 ('f6 f9', array([0.08738855])),
 ('f2 f10', array([0.08548935])),
 ('f3 f4', array([0.0834

## Check k2 score of individual edges

In [0]:
data.columns = ["f%d"%i for i in range(1,16)]

In [0]:
data.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15
imagename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0968c_num1.png,1,1,1,0,1,1,2,1,0,1,1,2,1,1,0
0809c_num2.png,1,1,1,1,1,1,2,0,0,1,1,2,0,1,1
0237b_num6.png,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1
0069b_num2.png,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0
0966c_num4.png,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1


In [0]:
k2 = defaultdict(lambda:None )

In [0]:
k = K2Score(data)

for i in range(15):
    for j in range(i,15):
        if i!=j and i!=4 and j!=4:
            model = BayesianModel()
            model.add_edge("f%d"%(i+1),"f%d"%(j+1))
            k2["f%d ,f%d"%(i+1,j+1)]=k.score(model)
        

In [0]:
sorted(k2.items(), key=lambda x: x[1],reverse = True)[:30]
# print top 30 candidates

[('f8 ,f9', -9524.033680293294),
 ('f9 ,f11', -10285.122960875375),
 ('f6 ,f9', -11452.340941317532),
 ('f9 ,f15', -12086.035915314536),
 ('f1 ,f9', -12098.069402164354),
 ('f9 ,f14', -12248.037601392767),
 ('f8 ,f11', -13870.228943654525),
 ('f11 ,f15', -14851.989244428369),
 ('f6 ,f8', -15204.614812135576),
 ('f11 ,f14', -15234.772889889231),
 ('f6 ,f11', -15630.797470868172),
 ('f8 ,f15', -15665.707773198032),
 ('f1 ,f8', -15725.475170057267),
 ('f9 ,f12', -15752.811190782915),
 ('f8 ,f14', -15820.858078680849),
 ('f1 ,f11', -16459.475622050362),
 ('f4 ,f9', -16578.909919642945),
 ('f3 ,f9', -16718.845178027514),
 ('f14 ,f15', -16744.13799097287),
 ('f2 ,f9', -16879.229972134784),
 ('f6 ,f15', -17622.73650249384),
 ('f6 ,f14', -17720.96756774361),
 ('f1 ,f6', -17765.47675687581),
 ('f9 ,f10', -17963.165791796662),
 ('f1 ,f15', -18275.541867408414),
 ('f7 ,f9', -18343.3103816863),
 ('f1 ,f14', -18435.44540413588),
 ('f8 ,f12', -19356.736126379303),
 ('f11 ,f12', -19888.266571680062),

In [0]:
sum(list(data['f5']))/len(data["f5"]) # Probability of lowercase

0.984966838614591

## Generate intiutive models

In [0]:
img1 = [('f8','f9'),('f9','f11'),('f9','f14'),('f11','f14'),('f14','f15'),('f15','f6'),('f6','f12')]
img2 = [("g"+x[1:],"g"+y[1:]) for x,y in img1]
op = [('f12','op'),('g12','op')]

In [0]:
edges = img1 + img2 + op

In [0]:
edges

[('f8', 'f9'),
 ('f9', 'f11'),
 ('f9', 'f14'),
 ('f11', 'f14'),
 ('f14', 'f15'),
 ('f15', 'f6'),
 ('f6', 'f12'),
 ('g8', 'g9'),
 ('g9', 'g11'),
 ('g9', 'g14'),
 ('g11', 'g14'),
 ('g14', 'g15'),
 ('g15', 'g6'),
 ('g6', 'g12'),
 ('f12', 'op'),
 ('g12', 'op')]

In [0]:
edges2 = [(y,x) for x,y in img1]+ [(y,x) for x,y in img2] +[('f9','op'),('g9','op')]
edges3 = [("f%d"%i,"f%d"%(i+1)) for i in range(1,15) ]+ [("g%d"%i,"g%d"%(i+1)) for i in range(1,15) ]+[('f15','op'),('g15','op')]
edges4 = [("f%d"%(i),"f%d"%(i-1)) for i in range(15,1,-1)]+ [("g%d"%(i),"g%d"%(i-1)) for i in range(15,1,-1) ]+[('f15','op'),('g15','op')]
edges5 = [('f7','f8'),('f12','f14'),('g7','g8'),('g12','g14'),('f8','op'),('f12','op'),('g8','op'),('g12','op')]

In [0]:
def accuracy(pred,obs):
    total = (pred == obs).astype('int64')
    total = total.values.tolist()
    return sum(total)/len(pred)


## Inference using 30 features

In [0]:
def train_30(train,val,edges):
    model = BayesianModel(edges)
    train_time = time.time()
    model.fit(train)
    train_time = time.time()-train_time
    predict_data = val.copy()
    predict_time = time.time()
    predict_data = predict_data[model.nodes()]
    predict_data.drop('op', axis=1, inplace=True)
    y_pred = model.predict(predict_data)
    predict_time = time.time()-predict_time
    y_pred.columns = ['op']
    acc = accuracy(y_pred['op'],val['op'])
    return model,acc,train_time,predict_time

def train_all_30(edges):
    seen_model = train_30(seen_train,seen_val,edges)
    unseen_model = train_30(unseen_train,unseen_val,edges)
    shuffled_model = train_30(shuffled_train,shuffled_val,edges)
    return seen_model,unseen_model,shuffled_model

In [0]:
seen_30, unseen_30, shuffled_30 = train_all_30(edges)

In [0]:
print("Inferencing using features of two image at once\n")
print("Accuracy on seen data: %f"%seen_30[1])
print("Runtime on seen data: %f s"%seen_30[2])
print("Infer time on seen data: %f s"%seen_30[3])
print()
print("Accuracy on unseen data: %f"%unseen_30[1])
print("Runtime on unseen data: %f s"%unseen_30[2])
print("Infer time on unseen data: %f s"%unseen_30[3])
print()
print("Accuracy on shuffled data: %f"%shuffled_30[1])
print("Runtime on shuffled data: %f"%shuffled_30[2])
print("Infer time on shuffled data: %f"%shuffled_30[3])

Inferencing using features of two image at once

Accuracy on seen data: 0.860179
Runtime on seen data: 0.576436 s
Infer time on seen data: 14.844556 s

Accuracy on unseen data: 0.716521
Runtime on unseen data: 0.618504 s
Infer time on unseen data: 118.670348 s

Accuracy on shuffled data: 0.576718
Runtime on shuffled data: 0.527670
Infer time on shuffled data: 86.107631


## Inference using 15 features at a time

In [0]:
def train(train,val,edges):
    model = BayesianModel(edges)
    train_time = time.time()
    model.fit(train)
    train_time = time.time()-train_time
    predict_data = val.copy()
    predict_time = time.time()
    predict_data = predict_data[model.nodes()]
    predict_data.drop('op', axis=1, inplace=True)
    l = []
    for i,j in predict_data.iterrows():
        l.append(predict(model,j))
    predict_time = time.time()-predict_time
    y_pred = pd.DataFrame(l)
    y_pred.columns = ['op']
    acc = accuracy(y_pred['op'],val['op'])
    return model,acc,train_time,predict_time

In [0]:
def train_all(edges):
    seen_model = train(seen_train,seen_val,edges)
    unseen_model = train(unseen_train,unseen_val,edges)
    shuffled_model = train(shuffled_train,shuffled_val,edges)
    return seen_model,unseen_model,shuffled_model

In [0]:
## Inference using half on the edges

def predict(model,features):
    inference = VariableElimination(model)
    left_nodes = [i for i in model.nodes() if i[0]=='f']
    right_nodes = [i for i in model.nodes() if i[0]=='g']
    left_img = {}
    for n in left_nodes:
        left_img[n]=features[n]
    right_img = {}
    for n in right_nodes:
        right_img[n]=features[n]
    ql0 = inference.query(left_nodes,evidence=right_img.update({'op':0}))
    ql1 = inference.query(left_nodes,evidence=right_img.update({'op':1}))
    qr0 = inference.query(right_nodes,evidence=left_img.update({'op':0}))    
    qr1 = inference.query(right_nodes,evidence=left_img.update({'op':1}))    
    pl0 = pl1= pr0 = pr1 = 1
    for n in left_nodes:
        pl0 *= ((ql0[n]).values)[left_img[n]]
        pl1 *= ((ql1[n]).values)[left_img[n]]
    for n in right_nodes:
        pr0 *= ((qr0[n]).values)[right_img[n]]
        pr1 *= ((qr1[n]).values)[right_img[n]]
    p0 = (pl0*pr0)**0.5
    p1 = (pl1*pr1)**0.5
    
    return int(p0<p1)

In [0]:
seen_model, unseen_model, shuffled_model = train_all(edges)

In [0]:
print("Inferencing using features of one image at once\n")
print("Accuracy on seen data: %f"%seen_model[1])
print("Runtime on seen data: %f s"%seen_model[2])
print("Infer time on seen data: %f s"%seen_model[3])
print()
print("Accuracy on unseen data: %f"%unseen_model[1])
print("Runtime on unseen data: %f s"%unseen_model[2])
print("Infer time on unseen data: %f s"%unseen_model[3])
print()
print("Accuracy on shuffled data: %f"%shuffled_model[1])
print("Runtime on shuffled data: %f"%shuffled_model[2])
print("Infer time on shuffled data: %f"%shuffled_model[3])

Inferencing using features of one image at once

Accuracy on seen data: 0.502237
Runtime on seen data: 0.561625 s
Infer time on seen data: 65.061688 s

Accuracy on unseen data: 0.507132
Runtime on unseen data: 0.636577 s
Infer time on unseen data: 523.308913 s

Accuracy on shuffled data: 0.499046
Runtime on shuffled data: 0.538977
Infer time on shuffled data: 377.854292


## Model for Task 4 using similarity node

In [0]:
import networkx as nx
import matplotlib.pyplot as plt

In [0]:
features = pd.read_csv('../15features.csv')
features.head(5)

Unnamed: 0,imagename,pen_pressure,letter_spacing,size,dimension,is_lowercase,is_continuous,slantness,tilt,entry_stroke_a,staff_of_a,formation_n,staff_of_d,exit_stroke_d,word_formation,constancy
0,0968c_num1.png,2,2,2,1,2,2,3,2,1,2,2,3,2,2,1
1,0809c_num2.png,2,2,2,2,2,2,3,1,1,2,2,3,1,2,2
2,0237b_num6.png,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2
3,0069b_num2.png,2,2,2,1,2,2,1,1,1,1,1,1,1,1,1
4,0966c_num4.png,2,2,2,2,2,2,2,1,1,2,2,2,2,2,2


In [0]:
def datagen(datadir):
    training_pairs = pd.read_csv(join(datadir,'dataset_seen_training_siamese.csv'))
    validation_pairs = pd.read_csv(join(datadir,'dataset_seen_validation_siamese.csv'))
    training_features_pairs_info = pd.merge(training_pairs,features,left_on='left',right_on='imagename')
    training_features_pairs_info = pd.merge(training_features_pairs_info,
                                            features,left_on='right',
                                            right_on='imagename', 
                                            suffixes=('1', '2'))
    training_features_pairs_info = training_features_pairs_info.drop(training_features_pairs_info.columns[[0,4,20]],
                                                                     axis=1)
    training_features_pairs = training_features_pairs_info.drop(training_features_pairs_info.columns[[0,1]],
                                                           axis=1)
    
    validation_features_pairs_info = pd.merge(validation_pairs,features,left_on='left',right_on='imagename')
    validation_features_pairs_info = pd.merge(validation_features_pairs_info,features,
                                              left_on='right',
                                              right_on='imagename', 
                                              suffixes=('1', '2'))
    validation_features_pairs_info = validation_features_pairs_info.drop(validation_features_pairs_info.columns[[0,4,20]],
                                                                         axis=1)
    validation_features_pairs = validation_features_pairs_info.drop(validation_features_pairs_info.columns[[0,1]],axis=1)
    return training_features_pairs,validation_features_pairs



In [0]:
def train_similarity_model(datadir):
    
    combined_model = BayesianModel([('is_lowercase1','is_lowercase_sim'),
                                    ('is_lowercase2','is_lowercase_sim'),
                                    ('is_continuous1','is_continuous_sim'),
                                    ('is_continuous2','is_continuous_sim'),              
                                    ('dimension1','dimension_sim'),
                                    ('dimension2','dimension_sim'),
                                    ('letter_spacing1','letter_spacing_sim'),
                                    ('letter_spacing2','letter_spacing_sim'),
                                    ('size1','size_sim'),
                                    ('size2','size_sim'),              
                                    ('constancy1','constancy_sim'),
                                    ('constancy2','constancy_sim'),
                                    ('word_formation1','word_formation_sim'),
                                    ('word_formation2','word_formation_sim'),
                                    ('formation_n1','formation_n_sim'),
                                    ('formation_n2','formation_n_sim'),
                                    ('entry_stroke_a1','entry_stroke_a_sim'),
                                    ('entry_stroke_a2', 'entry_stroke_a_sim'),
                                    ('exit_stroke_d1','exit_stroke_d_sim'),
                                    ('exit_stroke_d2', 'exit_stroke_d_sim'),
                                    ('staff_of_a1','staff_of_a_sim'),
                                    ('staff_of_a2', 'staff_of_a_sim'),
                                    ('staff_of_d1','staff_of_d_sim'),
                                    ('staff_of_d2', 'staff_of_d_sim'),
                                    ('slantness1','slantness_sim'),
                                    ('slantness2', 'slantness_sim'),
                                    ('tilt1','tilt_sim'),
                                    ('tilt2', 'tilt_sim'),
                                    ('pen_pressure1','pen_pressure_sim'),
                                    ('pen_pressure2', 'pen_pressure_sim'),
                                    
                                    ('is_lowercase_sim','is_continuous_sim'),
                                    ('dimension_sim','size_sim'),
                                    ('letter_spacing_sim','size_sim'),
                                    ('size_sim','constancy_sim'),
                                    ('constancy_sim','word_formation_sim'),
                                    ('word_formation_sim','formation_n_sim'),
                                    ('entry_stroke_a_sim','exit_stroke_d_sim'),
                                    ('staff_of_a_sim','staff_of_d_sim'),
                                    ('slantness_sim','tilt_sim'),
                                    
                                    ('is_continuous_sim','label'),
                                    ('formation_n_sim','label'),
                                    ('exit_stroke_d_sim','label'),
                                    ('staff_of_d_sim','label'),
                                    ('pen_pressure_sim','label'),
                                    ('tilt_sim','label')
                                   ])
    # nx.draw(combined_model, **options)
    
    cpd_is_lowercase1 = TabularCPD('is_lowercase1',2,[[0.5],
                                                    [0.5]],
                                                    evidence=[], evidence_card=[])
    cpd_is_lowercase2 = TabularCPD('is_lowercase2',2,[[0.5],
                                                    [0.5]],
                                                    evidence=[], evidence_card=[])
    cpd_is_continuous1 = TabularCPD('is_continuous1',2,[[0.5],
                                                    [0.5]],
                                                    evidence=[], evidence_card=[])
    cpd_is_continuous2 = TabularCPD('is_continuous2',2,[[0.5],
                                                    [0.5]],
                                                    evidence=[], evidence_card=[])
    cpd_dimension1 = TabularCPD('dimension1',3,[[0.33],
                                        [0.34],[0.33]],
                                                    evidence=[], evidence_card=[])
    cpd_dimension2 = TabularCPD('dimension2',3,[[0.33],
                                        [0.34],[0.33]],
                                                    evidence=[], evidence_card=[])
    cpd_letter_spacing1 = TabularCPD('letter_spacing1',3,[[0.33],
                                        [0.34],[0.33]],
                                        evidence=[], evidence_card=[])
    cpd_letter_spacing2 = TabularCPD('letter_spacing2',3,[[0.33],
                                        [0.34],[0.33]],
                                        evidence=[], evidence_card=[])
    cpd_size1 = TabularCPD('size1',3,[[0.33],
                                        [0.34],[0.33]],
                                        evidence=[], evidence_card=[])
    cpd_size2 = TabularCPD('size2',3,[[0.33],
                                        [0.34],[0.33]],
                                        evidence=[], evidence_card=[])
    cpd_constancy1 = TabularCPD('constancy1',2,[[0.5],
                                        [0.5]],
                                        evidence=[], evidence_card=[])
    cpd_constancy2 = TabularCPD('constancy2',2,[[0.5],
                                        [0.5]],
                                        evidence=[], evidence_card=[])
    cpd_word_formation1 = TabularCPD('word_formation1',2,[[0.5],
                                        [0.5]],
                                        evidence=[], evidence_card=[])
    cpd_word_formation2 = TabularCPD('word_formation2',2,[[0.5],
                                        [0.5]],
                                        evidence=[], evidence_card=[])
    cpd_formation_n1 = TabularCPD('formation_n1',2,[[0.5],
                                        [0.5]],
                                        evidence=[], evidence_card=[])
    cpd_formation_n2 = TabularCPD('formation_n2',2,[[0.5],
                                        [0.5]],
                                        evidence=[], evidence_card=[])
    cpd_entry_stroke_a1 = TabularCPD('entry_stroke_a1',2,[[0.5],
                                        [0.5]],
                                        evidence=[], evidence_card=[])
    cpd_entry_stroke_a2 = TabularCPD('entry_stroke_a2',2,[[0.5],
                                        [0.5]],
                                        evidence=[], evidence_card=[])
    cpd_exit_stroke_d1 = TabularCPD('exit_stroke_d1',4,[[0.3],
                                                    [0.5],[0.1],
                                                    [0.1]],
                                        evidence=[], evidence_card=[])
    cpd_exit_stroke_d2 = TabularCPD('exit_stroke_d2',4,[[0.3],
                                                    [0.5],[0.1],
                                                    [0.1]],
                                        evidence=[], evidence_card=[])
    cpd_staff_of_a1 = TabularCPD('staff_of_a1',4,[[0.2],[0.6],
                                            [0.05],[0.15]],
                                        evidence=[], evidence_card=[])
    cpd_staff_of_a2 = TabularCPD('staff_of_a2',4,[[0.2],[0.6],
                                            [0.05],[0.15]],
                                        evidence=[], evidence_card=[])
    cpd_staff_of_d1 = TabularCPD('staff_of_d1',2,[[0.1],
                                        [0.9]],
                                        evidence=[], evidence_card=[])
    cpd_staff_of_d2 = TabularCPD('staff_of_d2',2,[[0.1],
                                        [0.9]],
                                        evidence=[], evidence_card=[])
    cpd_slantness1 = TabularCPD('slantness1',4,[[0.55],[0.30],
                                            [0.10],[0.05]],
                                        evidence=[], evidence_card=[])
    cpd_slantness2 = TabularCPD('slantness2',4,[[0.55],[0.30],
                                            [0.10],[0.05]],
                                        evidence=[], evidence_card=[])
    cpd_tilt1 = TabularCPD('tilt1',2,[[0.8],[0.2]],
                                        evidence=[], evidence_card=[])
    cpd_tilt2 = TabularCPD('tilt2',2,[[0.8],[0.2]],
                                        evidence=[], evidence_card=[])
    cpd_pen_pressure1 = TabularCPD('pen_pressure1',2,[[0.4],[0.6]],
                                        evidence=[], evidence_card=[])
    cpd_pen_pressure2 = TabularCPD('pen_pressure2',2,[[0.4],[0.6]],
                                        evidence=[], evidence_card=[])
    
    
    
    cpd_is_lowercase_sim = TabularCPD('is_lowercase_sim',2,[[0.1,0.9,0.9,0.1],
                                                                [0.9,0.1,0.1,0.9]],
                                                                evidence=['is_lowercase1','is_lowercase2'], 
                                                                evidence_card=[2,2])
    cpd_is_continuous_sim = TabularCPD('is_continuous_sim',2,[[0.9,0.1,0.9,0.6,0.9,0.6,0.9,0.1],
                                                              [0.1,0.9,0.1,0.4,0.1,0.4,0.1,0.9]],
                                                                evidence=['is_continuous1','is_continuous2','is_lowercase_sim'], 
                                                                evidence_card=[2,2,2])
    cpd_dimension_sim = TabularCPD('dimension_sim',2,[[0.1,0.8,0.9,0.8,0.1,0.8,0.9,0.8,0.1],
                                                      [0.9,0.2,0.1,0.2,0.9,0.2,0.1,0.2,0.9]],
                                                    evidence=['dimension1','dimension2'], evidence_card=[3,3])
    cpd_letter_spacing_sim = TabularCPD('letter_spacing_sim',2,[[0.1,0.8,0.9,0.8,0.1,0.8,0.9,0.8,0.1],
                                                    [0.9,0.2,0.1,0.2,0.9,0.2,0.1,0.2,0.9]],
                                                    evidence=['letter_spacing1','letter_spacing2'], evidence_card=[3,3])
    cpd_size_sim = TabularCPD('size_sim',2,[[0.6,0.3,0.3,0.1,0.8,0.7,0.7,0.3,0.9,0.8,0.7,0.4,0.7,0.6,0.6,0.3,0.6,0.3,0.3,0.1,0.8,0.4,0.4,0.85,0.9,0.8,0.8,0.3,0.8,0.4,0.4,0.85,0.6,0.3,0.3,0.1],
                                            [0.4,0.7,0.7,0.9,0.2,0.3,0.3,0.7,0.1,0.2,0.3,0.6,0.3,0.4,0.4,0.7,0.4,0.7,0.7,0.9,0.2,0.6,0.6,0.15,0.1,0.2,0.2,0.7,0.2,0.6,0.6,0.15,0.4,0.7,0.7,0.9]],
                                            evidence=['size1','size2','dimension_sim','letter_spacing_sim'], evidence_card=[3,3,2,2])
    cpd_constancy_sim = TabularCPD('constancy_sim',2,[[0.9,0.1,0.9,0.6,0.9,0.6,0.7,0.1],
                                            [0.1,0.9,0.1,0.4,0.1,0.4,0.3,0.9]],
                                            evidence=['constancy1','constancy2','size_sim'], evidence_card=[2,2,2])
    cpd_word_formation_sim = TabularCPD('word_formation_sim',2,[[0.9,0.1,0.9,0.7,0.9,0.7,0.9,0.1],
                                            [0.1,0.9,0.1,0.3,0.1,0.3,0.1,0.9]],
                                            evidence=['word_formation1','word_formation2','constancy_sim'], evidence_card=[2,2,2])
    cpd_formation_n_sim = TabularCPD('formation_n_sim',2,[[0.7,0.1,0.9,0.4,0.9,0.4,0.6,0.1],
                                            [0.3,0.9,0.1,0.6,0.1,0.6,0.4,0.9]],
                                            evidence=['formation_n1','formation_n2','word_formation_sim'], evidence_card=[2,2,2])
    cpd_entry_stroke_a_sim = TabularCPD('entry_stroke_a_sim',2,[[0.1,0.9,0.9,0.1],
                                                                [0.9,0.1,0.1,0.9]],
                                            evidence=['entry_stroke_a1','entry_stroke_a2'], evidence_card=[2,2])
    
    cpd_exit_stroke_d_sim = TabularCPD('exit_stroke_d_sim',2,[[0.47, 0.53, 0.79, 0.21, 0.88, 0.12, 0.88, 0.12, 0.42, 0.58, 0.26, 0.74, 0.52, 0.48, 0.42, 0.58, 0.84, 0.16, 0.78, 0.22, 0.36, 0.64, 0.55, 0.45, 0, 1, 0, 1, 0, 1, 0, 1],
                                                              [0.53, 0.47, 0.21, 0.79, 0.12, 0.88, 0.12, 0.88, 0.58, 0.42, 0.74, 0.26, 0.48, 0.52, 0.58, 0.42, 0.16, 0.84, 0.22, 0.78, 0.64, 0.36, 0.45, 0.55, 1, 0, 1, 0, 1, 0, 1, 0]],
                                            evidence=['exit_stroke_d1','exit_stroke_d2','entry_stroke_a_sim'], evidence_card=[4,4,2])
    cpd_staff_of_d_sim = TabularCPD('staff_of_d_sim',2,[[0.9,0.1,0.9,0.7,0.9,0.7,0.9,0.1],
                                            [0.1,0.9,0.1,0.3,0.1,0.3,0.1,0.9]],
                                            evidence=['staff_of_d1','staff_of_d2','staff_of_a_sim'], evidence_card=[2,2,2])
    cpd_tilt_sim = TabularCPD('tilt_sim',2,[[0.9,0.1,0.9,0.7,0.9,0.7,0.9,0.1],
                                            [0.1,0.9,0.1,0.3,0.1,0.3,0.1,0.9]],
                                            evidence=['tilt1','tilt2','slantness_sim'], evidence_card=[2,2,2])
    cpd_pen_pressure_sim = TabularCPD('pen_pressure_sim',2,[[0.1,0.9,0.9,0.1],
                                                           [0.9,0.1,0.1,0.9]],
                                            evidence=['pen_pressure1','pen_pressure2'], evidence_card=[2,2])
    cpd_staff_of_a_sim = TabularCPD('staff_of_a_sim',2,[[0.63, 0.86, 0.87, 0.79, 0.23, 0.13, 0.17, 0.17, 0, 0, 0, 0, 0.60, 0.61, 0.78, 0.39],
                                                  [0.37, 0.14, 0.13, 0.21, 0.77, 0.87, 0.83, 0.83, 1, 1, 1, 1, 0.40, 0.39, 0.22, 0.61]],
                                            evidence=['staff_of_a1','staff_of_a2'], evidence_card=[4,4])
    cpd_slantness_sim = TabularCPD('slantness_sim',2,[[0.63, 0.86, 0.87, 0.79, 0.23, 0.13, 0.17, 0.17, 0, 0, 0, 0, 0.60, 0.61, 0.78, 0.39],
                                                  [0.37, 0.14, 0.13, 0.21, 0.77, 0.87, 0.83, 0.83, 1, 1, 1, 1, 0.40, 0.39, 0.22, 0.61]],
                                            evidence=['slantness1','slantness2'], evidence_card=[4,4])
    rand_prob = list(np.around(np.random.uniform(0.0,1.0,64),2))
    cpd_label = TabularCPD('label',2,[[0.8580167417900838, 0.8580167417900838, 0.6366459627329193, 0.6366459627329193, 0.6366459627329192, 0.6366459627329192, 0.33687705412837876, 0.33687705412837876, 0.8453901800831153, 0.8453901800831153, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.3149100257069408, 0.3149100257069408, 0.8580167417900838, 0.8580167417900838, 0.6366459627329193, 0.6366459627329193, 0.6366459627329192, 0.6366459627329192, 0.33687705412837876, 0.33687705412837876, 0.8453901800831153, 0.8453901800831153, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.3149100257069408, 0.3149100257069408, 0.6850899742930591, 0.6850899742930591, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.15460981991688466, 0.15460981991688466, 0.6631229458716212, 0.6631229458716212, 0.3633540372670807, 0.3633540372670807, 0.36335403726708076, 0.36335403726708076, 0.14198325820991628, 0.14198325820991628, 0.6850899742930591, 0.6850899742930591, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.15460981991688466, 0.15460981991688466, 0.6631229458716212, 0.6631229458716212, 0.3633540372670807, 0.3633540372670807, 0.36335403726708076, 0.36335403726708076, 0.14198325820991628, 0.14198325820991628 ],
                                     [0.14198325820991628, 0.14198325820991628, 0.36335403726708076, 0.36335403726708076, 0.36335403726708076, 0.36335403726708076, 0.6631229458716211, 0.6631229458716211, 0.15460981991688466, 0.15460981991688466, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.6850899742930591, 0.6850899742930591, 0.14198325820991628, 0.14198325820991628, 0.36335403726708076, 0.36335403726708076, 0.36335403726708076, 0.36335403726708076, 0.6631229458716211, 0.6631229458716211, 0.15460981991688466, 0.15460981991688466, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.6850899742930591, 0.6850899742930591, 0.3149100257069408, 0.3149100257069408, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.8453901800831153, 0.8453901800831153, 0.33687705412837887, 0.33687705412837887, 0.6366459627329193, 0.6366459627329193, 0.6366459627329193, 0.6366459627329193, 0.8580167417900838, 0.8580167417900838, 0.3149100257069408, 0.3149100257069408, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.8453901800831153, 0.8453901800831153, 0.33687705412837887, 0.33687705412837887, 0.6366459627329193, 0.6366459627329193, 0.6366459627329193, 0.6366459627329193, 0.8580167417900838, 0.8580167417900838 ]],
                           evidence=['is_continuous_sim','exit_stroke_d_sim','formation_n_sim','staff_of_d_sim','tilt_sim','pen_pressure_sim'],
                           evidence_card=[2,2,2,2,2,2])
    #cpd_label = TabularCPD('label',2,[[0.1,0.2],
    #                                 [0.9,0.8]],
    #                       evidence=['is_continuous_sim'],
    #                       evidence_card=[2])
    
    combined_model.add_cpds(cpd_pen_pressure1,
                            cpd_pen_pressure2,
                            cpd_pen_pressure_sim,
                            cpd_letter_spacing1,
                            cpd_letter_spacing2,
                            cpd_letter_spacing_sim,
                            cpd_size1,
                            cpd_size2,
                            cpd_size_sim,
                            cpd_dimension1,
                            cpd_dimension2,
                            cpd_dimension_sim,
                            cpd_is_lowercase1,
                            cpd_is_lowercase2,
                            cpd_is_lowercase_sim,
                            cpd_is_continuous1,
                            cpd_is_continuous2,
                            cpd_is_continuous_sim,
                            cpd_slantness1,
                            cpd_slantness2,
                            cpd_slantness_sim,
                            cpd_tilt1,
                            cpd_tilt2,
                            cpd_tilt_sim,
                            cpd_entry_stroke_a1,
                            cpd_entry_stroke_a2,
                            cpd_entry_stroke_a_sim,
                            cpd_staff_of_a1,
                            cpd_staff_of_a2,
                            cpd_staff_of_a_sim,
                            cpd_formation_n1,
                            cpd_formation_n2,
                            cpd_formation_n_sim,
                            cpd_staff_of_d1,
                            cpd_staff_of_d2,
                            cpd_staff_of_d_sim,
                            cpd_exit_stroke_d1,
                            cpd_exit_stroke_d2,
                            cpd_exit_stroke_d_sim,
                            cpd_word_formation1,
                            cpd_word_formation2,
                            cpd_word_formation_sim,
                            cpd_constancy1,
                            cpd_constancy2,
                            cpd_constancy_sim,
                            cpd_label
                           )
    train,val = datagen(datadir)
    runtime = time.time()
    pred =  combined_model.predict(val.drop(['label'],axis=1)-2)['label']
    runtime = time.time() - runtime
    return combined_model, accuracy(pred,val['label']), runtime

In [0]:
seen = train_similarity_model('../seen-dataset')

In [0]:
unseen = train_similarity_model('../unseen-dataset')

In [0]:
shuffled = train_similarity_model('../shuffled-dataset')

In [0]:
print("Inferencing hardcoded CPDs and similarity node\n")
print("Accuracy on seen data: %f"%seen[1])
print("Infer on seen data: %f s"%seen[2])
print()
print("Accuracy on unseen data: %f"%unseen[1])
print("Infer time on unseen data: %f s"%unseen[2])
print()
print("Accuracy on shuffled data: %f"%shuffled[1])
print("Infer time on shuffled data: %f"%shuffled[2])

Inferencing hardcoded CPDs and similarity node

Accuracy on seen data: 0.697987
Infer on seen data: 131.942239 s

Accuracy on unseen data: 0.614596
Infer time on unseen data: 1057.064860 s

Accuracy on shuffled data: 0.702672
Infer time on shuffled data: 765.363711


## Loose estimation of CPD values for verification node

In [0]:
e1 = TabularCPD('is_continuous_sim',2,[[0.9,0.1,0.9,0.6,0.9,0.6,0.9,0.1],
                                                              [0.1,0.9,0.1,0.4,0.1,0.4,0.1,0.9]],
                                                                evidence=['is_continuous1','is_continuous2','is_lowercase_sim'], 
                                                                evidence_card=[2,2,2])


In [0]:
e2 = TabularCPD('exit_stroke_d_sim',2,[[0.47, 0.53, 0.79, 0.21, 0.88, 0.12, 0.88, 0.12, 0.42, 0.58, 0.26, 0.74, 0.52, 0.48, 0.42, 0.58, 0.84, 0.16, 0.78, 0.22, 0.36, 0.64, 0.55, 0.45, 0, 1, 0, 1, 0, 1, 0, 1],
                                                              [0.53, 0.47, 0.21, 0.79, 0.12, 0.88, 0.12, 0.88, 0.58, 0.42, 0.74, 0.26, 0.48, 0.52, 0.58, 0.42, 0.16, 0.84, 0.22, 0.78, 0.64, 0.36, 0.45, 0.55, 1, 0, 1, 0, 1, 0, 1, 0]],
                                            evidence=['exit_stroke_d1','exit_stroke_d2','entry_stroke_a_sim'], evidence_card=[4,4,2])

In [0]:
e3 = TabularCPD('formation_n_sim',2,[[0.7,0.1,0.9,0.4,0.9,0.4,0.6,0.1],
                                            [0.3,0.9,0.1,0.6,0.1,0.6,0.4,0.9]],
                                            evidence=['formation_n1','formation_n2','word_formation_sim'], evidence_card=[2,2,2])

In [0]:
e4 = TabularCPD('staff_of_d_sim',2,[[0.9,0.1,0.9,0.7,0.9,0.7,0.9,0.1],
                                            [0.1,0.9,0.1,0.3,0.1,0.3,0.1,0.9]],
                                            evidence=['staff_of_d1','staff_of_d2','staff_of_a_sim'], evidence_card=[2,2,2])

In [0]:
e5 = TabularCPD('tilt_sim',2,[[0.9,0.1,0.9,0.7,0.9,0.7,0.9,0.1],
                                            [0.1,0.9,0.1,0.3,0.1,0.3,0.1,0.9]],
                                            evidence=['tilt1','tilt2','slantness_sim'], evidence_card=[2,2,2])

In [0]:
 e6 = TabularCPD('pen_pressure_sim',2,[[0.1,0.9,0.9,0.1],
                                                           [0.9,0.1,0.1,0.9]],
                                            evidence=['pen_pressure1','pen_pressure2'], evidence_card=[2,2])

In [0]:
val = np.ones((2,64))
i=0
for a in range(2):
    for b in range(2):
        for c in range(2):
            for d in range(2):
                for e in range(2):
                    for f in range(2):
                        l = [e1.values[a],e2.values[b],e3.values[c],e4.values[d],e5.values[e],e6.values[f]]
                        val[0][i]=np.product([np.mean(t) for t in l])
                        val[1][i]=np.product([1-np.mean(t) for t in l])
                        total = (val[0][i]+val[1][i])
                        val[0][i]/=total
                        val[1][i]/=total
                        i+=1

In [0]:
pd.DataFrame(val)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.858017,0.858017,0.636646,0.636646,0.636646,0.636646,0.336877,0.336877,0.84539,0.84539,...,0.15461,0.15461,0.663123,0.663123,0.363354,0.363354,0.363354,0.363354,0.141983,0.141983
1,0.141983,0.141983,0.363354,0.363354,0.363354,0.363354,0.663123,0.663123,0.15461,0.15461,...,0.84539,0.84539,0.336877,0.336877,0.636646,0.636646,0.636646,0.636646,0.858017,0.858017


In [0]:
for i in range(len(val)):
    print("[",end='')
    for j in range(len(val[0])):
        print(val[i][j],end=', ')
    print("],")

[0.8580167417900838, 0.8580167417900838, 0.6366459627329193, 0.6366459627329193, 0.6366459627329192, 0.6366459627329192, 0.33687705412837876, 0.33687705412837876, 0.8453901800831153, 0.8453901800831153, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.3149100257069408, 0.3149100257069408, 0.8580167417900838, 0.8580167417900838, 0.6366459627329193, 0.6366459627329193, 0.6366459627329192, 0.6366459627329192, 0.33687705412837876, 0.33687705412837876, 0.8453901800831153, 0.8453901800831153, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.6132075471698113, 0.3149100257069408, 0.3149100257069408, 0.6850899742930591, 0.6850899742930591, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.38679245283018865, 0.15460981991688466, 0.15460981991688466, 0.6631229458716212, 0.6631229458716212, 0.3633540372670807, 0.3633540372670807, 0.36335403726708076, 0.36335403726708076, 0.14198325820991628, 0.14198325820991628, 0.6850899742930591, 0.685