### Getting data from yann lecunn dataset

Yann Lecunn dataset is of the format idx and split across 4 files 
1. train-images.idx3-ubyte : training image set which consists of 60000 images each image is represented by a 28*28 array
2. train-labels.idx1-ubyte : training label set which consists of 60000 labels 
3. t10k-images.idx3-ubyte : test image set which consists of 10000 images each image is represented by a 28*28
4. t10k-labels.idx1-ubyte : training label set which consists of 10000 labels

Get the dataset from http://yann.lecun.com/exdb/mnist/

In [1]:
import idx2numpy
import seaborn as sns

#### Extract Training data

In [2]:
X_train_3D = idx2numpy.convert_from_file('/root/decision-tree-python/train-images.idx3-ubyte')
X_train = X_train_3D.flatten().reshape(60000,784)

y_train = idx2numpy.convert_from_file('/root/decision-tree-python/train-labels.idx1-ubyte')
X_train.shape, y_train.shape

((60000, 784), (60000,))

#### Extract Test data

In [3]:
X_test_3D = idx2numpy.convert_from_file('/root/decision-tree-python/t10k-images.idx3-ubyte')
X_test =  X_test_3D.flatten().reshape(10000,784)

y_test = idx2numpy.convert_from_file('/root/decision-tree-python/t10k-labels.idx1-ubyte')
X_test.shape, y_test.shape

((10000, 784), (10000,))

# Decision Tree

In [4]:
import numpy as np
from sklearn.utils import shuffle
X_shuffle,y_shuffle = shuffle(X_train,y_train)
X_train = X_shuffle[0:50000]
y_train = y_shuffle[0:50000]

from sklearn import tree
from sklearn.model_selection import cross_val_predict

#dt_clf = tree.DecisionTreeClassifier(max_depth=20, max_leaf_nodes=300)
#dt_clf = tree.DecisionTreeClassifier(max_depth=4, max_leaf_nodes=20)
dt_clf = tree.DecisionTreeClassifier(max_depth=8)

#y_train_pred = cross_val_predict(dt_clf, X_train, y_train, cv=2)
print(dt_clf.fit(X_train, y_train))

print('Number of nodes in the decision tree {}.'.format(dt_clf.tree_.node_count))
print('Number of threshold in the decision tree {}.'.format(len(dt_clf.tree_.threshold)))
print('Number of leaves in the decision tree {}.'.format(dt_clf.tree_.n_leaves))

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, dt_clf.predict(X_test)), accuracy_score(y_train, dt_clf.predict(X_train)))

DecisionTreeClassifier(max_depth=8)
Number of nodes in the decision tree 507.
Number of threshold in the decision tree 507.
Number of leaves in the decision tree 254.
0.8232 0.83208


In [5]:
threshold = dt_clf.tree_.threshold
import pandas as pd
df_train = pd.DataFrame(data = X_train, columns = range(X_train[0].shape[0]))
df_test = pd.DataFrame(data = X_test, columns = range(X_test[0].shape[0]))
df_train.shape, df_test.shape
df = pd.concat([df_train, df_test])
unique_vals = []
for i in df.columns:
    unique_vals.append(df[i].unique())
flatten_list = np.concatenate(unique_vals).ravel()
print('Max and Min values of thresholds in decision tree are', max([int(i) for i in list(set(threshold))]), min([int(i) for i in list(set(threshold))]))

Max and Min values of thresholds in decision tree are 253 -2


### Displaying the tree

In [6]:
from sklearn.tree import export_text
text_representation = export_text(dt_clf)
print(text_representation, dt_clf.classes_)

|--- feature_350 <= 131.50
|   |--- feature_568 <= 0.50
|   |   |--- feature_430 <= 0.50
|   |   |   |--- feature_405 <= 4.50
|   |   |   |   |--- feature_485 <= 12.50
|   |   |   |   |   |--- feature_154 <= 0.50
|   |   |   |   |   |   |--- feature_594 <= 21.50
|   |   |   |   |   |   |   |--- feature_156 <= 0.50
|   |   |   |   |   |   |   |   |--- class: 7
|   |   |   |   |   |   |   |--- feature_156 >  0.50
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- feature_594 >  21.50
|   |   |   |   |   |   |   |--- feature_408 <= 15.00
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- feature_408 >  15.00
|   |   |   |   |   |   |   |   |--- class: 3
|   |   |   |   |   |--- feature_154 >  0.50
|   |   |   |   |   |   |--- feature_566 <= 85.50
|   |   |   |   |   |   |   |--- feature_571 <= 4.50
|   |   |   |   |   |   |   |   |--- class: 3
|   |   |   |   |   |   |   |--- feature_571 >  4.50
|   |   |   |   |   |   |   |   |--- class

### Sequential Circuit HDL: FSM Generator   (Run this cell twice)

In [8]:
%%capture cap --no-stderr
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    
    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print ("{}{}:if ({} <= {})".format(indent, node, name, int(round(threshold,3)))) 
            recurse(tree_.children_left[node], depth + 1)
            print ("{}{}:else ".format(indent, node, name, int(round(threshold,3))))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{} Label<={};".format(indent, np.argmax(tree_.value[node][0],axis=0)))

    recurse(0, 1)

cols = range(784)
features = ['pixels[{}]'.format(str(i)) for i in cols]
class_names = [str(i) for i in dt_clf.classes_]
tree_to_code(dt_clf, features)

with open('verilog_newFSM.txt', 'w') as f:
    f.write(cap.stdout)

In [9]:
import pdb
def write_file(data_towrite):
    with open('verilog_file.v', 'a', encoding='utf-8') as file:
        file.writelines(data_towrite)
        file.close()
               
def clear_file():
    with open('verilog_file.v', 'w', encoding='utf-8') as file:
        pass
        file.close()

import itertools
with open('verilog_newFSM.txt', 'r') as f:
    tree_verilog = f.read()
    f.close()

clear_file()
line_num = 0
else_state = 0
else_nextif_state = 0
curr_line,next_line = itertools.tee(tree_verilog.split('\n'))
next(next_line, None)
for i,j in list(zip(curr_line,next_line)):
    if 'Label' in i:
        pass
        #write_file(f'  begin {i.strip()} state<=0;ml_inference_completed<=1; end \n')
    elif 'else' in i and 'Label' in j:
        else_state = i.strip().split(':')[0]
        line_num = search_content_file(' '+str(else_state)+':if' )
        write_line_file(f'\n       else begin {j.strip()} state<=0;ml_inference_completed<=1; end ', line_num)
        #write_file(f'   {i.strip().split(":")[-1]} ')
    elif 'else' in i and 'if' in j:
        else_state = i.strip().split(':')[0]
        else_nextif_state = j.strip().split(':')[0]
        line_num = search_content_file(' '+str(else_state)+':if' )
        write_line_file(f'else begin state<={else_nextif_state}; end ', line_num)
    elif 'if' in i and 'Label' in j:
        write_file(f'\n {i.strip()} begin {j.strip()} state<=0;ml_inference_completed<=1; end ')
    elif 'if' in i and 'if' in j:
        write_file(f'\n {i.strip()} begin state<={j.strip().split(":")[0]}; end \n ')  
update_stateformat_file()

### Verifying the number of "if", "else" and "Label" in verilog with decision tree architecture

In [10]:
#verifying that the number of "if" statements is one less than the number of nodes.
file1 = open('verilog_file.v', 'r')
contents = file1.readlines()
counter_if =0
for line in contents:
    if 'if' in line:
        counter_if = counter_if+1
print('Number of "if" statements {}.'.format(counter_if))

#verifying that the number of "else" statements is one less than the number of leaves.
file1 = open('verilog_file.v', 'r')
contents = file1.readlines()
counter_else =0
for line in contents:
    if 'else' in line:
        counter_else = counter_else+1
print('Number of "else" statements {}.'.format(counter_else))

#verifying that the number of "Label" statements is equal to the number of leaves.
file1 = open('verilog_file.v', 'r')
contents = file1.readlines()
counter_return =0
for line in contents:
    if 'Label' in line:
        counter_return = counter_return+1
print('Number of "label" statements {}.'.format(counter_return))

Number of "if" statements 253.
Number of "else" statements 253.
Number of "label" statements 254.


### Convert the tree into the source code by rounding the threshold to nearest integer and save it in a py file. This is performed to verify the accuracy by using this dumped decision rules.

#### Run this cell twice.

In [12]:
%%capture cap --no-stderr
#https://towardsdatascience.com/scikit-learn-decision-trees-explained-803f3812290d
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("def decision_tree_inference({}):".format('feature_set'))
    for i,pixel in enumerate(feature_names):
            print ("{}{}".format("  ", pixel+'='+'feature_set['+str(i)+']'))
    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print ("{}if {} <= {}:".format(indent, name, int(round(threshold,3))))  #convert the threshold to integer
           
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{}return {}".format(indent, np.argmax(tree_.value[node][0],axis=0)))

    recurse(0, 1)

cols = range(784)
features = ['pixel'+str(i) for i in cols]
class_names = [str(i) for i in dt_clf.classes_]
tree_to_code(dt_clf, features)

with open('mnist_decision_tree_inference.py', 'w') as f:
    f.write(cap.stdout)

In [13]:
from mnist_decision_tree_inference import decision_tree_inference
y_test_pred_tree = []
for i,test_samples in enumerate(X_test):
    y_test_pred_tree.append(decision_tree_inference(test_samples))

y_train_pred_tree = []
for i,test_samples in enumerate(X_train):
    y_train_pred_tree.append(decision_tree_inference(test_samples))

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_test_pred_tree), accuracy_score(y_train, y_train_pred_tree))

0.8232 0.83208


## Logic Locking: XOR key gates at each node

#### Run this cell twice.

In [15]:
%%capture cap --no-stderr
#https://towardsdatascience.com/scikit-learn-decision-trees-explained-803f3812290d
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("def dtLOCKED_AllXOR({},{}):".format('feature_set','key'))
    for i,pixel in enumerate(feature_names):
            print ("{}{}".format("  ", pixel+'='+'feature_set['+str(i)+']'))
    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            #XOR Operation
            print ("{}if {} <= {} and ({} <= {}) ^ key==({} <= {})  :".format(indent, name, int(round(threshold,3)),name, int(round(threshold,3)),name, int(round(threshold,3))))  #convert the threshold to integer
            recurse(tree_.children_left[node], depth + 1)
            print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{}return {}".format(indent, np.argmax(tree_.value[node][0],axis=0)))

    recurse(0, 1)

cols = range(784)
features = ['pixel'+str(i) for i in cols]
class_names = [str(i) for i in dt_clf.classes_]
tree_to_code(dt_clf, features)

with open('mnist_DTLock_AllXOR.py', 'w') as f:
    f.write(cap.stdout)

###### Verify the accuracy by using this dumped decision rules with correct/incorrect key.

In [16]:
from mnist_DTLock_AllXOR import dtLOCKED_AllXOR
y_test_pred_tree = []
for i,test_samples in enumerate(X_test):
    y_test_pred_tree.append(dtLOCKED_AllXOR(test_samples,0))

y_train_pred_tree = []
for i,test_samples in enumerate(X_train):
    y_train_pred_tree.append(dtLOCKED_AllXOR(test_samples,0))

from sklearn.metrics import accuracy_score
print('Correct key:',accuracy_score(y_test, y_test_pred_tree), accuracy_score(y_train, y_train_pred_tree))

y_test_pred_tree = []
for i,test_samples in enumerate(X_test):
    y_test_pred_tree.append(dtLOCKED_AllXOR(test_samples,1))

y_train_pred_tree = []
for i,test_samples in enumerate(X_train):
    y_train_pred_tree.append(dtLOCKED_AllXOR(test_samples,1))

from sklearn.metrics import accuracy_score
print('Incorrect key:',accuracy_score(y_test, y_test_pred_tree), accuracy_score(y_train, y_train_pred_tree))

Correct key: 0.8232 0.83208
Incorrect key: 0.0974 0.09692


## Logic Locking: XNOR key gates at each node

#### Run this cell twice.

In [28]:
%%capture cap --no-stderr
#https://towardsdatascience.com/scikit-learn-decision-trees-explained-803f3812290d
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("def dtLOCKED_AllXNOR({},{}):".format('feature_set','key'))
    for i,pixel in enumerate(feature_names):
            print ("{}{}".format("  ", pixel+'='+'feature_set['+str(i)+']'))
    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            #XNOR Operation
            print ("{}if {} <= {} and not(({} <= {}) ^ key)==({} <= {})  :".format(indent, name, int(round(threshold,3)),name, int(round(threshold,3)),name, int(round(threshold,3))))  #convert the threshold to integer
            recurse(tree_.children_left[node], depth + 1)
            print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{}return {}".format(indent, np.argmax(tree_.value[node][0],axis=0)))

    recurse(0, 1)

cols = range(784)
features = ['pixel'+str(i) for i in cols]
class_names = [str(i) for i in dt_clf.classes_]
tree_to_code(dt_clf, features)

with open('mnist_DTLock_AllXNOR.py', 'w') as f:
    f.write(cap.stdout)

###### Verify the accuracy by using this dumped decision rules with correct/incorrect key.

In [19]:
from mnist_DTLock_AllXNOR import dtLOCKED_AllXNOR
y_test_pred_tree = []
for i,test_samples in enumerate(X_test):
    y_test_pred_tree.append(dtLOCKED_AllXNOR(test_samples,1))

y_train_pred_tree = []
for i,test_samples in enumerate(X_train):
    y_train_pred_tree.append(dtLOCKED_AllXNOR(test_samples,1))

from sklearn.metrics import accuracy_score
print('Correct key:',accuracy_score(y_test, y_test_pred_tree), accuracy_score(y_train, y_train_pred_tree))

y_test_pred_tree = []
for i,test_samples in enumerate(X_test):
    y_test_pred_tree.append(dtLOCKED_AllXNOR(test_samples,0))

y_train_pred_tree = []
for i,test_samples in enumerate(X_train):
    y_train_pred_tree.append(dtLOCKED_AllXNOR(test_samples,0))

from sklearn.metrics import accuracy_score
print('Incorrect key:',accuracy_score(y_test, y_test_pred_tree), accuracy_score(y_train, y_train_pred_tree))

Correct key: 0.8232 0.83208
Incorrect key: 0.0974 0.09692


## Logic Locking: Mix of XOR/XNOR key gates selected randomly at each node

###### Key generation

In [20]:
key = np.random.randint(0,2,counter_if)
print(key)

[0 1 1 0 0 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 0 1 0 1 0 1 0 1 1 1 0 0 0 1 1 1 0
 1 0 1 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1
 0 1 0 1 1 1 0 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0
 0 0 1 1 1 1 0 1 0 1 1 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1
 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 1 0
 1 1 1 0 1 1 0 1 1 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 0 1
 1 1 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 0 1 1 1 0 1 0 1 0 0 1 1 0 1]


###### Selecting gates between XOR and XNOR based on key value. 0-XOR, 1-XNOR

#### Run this cell twice.

In [22]:
%%capture cap --no-stderr
#https://towardsdatascience.com/scikit-learn-decision-trees-explained-803f3812290d
from sklearn.tree import _tree
def tree_to_code(tree, feature_names, key, key_counter):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("def dtLOCKED_XOR_XNOR({},{}):".format('feature_set','key'))
    for i,pixel in enumerate(feature_names):
            print ("{}{}".format("  ", pixel+'='+'feature_set['+str(i)+']'))
    def recurse(node, depth, key_counter):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            if key[key_counter] == 0:
                print ("{}if ({} <= {}) and (({} <= {}) ^ key[ ])==({} <= {})  :".format(indent, name, int(round(threshold,3)),name, int(round(threshold,3)), name, int(round(threshold,3))))  #convert the threshold to integer
            else:
                print ("{}if {} <= {} and not(({} <= {}) ^ key[ ])==({} <= {})  :".format(indent, name, int(round(threshold,3)),name, int(round(threshold,3)), name, int(round(threshold,3))))  #convert the threshold to integer
            key_counter = key_counter +1
            recurse(tree_.children_left[node], depth + 1, key_counter)
            print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1, key_counter)
        else:
            print ("{}return {}".format(indent, np.argmax(tree_.value[node][0],axis=0)))

    recurse(0, 1, key_counter)

cols = range(784)
features = ['pixel'+str(i) for i in cols]
class_names = [str(i) for i in dt_clf.classes_]
key_counter=0
tree_to_code(dt_clf, features, key, key_counter)

with open('mnist_DTLock_XOR_XNOR.py', 'w') as f:
    f.write(cap.stdout)

###### Adding appropirate XOR/XNOR for key value inside key[_] field.

In [23]:
def search_key_lines(file_name):
    with open(file_name, 'r') as file:
        key_line_num = []
        file_content = file.readlines()
        for line_num,line in enumerate(file_content):
            if 'key[ ]' in line:
                key_line_num.append(line_num)        
        return(key_line_num)

def write_line_file(file_name, data_towrite, line_num):
    with open(file_name, 'r', encoding='utf-8') as file:
        file_content = file.readlines()
        file.close()
    file_content[line_num] = file_content[line_num].replace('key[ ]',f'key[{data_towrite}]')   
    with open(file_name, 'w', encoding='utf-8') as file:
        file.writelines(file_content)
        file.close()
    
def correct_keygate_xor_xnor(file_name, key):
    with open(file_name, 'r', encoding='utf-8') as file:
        file_content = file.readlines()
        file.close()
    for line_num,line in enumerate(file_content):
            if 'key[' in line:
                key_index = int(line.split('key[')[1].split(']')[0])
                if (key[key_index] == 1 and 'not' in line):
                    pass
                
                if (key[key_index] == 1 and 'not' not in line):
                    file_content[line_num] = ''.join(file_content[line_num].split('and')[0]+ 'and not' + file_content[line_num].split('and')[-1])
                
                if (key[key_index] == 0 and 'not' not in line):
                    pass
                
                if (key[key_index] == 0 and 'not' in line):
                    file_content[line_num] = file_content[line_num].replace('not','')
                    
    with open(file_name, 'w', encoding='utf-8') as file:
        file.writelines(file_content)
        file.close()
                    
def inspect_keygate_xor_xnor(file_name, key):
    with open(file_name, 'r', encoding='utf-8') as file:
        file_content = file.readlines()
        file.close()
    for line_num,line in enumerate(file_content):
            if 'key[' in line:
                key_index = int(line.split('key[')[1].split(']')[0])
                if (key[key_index] == 1 and 'not' in line) or (key[key_index] == 0 and 'not' not in line):
                    pass
                else:
                    pdb.set_trace()
                    
key_line_num = search_key_lines('mnist_DTLock_XOR_XNOR.py')
for i,line_num in enumerate(key_line_num):
    write_line_file('mnist_DTLock_XOR_XNOR.py', i, line_num)
correct_keygate_xor_xnor('mnist_DTLock_XOR_XNOR.py', key)

#Performing final inspection
inspect_keygate_xor_xnor('mnist_DTLock_XOR_XNOR.py', key)

###### Verify the accuracy by using this dumped decision rules. 

In [24]:
from mnist_DTLock_XOR_XNOR import dtLOCKED_XOR_XNOR
y_test_pred_tree = []
for i,test_samples in enumerate(X_test):
    y_test_pred_tree.append(dtLOCKED_XOR_XNOR(test_samples,key))

y_train_pred_tree = []
for i,test_samples in enumerate(X_train):
    y_train_pred_tree.append(dtLOCKED_XOR_XNOR(test_samples,key))

from sklearn.metrics import accuracy_score
print('Correct key:',accuracy_score(y_test, y_test_pred_tree), accuracy_score(y_train, y_train_pred_tree))

Correct key: 0.8232 0.83208


###### Verify the accuracy by using this dumped decision rules with incorrect key.

In [25]:
y_test_pred_tree = []
#shuffle the key list to simulate incorrect key
np.random.shuffle(key)
for i,test_samples in enumerate(X_test):
    y_test_pred_tree.append(dtLOCKED_XOR_XNOR(test_samples,key))

np.random.shuffle(key)
y_train_pred_tree = []
for i,test_samples in enumerate(X_train):
    y_train_pred_tree.append(dtLOCKED_XOR_XNOR(test_samples,key))

from sklearn.metrics import accuracy_score
print('Incorrect key:',accuracy_score(y_test, y_test_pred_tree), accuracy_score(y_train, y_train_pred_tree))

Incorrect key: 0.1085 0.17932
