<a href="https://colab.research.google.com/github/saurabh1mohite/TDVR/blob/main/Data2Graph2Sents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import glob
import hashlib
import pandas as pd
import networkx as nx
from tqdm import tqdm
from joblib import Parallel, delayed
# from param_parser import parameter_parser
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# !git clone https://github.com/benedekrozemberczki/graph2vec.git

In [None]:
class WeisfeilerLehmanMachine:
    """
    Weisfeiler Lehman relabelling of a graph's nodes.

    After construction ``extracted_features`` holds the stringified initial
    node labels followed by the hashed labels produced by every iteration,
    and ``features`` holds the labels of the last iteration that ran.
    """
    def __init__(self, graph, features, iterations):
        """
        Store the inputs and run the feature extraction right away.
        :param graph: Graph object exposing ``nodes()`` and ``neighbors(node)``.
        :param features: Mapping from node to its initial label.
        :param iterations: Number of Weisfeiler Lehman iterations to run.
        """
        self.iterations = iterations
        self.graph = graph
        self.features = features
        self.nodes = self.graph.nodes()
        # Seed the feature list with the initial labels, in mapping order.
        self.extracted_features = [str(label) for label in features.values()]
        self.do_recursions()

    def do_a_recursion(self):
        """
        Run one WL relabelling pass over every node.
        :return new_features: Mapping from node to its new md5 label.
        """
        relabelled = {}
        for node in self.nodes:
            neighbour_labels = [
                self.features[neighbour]
                for neighbour in self.graph.neighbors(node)
            ]
            # Own label first, then the neighbours' labels in sorted order so
            # the signature does not depend on neighbour iteration order.
            parts = [str(self.features[node])]
            parts += sorted(str(label) for label in neighbour_labels)
            signature = "_".join(parts)
            relabelled[node] = hashlib.md5(signature.encode()).hexdigest()
        self.extracted_features = self.extracted_features + list(relabelled.values())
        return relabelled

    def do_recursions(self):
        """
        Run ``iterations`` relabelling passes, each one feeding the next.
        """
        for _ in range(self.iterations):
            self.features = self.do_a_recursion()

In [None]:
def path2name(path):
    """
    Return the file name of a path without its directory and last extension.
    :param path: Path to a file.
    :return: The bare file name.
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    return stem

def dataset_reader(path):
    """
    Function to read the graph and features from a json file.
    :param path: The path to the graph json.
    :return graph: The graph object built from the "edges" list.
    :return features: Features hash table (node -> label).
    :return name: Name of the graph (file name without extension).
    """
    name = path2name(path)
    # Use a context manager so the file handle is closed deterministically;
    # this function is called once per graph file.
    with open(path) as handle:
        data = json.load(handle)
    graph = nx.from_edgelist(data["edges"])

    # NOTE(review): the guard tests "features1" while the branch reads
    # "features". For files that carry only "edges" this always falls through
    # to the degree-based labels below. The mismatch is kept as-is because it
    # may be a deliberate switch to force degree labels - confirm intent.
    # Also note that JSON object keys are strings, so labels read from the
    # file would need their keys converted before they match integer node ids.
    if "features1" in data.keys():
        features = data["features"]
    else:
        # Default labelling: every node is labelled with its degree.
        features = dict(graph.degree(graph.nodes))

    return graph, features, name

def feature_extractor(path, rounds):
    """
    Build the WL feature document of a single graph file.
    :param path: The path to the graph json.
    :param rounds: Number of WL iterations.
    :return doc: TaggedDocument whose words are the extracted WL features
                 and whose single tag is "g_" followed by the graph name.
    """
    graph, features, name = dataset_reader(path)
    wl_words = WeisfeilerLehmanMachine(graph, features, rounds).extracted_features
    tag = "g_" + name
    return TaggedDocument(words=wl_words, tags=[tag])

def save_embedding(output_path, model, files, dimensions):
    """
    Write one embedding row per graph file to a csv, sorted by graph name.
    :param output_path: Path to the embedding csv.
    :param model: The embedding model object; vectors are looked up through
                  ``model.docvecs`` under the tag "g_<graph name>".
    :param files: The list of graph files.
    :param dimensions: The embedding dimension parameter (number of x_ columns).
    """
    rows = []
    for file_path in files:
        graph_name = path2name(file_path)
        vector = model.docvecs["g_" + graph_name]
        rows.append([graph_name, *vector])
    header = ["type"] + [f"x_{dim}" for dim in range(dimensions)]
    table = pd.DataFrame(rows, columns=header).sort_values(["type"])
    table.to_csv(output_path, index=None)

def main(args):
    """
    Collect the graph json files and extract WL features from each of them.
    No embedding is learned or saved here; the documents are only returned.
    :param args: Mapping with an 'input_path' entry naming the graph folder.
    :return document_collections: One tagged document per graph file.
    """
    pattern = os.path.join(args['input_path'], "*.json")
    graphs = glob.glob(pattern)
    print("\nFeature extraction started.\n")
    # Sequential on purpose; a joblib Parallel variant was tried and disabled.
    document_collections = []
    for graph_path in tqdm(graphs):
        document_collections.append(feature_extractor(graph_path, 2))
    print("\nOptimization started.\n")
    return document_collections

In [None]:
from zipfile import ZipFile
# Unpack the zipped JSONL dataset from Drive into /content.
archive_path = '/content/drive/MyDrive/NLP/Trance/JSON/data.jsonl.zip'
with ZipFile(archive_path) as archive:
    archive.extractall('/content')

In [None]:
# Create the `dataset` output directory relative to the current working
# directory. Later cells write to /content/dataset, so this presumably relies
# on the notebook running with /content as its working directory - confirm.
!mkdir dataset

In [None]:
# Node vocabulary of the scene graphs, in id order:
#   object slots obj9 .. obj0, then shapes, sizes, materials and colours
#   (gray, red, blue, green, brown, purple, cyan, and yellow),
#   then every (x, y) grid cell with both coordinates in -40 .. 40.
nodes = (
    ['obj' + str(slot) for slot in reversed(range(10))]
    + ['cube', 'sphere', 'cylinder']
    + ['small', 'medium', 'large']
    + ['glass', 'rubber', 'metal']
    + ['gray', 'red', 'blue', 'green', 'brown', 'purple', 'cyan', 'yellow']
    + [(x, y) for x in range(-40, 41) for y in range(-40, 41)]
)

# Lookup tables in both directions between a node and its integer id.
ids = list(range(len(nodes)))
node2ids = {node: node_id for node_id, node in enumerate(nodes)}
id2nodes = dict(enumerate(nodes))

In [None]:
from tqdm.notebook import tqdm
import json
import networkx as nx
# Convert the first 6% of the JSONL records into three edge-list graphs each
# (initial state, final state, transformation) plus one transformation
# sentence per record. Relies on `node2ids` built in the vocabulary cell.
ctr = 0
predicted_transformations = []
actual_transformations = []
factorials = {}

# Example of one record's 'transformations' list:
# [{'attr': 'color', 'obj_idx': 2, 'options': ['cyan'], 'val': 'cyan'},
#  {'attr': 'position', 'obj_idx': 2, 'options': [['behind', 2]],
#   'type': 'inner', 'val': ['behind', 2]},
#  {'attr': 'color', 'obj_idx': 4, 'options': ['brown'], 'val': 'brown'}]

# Direction name -> (debug flag, x unit, y unit).
_DIRECTION_MOVES = {
    'behind': (0, 1, 0),
    'front': (1, -1, 0),
    'right': (2, 0, 1),
    'left': (3, 0, -1),
    'behind-left': (4, 1, -1),
    'behind-right': (5, 1, 1),
    'front-left': (6, -1, -1),
    'front-right': (7, -1, 1),
}
_STEP_SIZE = 10   # grid units covered by one movement step
_GRID_MIN = -40   # the position nodes in the vocabulary span -40 .. 40
_GRID_MAX = 40


def _moved_position(position, direction, steps):
    """Return the grid cell reached from `position`, clamped to the grid."""
    try:
        flag, unit_x, unit_y = _DIRECTION_MOVES[direction]
    except KeyError:
        # Fail loudly instead of silently reusing a previous record's position.
        raise ValueError('unknown move direction: ' + repr(direction)) from None
    raw = [
        position[0] + unit_x * steps * _STEP_SIZE,
        position[1] + unit_y * steps * _STEP_SIZE,
    ]
    clamped = [max(_GRID_MIN, min(_GRID_MAX, coord)) for coord in raw]
    if clamped != raw:
        # Diagnostic: report the unclamped target and the direction flag.
        print(raw, flag)
    return tuple(clamped)


def _transformation_sentence(transformation):
    """Return the sentence fragment (with trailing space) for one step."""
    obj_idx = str(transformation['obj_idx'])
    val = transformation['val']
    if transformation['attr'] == 'position':
        return 'move ' + obj_idx + ' towards ' + val[0] + ' by ' + str(val[1]) + ' steps '
    # One generic wording for every non-position attribute. This yields the
    # same text as before for color/shape/material and no longer drops other
    # attributes (e.g. 'size', if the data contains such steps).
    return 'change ' + transformation['attr'] + ' of ' + obj_idx + ' to ' + val + ' '


def _transformation_edges(transformation, init_objects):
    """Return the [object, old value] and [object, new value] edges of a step."""
    obj_node = node2ids['obj' + str(transformation['obj_idx'])]
    attr = transformation['attr']
    # NOTE(review): the "old" value is always read from the initial state,
    # even when an earlier step already changed the same object.
    current = init_objects[transformation['obj_idx']]
    if attr == 'position':
        old_value = tuple(current['position'])
        val = transformation['val']
        new_value = _moved_position(old_value, val[0], val[1])
    else:
        old_value = current[attr]
        new_value = transformation['val']
    return [[obj_node, node2ids[old_value]], [obj_node, node2ids[new_value]]]


def _state_edges(objects):
    """Return one [object, attribute value] edge per attribute of each object."""
    edges = []
    for obj_idx, obj in enumerate(objects):
        obj_node = node2ids['obj' + str(obj_idx)]
        for key, value in obj.items():
            # Positions are lists in the JSON; the vocabulary keys are tuples.
            target = tuple(value) if key == 'position' else value
            edges.append([obj_node, node2ids[target]])
    return edges


# Count the records first so that only a fixed share of them is converted.
with open('/content/data.jsonl', 'r') as f:
    num_lines = sum(1 for _ in f)
stop_point = int(num_lines* .06)
transformations = {}
transformation_sents = []
with open('/content/data.jsonl', 'r') as f:
    for line in tqdm(f, total=stop_point):
        record = json.loads(line)
        init_state = record['states'][0]['objects']
        final_state = record['states'][1]['objects']

        transformation_edges = []
        sentence_parts = []
        for transformation in record['transformations']:
            sentence_parts.append(_transformation_sentence(transformation))
            transformation_edges.extend(_transformation_edges(transformation, init_state))
        transformation_sents.append(''.join(sentence_parts))

        # Each state gets its own edge list; final-state position edges used
        # to be appended to the initial graph by mistake.
        init_dict = {'edges': _state_edges(init_state)}
        final_dict = {'edges': _state_edges(final_state)}
        transformation_dict = {'edges': transformation_edges}

        with open('/content/dataset/init-' + str(ctr) + '.json', 'w') as fp:
            json.dump(init_dict, fp)
        with open('/content/dataset/final-' + str(ctr) + '.json', 'w') as fp:
            json.dump(final_dict, fp)
        with open('/content/dataset/transformation-'+str(ctr)+'.json', 'w') as fp:
            json.dump(transformation_dict, fp)
        ctr += 1
        if ctr == stop_point:
            break
with open('/content/transformation_sents.json', 'w') as fp:
    json.dump(transformation_sents, fp)

  0%|          | 0/31800 [00:00<?, ?it/s]

[40, 55] 5
[13, 45] 7


In [None]:
transformation_sents[0]

'change color of 2 to cyan move 2 towards behind by 2 steps change color of 4 to brown '

In [None]:
# Extract WL feature documents for every graph json in the dataset folder.
args = dict(input_path='/content/dataset')
document_collections = main(args)


Feature extraction started.



  0%|          | 0/95400 [00:00<?, ?it/s]


Optimization started.



In [None]:
# Group the WL documents by record index. Tags look like "g_<kind>-<index>";
# each record maps the tag prefix to its space-joined WL words and also
# carries the record's transformation sentence.
d = {}
temp_dataset = {}
for doc in document_collections:
    tag_parts = doc.tags[0].split('-')
    graph_kind = tag_parts[0]
    record_idx = int(tag_parts[1])
    record = temp_dataset.get(record_idx)
    if record is None:
        # First document seen for this record: the sentence is added right
        # after the graph entry, which fixes the key order of the record.
        record = temp_dataset[record_idx] = {}
        record[graph_kind] = ' '.join(doc.words)
        record['transformation'] = transformation_sents[record_idx]
    else:
        record[graph_kind] = ' '.join(doc.words)

In [None]:
temp_dataset[0]

{'g_final': '4 2 5 4 2 4 1 6 4 1 4 4 4 3 4 2 4 2 4 1 2 4 4 1 4 9101c9722b4fe12f810a5e90fd7db826 0b053511e0180e19cb9f472b1726fb0b 44502c72f4d0fa22832333a7efa43831 dc179b89ab1d21903def8758075b43ed 0b053511e0180e19cb9f472b1726fb0b 0ca741c38743e64f764e31b46d6ba4f7 34b8a5e9772cd73daba26c68a209127f 6a6b0a90457388b86ba4cab035ba8540 6757b1787ca8698d4378a227021bcd47 34b8a5e9772cd73daba26c68a209127f dc179b89ab1d21903def8758075b43ed dc179b89ab1d21903def8758075b43ed f44491e89693d240390c8e681c5822c0 1e736e30f50dfe42581bdf1c20181487 8c89d5843c82b9c4046d441b18d8ecea 0b053511e0180e19cb9f472b1726fb0b c8200661feb7927861f5a7f5f6b51e1d 0b053511e0180e19cb9f472b1726fb0b 267b63deca7e543f4bf71d06d36f3a9f 34b8a5e9772cd73daba26c68a209127f 0b053511e0180e19cb9f472b1726fb0b 8c89d5843c82b9c4046d441b18d8ecea e23527305d5ae15eb107f46da3ff53a1 34b8a5e9772cd73daba26c68a209127f 8c89d5843c82b9c4046d441b18d8ecea 9c711194feab951b18a086aa71711cf3 19dbf3ba32064563fd077b1a75a7a4bb 487dd6dd4b655c8d4180a83db48efbc1 73028695b7df9

In [None]:
# Flatten the per-index records into a list (dict insertion order) and
# display the first one.
temp1 = list(temp_dataset.values())
temp1[0]

{'g_final': '4 1 4 4 3 4 2 3 5 3 4 1 4 4 2 1 4 3 4 3 4 4 4 4 1 38a8cf581986627f062b87819e90938f 34b8a5e9772cd73daba26c68a209127f dc179b89ab1d21903def8758075b43ed dc179b89ab1d21903def8758075b43ed 1e736e30f50dfe42581bdf1c20181487 3945568d57088eeb68923b91b5f4fdfc 0b053511e0180e19cb9f472b1726fb0b 1e736e30f50dfe42581bdf1c20181487 44502c72f4d0fa22832333a7efa43831 1e736e30f50dfe42581bdf1c20181487 be3f508cb041c6f8d1868798c1fa5622 34b8a5e9772cd73daba26c68a209127f dc179b89ab1d21903def8758075b43ed 6757b1787ca8698d4378a227021bcd47 0b053511e0180e19cb9f472b1726fb0b 34b8a5e9772cd73daba26c68a209127f 833b96ad41c4b55dc1b0da7e2b5fada2 1e736e30f50dfe42581bdf1c20181487 89af402e42075dcd347caa4f937e31b7 1e736e30f50dfe42581bdf1c20181487 3945568d57088eeb68923b91b5f4fdfc 26798bf267ec9119935f4f6037db99ff cae11adfc356c3403a4803f6ab37a9d6 38a8cf581986627f062b87819e90938f 34b8a5e9772cd73daba26c68a209127f 26bbb4552413b2d26862563cf89ce7a6 27ef3b7d79fcd676065670f21b09e65d 1b381846845889177c690a5c8ab71d83 db374bef6f449

In [None]:
from io import StringIO

# Build the table from the records. Recent pandas versions deprecate passing
# a literal JSON string to read_json, so hand it a file-like object instead.
df = pd.read_json(StringIO(json.dumps(temp1)), orient='records')

In [None]:
# Persist the table without header or index.
# NOTE(review): with no header row the column order is whatever order `df`
# ended up with - presumably the key order of the records; confirm that the
# consumer of this file does not assume a fixed order.
dataset_csv_path = '/content/drive/MyDrive/NLP/Trance/CSV/dataset.txt'
df.to_csv(dataset_csv_path, header=False, index=False)

In [None]:
# Longest field across all records, measured in space-separated tokens.
# The comparison and the stored value both use the token count; previously
# the character length was compared against a token count, so the printed
# value was not the true maximum.
maxLen = max(
    (len(text.split(' ')) for entry in tqdm(temp1) for text in entry.values()),
    default=0,
)
print(maxLen)

  0%|          | 0/31800 [00:00<?, ?it/s]

75
