# Programming Assignment 1
## model transformers:
### 4. Semistructured to Property Graph
Each of the six groups should choose one model transformer and implement it in Python, with a README file and some documentation. The implementation can be delivered in one of three ways: a command-line utility, a Python package, or a REST API. A model transformer takes as input (a) a data file (e.g., CSV) in a specific model and (b) a mapping configuration (we have seen several examples in class -- see the Model Transformation slides), and outputs a transformed data file. The developing group should also produce a sample input data file and its corresponding output data file. If an implementation uses an existing library or has any other dependency (e.g., it requires Java), the dependency should be mentioned in the README file so that users can configure their environment accordingly.

In [1]:
import argparse
import json

def load_json_data(input_file):
    """Parse the JSON file at ``input_file`` and return the decoded object.

    When the file does not exist, an error line is printed and ``None`` is
    returned instead of raising. Other failures (for example malformed JSON)
    propagate to the caller.
    """
    try:
        handle = open(input_file, 'r')
    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
        return None
    with handle:
        return json.load(handle)

# Command-line interface definition: two positional arguments, the data file
# and the mapping configuration file.
parser = argparse.ArgumentParser(description="Load JSON data from a file")
parser.add_argument("input_file", help="Path to the input JSON data file")
parser.add_argument("map_cfg_file", help="Path to the mapping configuration file")

# parse_args() is left disabled and the paths are hard-coded instead
# (presumably because this runs as a notebook cell -- TODO confirm).
# args = parser.parse_args()

# Sample datasets that have been used with this cell:
#   './people2.json' with './people_map_cfg.json'
#   './books.json'   with './books_map_cfg.json'
#   './movies.json'  with './movies_map_cfg.json'   <- currently active
input_file, map_cfg_file = './movies.json', './movies_map_cfg.json'

# Read the source data and the mapping configuration, in that order.
source_data, mapping_config = (
    load_json_data(path) for path in (input_file, map_cfg_file)
)

# Empty property graph; the cells below fill in the two lists.
property_graph = dict(graph=dict(nodes=[], edges=[]))

# add nodes to the property graph


# One node is emitted per element found under source_data[<label>], for every
# node mapping declared in the configuration.
for node_cfg in mapping_config['schema-map']['nodes']:
    for element in source_data[node_cfg['label']]:
        node = {
            "node": {
                "id": element[node_cfg['node-id']],
                "label": node_cfg['label'],
                "properties": {}
            }
        }

        # Map properties to the node
        for property_name, property_mapping in node_cfg['node-properties'].items():
            data_value = property_mapping.get("data_value", None)
            data_type = property_mapping.get("data_type", None)

            # Start from None for every property so that a value resolved for
            # the previous property can never leak into this one.
            property_value = None
            if data_value is not None:
                # data_value is a dot-separated path; descend one key at a
                # time so "a.b" reads element["a"]["b"].
                property_value = element
                for key in data_value.split('.'):
                    if isinstance(property_value, dict):
                        property_value = property_value.get(key, None)
                    else:
                        property_value = None
                    if property_value is None:
                        break

            # Properties that are absent from the element are simply omitted.
            if property_value is not None:
                node["node"]["properties"][property_name] = {
                    "datatype": data_type,  # taken verbatim from the mapping configuration
                    "data value": property_value
                }

        property_graph["graph"]["nodes"].append(node)
    # NOTE: the triple-quoted string below is a disabled earlier draft of the
    # edge creation and file export; it is a no-op string expression.
    '''
    # source data and add edges to the property graph
for j in range(len(mapping_config['schema-map']['edges'])):
    for element in source_data[mapping_config['schema-map']['edges'][j]['_source_type']]:
        source_id = element[mapping_config['schema-map']['edges'][j]['_source']]
        target = element.get(mapping_config['schema-map']['edges'][j]['edge-type'], [])

        for target_id in target:
            edge = {
                "edge": {
                    "id": len(property_graph["graph"]["edges"]) + 1,
                    "relationship": mapping_config['schema-map']['edges'][j]['relationship'],
                    "from_node_id": source_id,
                    "to_node_id": target_id,
                    "properties": {}
                }
            }

            property_graph["graph"]["edges"].append(edge)
    
# Save the property graph in JSON format
with open("property_graph.json", "w") as output_file:
    json.dump(property_graph, output_file, indent=4)

property_graph
'''


In [2]:
# Echo the in-memory property graph so the notebook renders it.
property_graph

{'graph': {'nodes': [{'node': {'id': 'The Murder of Nicole Brown Simpson',
     'label': 'Movies',
     'properties': {'Extract': {'datatype': 'string',
       'data value': "The Murder of Nicole Brown Simpson is a 2019 American crime horror film directed by Daniel Farrands. The film is loosely based on the murder of Nicole Brown Simpson, presenting a version of events in which Brown Simpson is murdered by serial killer Glen Edward Rogers, and not by O. J. Simpson, her ex-husband and the primary suspect in the case. Though Mena Suvari's performance as Nicole Brown was praised, the film was panned by critics."},
      'thumbnail_url': {'datatype': 'string',
       'data value': 'https://upload.wikimedia.org/wikipedia/en/e/ed/The_Murder_of_Nicole_Brown_Simpson_poster.jpg'},
      'thumbnail_width': {'datatype': 'string', 'data value': 263},
      'thumbnail_height': {'datatype': 'string', 'data value': 380},
      'Cast': {'datatype': 'list',
       'data value': ['Mena Suvari', 'Nick St

In [3]:
# Echo the loaded mapping configuration for inspection.
mapping_config

{'schema-map': {'nodes': [{'node-type': '$.Movies',
    'label': 'Movies',
    'node-id': 'title',
    'node-properties': {'Extract': {'data_type': 'string',
      'data_value': 'extract'},
     'thumbnail_url': {'data_type': 'string', 'data_value': 'thumbnail'},
     'thumbnail_width': {'data_type': 'string',
      'data_value': 'thumbnail_width'},
     'thumbnail_height': {'data_type': 'string',
      'data_value': 'thumbnail_height'},
     'Generes': {'data_type': 'list', 'data_value': 'generes'},
     'Cast': {'data_type': 'list', 'data_value': 'cast'}}},
   {'node-type': '$.People',
    'label': 'People',
    'node-id': 'Name',
    'node-properties': {'Age': {'data_type': 'numeric', 'data_value': 'Age'},
     'Gender': {'data_type': 'string', 'data_value': 'Gender'},
     'friends': {'data_type': 'list', 'data_value': 'friends'}}}],
  'edges': [{'edge-type': 'cast',
    'relationship': 'PLAYED_IN',
    'direction': 'Out',
    'edge-id': '',
    'edge-properties': '',
    '_source'

In [4]:
# Build edges by pairing every source-type node with every target-type node
# and keeping the pairs whose values match (a nested scan over both lists).
for edge in mapping_config['schema-map']['edges']:
    print(edge)
    src_type = edge['_source_type']
    trgt_type = edge['_target_type']
    src_match = edge['_source']    # field of the source node to compare, e.g. 'id'
    trgt_match = edge['_target']   # property name on the target node to compare against
    print(src_type,trgt_type,src_match, trgt_match)

    # Split the existing nodes by label; a node can land in both lists when
    # source and target types are the same.
    src_nodes = []
    target_nodes = []
    for node in property_graph['graph']['nodes']:
        if node['node']['label'] == src_type:
            src_nodes.append(node)
        if node['node']['label'] == trgt_type:
            target_nodes.append(node)
    print("Src Nodes")
    print(src_nodes)
    print("Trgt Nodes")
    print(target_nodes)

    for s_node in src_nodes:
        src_value = s_node['node'][src_match]
        for t_node in target_nodes:
            # Node properties are only stored when the source element had a
            # value, so the match property may be missing on some targets.
            target_prop = t_node['node']['properties'].get(trgt_match)
            if target_prop is None:
                continue
            target_value = target_prop['data value']

            if target_prop['datatype'] == 'list':
                # List-typed property: match on membership.
                print(src_value, target_value)
                is_match = src_value in target_value
            else:
                # Scalar property: match on equality.
                is_match = src_value == target_value
            if not is_match:
                continue

            property_graph["graph"]["edges"].append({
                "edge": {
                    # Running 1-based id across all edge mappings.
                    "id": len(property_graph["graph"]["edges"]) + 1,
                    "relationship": edge.get('relationship'),
                    "from_node_id": s_node['node']['id'],
                    "to_node_id": t_node['node']['id'],
                    "properties": {}
                }
            })
               
    

{'edge-type': 'cast', 'relationship': 'PLAYED_IN', 'direction': 'Out', 'edge-id': '', 'edge-properties': '', '_source': 'id', '_source_type': 'People', '_target': 'Cast', '_target_type': 'Movies'}
People Movies id Cast
Src Nodes
[{'node': {'id': 'Mena Suvari', 'label': 'People', 'properties': {'Age': {'datatype': 'numeric', 'data value': 20}, 'Gender': {'datatype': 'string', 'data value': 'Male'}, 'friends': {'datatype': 'list', 'data value': [2, 3]}}}}, {'node': {'id': 'Terry Crews', 'label': 'People', 'properties': {'Age': {'datatype': 'numeric', 'data value': 25}, 'Gender': {'datatype': 'string', 'data value': 'Female'}, 'friends': {'datatype': 'list', 'data value': [1]}}}}, {'node': {'id': 'Ludacris', 'label': 'People', 'properties': {'Age': {'datatype': 'numeric', 'data value': 22}, 'Gender': {'datatype': 'string', 'data value': 'Male'}, 'friends': {'datatype': 'list', 'data value': [1, 4]}}}}, {'node': {'id': 'Joseph Julian Soria', 'label': 'People', 'properties': {'Age': {'datat

In [5]:
# `node` is not defined in this cell: it is the loop variable left bound by an
# earlier cell's loop over nodes, so this prints the last node visited there.
print(node)

{'node': {'id': 'Joseph Julian Soria', 'label': 'People', 'properties': {'Age': {'datatype': 'numeric', 'data value': 52}, 'Gender': {'datatype': 'string', 'data value': 'Male'}, 'friends': {'datatype': 'list', 'data value': [3]}}}}


In [6]:
# Echo only the inner 'graph' dict, i.e. the 'nodes' and 'edges' lists.
property_graph['graph']

{'nodes': [{'node': {'id': 'The Murder of Nicole Brown Simpson',
    'label': 'Movies',
    'properties': {'Extract': {'datatype': 'string',
      'data value': "The Murder of Nicole Brown Simpson is a 2019 American crime horror film directed by Daniel Farrands. The film is loosely based on the murder of Nicole Brown Simpson, presenting a version of events in which Brown Simpson is murdered by serial killer Glen Edward Rogers, and not by O. J. Simpson, her ex-husband and the primary suspect in the case. Though Mena Suvari's performance as Nicole Brown was praised, the film was panned by critics."},
     'thumbnail_url': {'datatype': 'string',
      'data value': 'https://upload.wikimedia.org/wikipedia/en/e/ed/The_Murder_of_Nicole_Brown_Simpson_poster.jpg'},
     'thumbnail_width': {'datatype': 'string', 'data value': 263},
     'thumbnail_height': {'datatype': 'string', 'data value': 380},
     'Cast': {'datatype': 'list',
      'data value': ['Mena Suvari', 'Nick Stahl', 'Taryn Mannin

In [None]:
import json
import sys

def _load_json_data(input_file):
    """Load and return the JSON document stored at ``input_file``.

    The file is read as UTF-8. Any failure to open, decode or parse it is
    reported on stderr and terminates the process with exit status 2.

    Args:
        input_file: Path of the JSON file to read.

    Returns:
        The decoded JSON value (typically a dict or list).

    Raises:
        SystemExit: With code 2 when the file cannot be loaded.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as input_json:
            return json.load(input_json)
    except Exception as e:
        # Deliberately broad: every load failure is fatal for the caller, so
        # all of them are turned into the same exit status. The message goes
        # to stderr so it does not mix with regular output.
        print("Input file Error! {}".format(e), file=sys.stderr)
        sys.exit(2)

def semistruct_to_pgraph(in_file, mapcfg_file, out_file='./pgraph.json', debug=False):
    """Transform semi-structured JSON data into a property graph JSON file.

    The mapping configuration is read from its ``'schema-map'`` entry:

    * ``'nodes'``: list of node mappings, each with ``'label'`` (also the key
      under which the elements are found in the source data), ``'node-id'``
      (element key used as node id) and ``'node-properties'`` (property name
      mapped to a dict with ``'data_type'`` and a dot-separated
      ``'data_value'`` path).
    * ``'edges'``: optional list of edge mappings, each with ``'_source_type'``
      and ``'_target_type'`` (node labels), ``'_source'`` (field of the source
      node, e.g. ``'id'``), ``'_target'`` (property name on the target node)
      and ``'relationship'``.

    Args:
        in_file: Path of the JSON source data file.
        mapcfg_file: Path of the JSON mapping configuration file.
        out_file: Path the property graph is written to.
        debug: When true, print the inputs, intermediate edge-matching details
            and the generated graph.

    Returns:
        None. The result is written to ``out_file`` as indented JSON.
    """
    if debug:
        print('input file = {}, mapping configuration file = {}, output file = {}'.format(in_file, mapcfg_file, out_file))

    # Read the JSON source data and the JSON mapping configuration.
    source_data = _load_json_data(in_file)
    mapping_config = _load_json_data(mapcfg_file)

    if debug:
        print("input data:")
        print(source_data)
        print("mapping configuration:")
        print(mapping_config)

    schema_map = mapping_config['schema-map']
    nodes = _build_nodes(source_data, schema_map['nodes'])
    # A configuration without an 'edges' section yields a graph with no edges.
    edges = _build_edges(nodes, schema_map.get('edges', []), debug=debug)

    property_graph = {
        "graph": {
            "nodes": nodes,
            "edges": edges
        }
    }

    # Save the property graph in JSON format
    if debug:
        print("generated property graph:")
        print(property_graph)
    with open(out_file, "w") as output_file:
        json.dump(property_graph, output_file, indent=4)

    return


def _resolve_path(element, dotted_path):
    """Return the value found by following ``dotted_path`` into ``element``, or None if any step is missing."""
    current = element
    for key in dotted_path.split('.'):
        if not isinstance(current, dict):
            return None
        current = current.get(key)
        if current is None:
            return None
    return current


def _build_nodes(source_data, node_maps):
    """Create one node dict per source element for every node mapping."""
    nodes = []
    for node_map in node_maps:
        label = node_map['label']
        for element in source_data[label]:
            properties = {}
            for property_name, property_mapping in node_map['node-properties'].items():
                data_value = property_mapping.get("data_value")
                if data_value is None:
                    continue
                # Resolved afresh for each property, so an absent value can
                # never inherit the previous property's value.
                property_value = _resolve_path(element, data_value)
                if property_value is None:
                    continue
                properties[property_name] = {
                    "datatype": property_mapping.get("data_type"),
                    "data value": property_value
                }
            nodes.append({
                "node": {
                    "id": element[node_map['node-id']],
                    "label": label,
                    "properties": properties
                }
            })
    return nodes


def _build_edges(nodes, edge_maps, debug=False):
    """Create edges by matching source-node fields against target-node properties."""
    edges = []
    for edge_map in edge_maps:
        src_type = edge_map['_source_type']
        trgt_type = edge_map['_target_type']
        src_match = edge_map['_source']
        trgt_match = edge_map['_target']
        src_nodes = [n for n in nodes if n['node']['label'] == src_type]
        target_nodes = [n for n in nodes if n['node']['label'] == trgt_type]
        if debug:
            print(edge_map)
            print(src_type, trgt_type, src_match, trgt_match)
            print("Src Nodes")
            print(src_nodes)
            print("Trgt Nodes")
            print(target_nodes)

        # Pairwise scan of sources against targets (quadratic in node count).
        for s_node in src_nodes:
            src_value = s_node['node'][src_match]
            for t_node in target_nodes:
                # Properties are only stored when the element had a value, so
                # the match property can be missing on some target nodes.
                target_prop = t_node['node']['properties'].get(trgt_match)
                if target_prop is None:
                    continue
                target_value = target_prop['data value']

                if target_prop['datatype'] == 'list':
                    if debug:
                        print(src_value, target_value)
                    is_match = src_value in target_value
                else:
                    is_match = src_value == target_value
                if not is_match:
                    continue

                edges.append({
                    "edge": {
                        # Running 1-based id across all edge mappings.
                        "id": len(edges) + 1,
                        "relationship": edge_map.get('relationship'),
                        "from_node_id": s_node['node']['id'],
                        "to_node_id": t_node['node']['id'],
                        "properties": {}
                    }
                })
    return edges
