## Imports & helper functions

In [None]:
%pip install --upgrade openai
%pip install --upgrade tqdm
%pip install --upgrade ipywidgets

In [None]:
import json

def read_json_file(file_path):
    """Load the UTF-8 encoded JSON document at `file_path` and return the parsed value."""
    with open(file_path, mode='r', encoding="utf-8") as handle:
        return json.load(handle)


def parse_json(json_string):
    """Parse the JSON text `json_string` and return the resulting Python value."""
    return json.loads(json_string)


def prettify_json(obj):
    """Serialize `obj` to JSON text indented by two spaces."""
    return json.dumps(obj, indent=2)


def write_to_json_file(obj, file_path):
    """Write `obj` as JSON (two-space indent) to `file_path`.

    The file is opened as UTF-8, the same encoding `read_json_file` uses, so
    reading and writing no longer depend on the platform's default encoding.
    An existing file at `file_path` is overwritten.
    """
    with open(file_path, 'w', encoding="utf-8") as json_file:
        json.dump(obj, json_file, indent=2)


In [None]:
import configparser

def read_ini_file(file_path):
    """Read the INI file at `file_path` into a plain nested dictionary.

    Returns a dict that maps each section name to a dict of that section's
    option names and values. A file that cannot be read produces an empty
    dict, because `ConfigParser.read` skips such files silently.
    """
    parser = configparser.ConfigParser()
    parser.read(file_path)

    sections = {}
    for name in parser.sections():
        sections[name] = dict(parser.items(name))
    return sections


In [None]:
from openai import OpenAI

In [None]:
import re


def remove_java_comments(java_source):
    """Strip `//` line comments and `/* ... */` block comments from Java source.

    String literals, text blocks and character literals are matched first and
    copied through unchanged, so comment markers inside them (for example the
    `//` in "http://example.org") are no longer mistaken for comments.

    Parameters:
        java_source: Java source text.

    Returns:
        The source without comments, with leading and trailing whitespace
        stripped.
    """
    # Alternatives are tried left to right at each position, so a literal that
    # starts before a comment marker wins and shields the marker inside it.
    pattern = re.compile(
        r'"""[\s\S]*?"""'          # text block
        r'|"(?:\\.|[^"\\\n])*"'    # string literal (single line)
        r"|'(?:\\.|[^'\\\n])*'"    # character literal
        r"|//[^\n]*"               # line comment, up to the end of the line
        r"|/\*[\s\S]*?\*/"         # block comment, possibly spanning lines
    )

    def keep_literals(match):
        token = match.group(0)
        # Only the two comment alternatives start with a slash.
        return "" if token.startswith("/") else token

    return pattern.sub(keep_literals, java_source).strip()


In [None]:
def sentence(s):
    '''
    Capitalize the first letter of the string `s` and ensure that it ends
    with a period, unless it already ends with one of `.?!…~–—`.

    Surrounding whitespace is stripped first. An empty or whitespace-only
    input returns an empty string instead of raising IndexError.
    '''
    t = s.strip()
    if not t:
        return t
    capitalized = f'{t[0].upper()}{t[1:]}'
    if t[-1] in '.?!…~–—':
        return capitalized
    return f'{capitalized}.'

In [None]:
def transform_graph(graph):
    """Index the nodes and edges of a graph dictionary.

    `graph['elements']['nodes']` and `graph['elements']['edges']` are lists of
    items that each carry a `data` dictionary.

    Returns a `(nodes, edges)` tuple:
      - `nodes` maps each node id to the node's `data` dictionary;
      - `edges` maps each edge label to the list of edge `data` dictionaries
        with that label.

    An edge without a `label` gets one built by joining its `labels` with
    commas; that label is stored back into the edge's `data` (in place).
    """
    elements = graph['elements']

    nodes = {}
    for item in elements['nodes']:
        payload = item['data']
        nodes[payload['id']] = payload

    edges = {}
    for item in elements['edges']:
        payload = item['data']
        if 'label' not in payload:
            payload['label'] = ','.join(payload['labels'])
        edges.setdefault(payload['label'], []).append(payload)

    return (nodes, edges)


In [None]:
def invert(edgeList):
    """Return reversed copies of the edges in `edgeList`.

    Each new edge swaps `source` and `target` and has `inv_` prepended to its
    label (a missing label counts as an empty string). All other keys are
    copied as they are. The input edges are not modified.
    """
    swapped_keys = ('source', 'target', 'label')
    result = []
    for original in edgeList:
        flipped = {
            'source': original['target'],
            'target': original['source'],
            'label': "inv_" + original.get('label', ''),
        }
        flipped.update(
            (name, value) for name, value in original.items() if name not in swapped_keys
        )
        result.append(flipped)
    return result

def find_paths(edgeList1, edgeList2):
    """Join two edge lists into two-step paths.

    A path is produced for every pair of edges where the target of an edge in
    `edgeList1` equals the source of an edge in `edgeList2`.

    Parameters:
        edgeList1: edges (dicts with `source` and `target`) for the first step.
        edgeList2: edges (dicts with `source` and `target`) for the second step.

    Returns:
        A set of `(source1, middle, target2)` tuples.
    """
    # Keep every source per target: a plain target -> source dict would drop
    # all but the last first-step edge when several edges share a target.
    sources_by_target = {}
    for edge in edgeList1:
        sources_by_target.setdefault(edge['target'], []).append(edge['source'])

    return {
        (start, edge['source'], edge['target'])
        for edge in edgeList2
        for start in sources_by_target.get(edge['source'], ())
    }

## Parameters

In [None]:
# If True, the LLM API is not called; the prompts are still written to the log file.
only_print_prompt = False

In [None]:
# Load the settings from `config.ini` in the working directory and pick the
# values of its `[project]` section.
config = read_ini_file('config.ini')
project_name = config['project']['name']  # used to name the output file
project_desc = config['project']['desc']  # not referenced in the visible part of the notebook
ifile = config['project']['ifile']  # path of the input graph JSON file

## Read graph file

Load the knowledge graph that was extracted using javapers.

In [None]:
# `graph` keeps the structure as loaded; it is used again for the final export.
graph = read_json_file(ifile)
# The file is read a second time here, so the edge labels that
# transform_graph writes into the edge data do not end up in `graph`.
# NOTE(review): confirm whether the second read is intentional.
nodes,edges = transform_graph(read_json_file(ifile))

## Connect to OpenAI

In [None]:
# Collect the optional client settings from the `[openai]` section: `apikey`
# becomes the `api_key` argument and `apibase` the `base_url` argument.
cliet_args = {
    arg_name: config['openai'][ini_key]
    for ini_key, arg_name in (('apikey', 'api_key'), ('apibase', 'base_url'))
    if ini_key in config['openai']
}

# Model name, with a fallback when the section does not set one.
model = config['openai'].get('model', "gpt-3.5-turbo")

# Show which client arguments were set (without their values) and the model.
(list(cliet_args.keys()), model)

In [None]:
# Create the API client from the optional settings in `cliet_args`.
client = OpenAI(**cliet_args)
# Display the endpoint the client will talk to.
client.base_url

In [None]:
# Smoke test of the LLM server: request one chat completion with the
# configured model and temperature 0.
completion = client.chat.completions.create(
    model=model,
    messages=[{"role":"user","content":"Hello, world!"}],
    temperature=0
)
# Print the text of the first returned choice.
print(completion.choices[0].message.content)

## Elements to be inspected

In [None]:
# Sorted list of id triples obtained by following a `contains` edge and then
# a `hasScript` edge. The cells below unpack each triple as
# (package id, class id, method id).
# NOTE(review): assumes `contains` links packages to classes and `hasScript`
# links classes to methods -- confirm against the javapers graph schema.
methods = sorted(find_paths(edges['contains'], edges['hasScript']))
len(methods)

In [None]:
# Distinct (package id, class id) pairs taken from `methods`, sorted.
classes = sorted({(pkg,clz) for pkg,clz,_ in methods})
len(classes)

In [None]:
# Distinct package ids taken from `classes`, sorted.
packages = sorted({pkg for pkg,_ in classes})
len(packages)

In [None]:
# Group the method ids by package id and class id:
#   {pkg_id: {cls_id: [met_id, ...]}}
# `methods` is a sorted list of (package id, class id, method id) triples, so
# one pass yields packages, classes and methods in sorted order. This replaces
# a nested comprehension that rescanned every method for every class.
hierarchy = {}
for pkg_id, cls_id, met_id in methods:
    hierarchy.setdefault(pkg_id, {}).setdefault(cls_id, []).append(met_id)

len(hierarchy)

In [None]:
import time

# Timestamp of this run (local time); it becomes part of the log file name.
timestr = time.strftime("%Y%m%d-%H%M%S")
timestr

## Ask LLM to summarize methods, classes, and packages

In [None]:
# Prompt for summarizing one method. Placeholders filled with `str.format`:
# `op_name`, `struct_kind`, `struct_name` and `op_src`. Literal braces of the
# schema sketch are doubled so that `format` leaves them in place.
# Fixes to the schema sketch: `postConditions` was described as
# "pre-conditions", and the `howToUse` / `howItWorks` entries lacked commas.
method_prompt_template = '''This is method `{op_name}` of {struct_kind} `{struct_name}`:

```java
{op_src}
```

(Keep in mind that any code comments may be incorrect or outdated.)

Explain the above method on the following aspects:

{{ description: "Describe the functionality of the method in one sentence.",
  parameters: [ {{ name:..., type:..., description:... }}, ... ],
  returns: {{ type:..., description: ... }}, // In case of a constructor, consider the constructed class as the return type.
  reason: "Explain, in one sentence, the reason why the method is provided or the design rationale of the method.",
  howToUse: "Describe the usage or the expected set-up of using the method in less than 3 sentences.",
  howItWorks: "Describe the implementation details of the method in less than 5 sentences.",
  assertions: {{ preConditions: ["pre-conditions of the method", ...], postConditions: ["post-conditions of the method", ...] }}
}}

Respond with a well-formatted JSON object. Do not use any quote marks ("'`) within the JSON values.'''


In [None]:
# Prompt for summarizing one class. Placeholders filled with `str.format`:
# `struct_type`, `struct_name`, `ancestors`, `fields` and `methods`.
# The model is asked for a JSON object with the keys `roleStereotype`,
# `layer` and `description`; the doubled braces are literal braces.
class_prompt_template = '''A Java {struct_type} `{struct_name}` specializes the following class(es) or interface(s):

{ancestors}

This {struct_type} contains the following field(s) and method(s):

Fields:

{fields}

Methods:

{methods}

Describe the responsibilities of this {struct_type} in one sentence. Frame the class in one of the following role stereotypes:

- **Information Holder** is responsible for knowing facts and providing information to other objects. In Java, it can be a POJO or a Java Bean.

- **Service Provider** is responsible for handling requests and performing specific services. It usually implements a specific interface with a small number of methods.

- **Structurer** is responsible for managing relationships and constraints among related things. It is usually a collection or mapping of some sort.

- **Controller** is responsible for making decisions, directing the work of others, and handling important events. It directs the flow of the application or business process.

- **Coordinator** is responsible for managing the actions of a group of workers and facilitating communication and work of other objects. It delegates requests to other objects. Very abstract classes and interfaces might be coordinators as they delegate the work to subclasses.

- **User Interfacer** is responsible for transmitting user requests for action or display/render information that can be updated. It handles interactions with users.

- **External Interfacer** is responsible for loading and storing information from/to external services, including database systems, web services, filesystems, hardware, etc.

- **Internal Interfacer** is responsible for interfacing between two subsystems. It may bundle together information of requests from a group of objects to be sent to another object. Abstract adapters or bridges are internal interfacers.

And in one of the following architectural layer:

- **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers.

- **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers.

- **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers.

- **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers.

Answer in well-formatted JSON {{ "roleStereotype": ..., "layer": ..., "description": "This {struct_type} is responsible for ..." }}'''


In [None]:
# Prompt for summarizing one package. Placeholders filled with `str.format`:
# `pkg_name` and `classes`. The model is asked for a JSON object with the
# keys `layer` and `description`; the doubled braces are literal braces.
package_prompt_template = '''Given a Java package `{pkg_name}` containing the following classes:

{classes}

Describe the purpose of this package in one sentence. Frame the package in one of the following architectural layer:

- **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers.

- **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers.

- **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers.

- **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers.

Answer in well-formatted JSON {{ "layer": ..., "description": "The purpose of this package is to ..." }}'''


In [None]:
def lower1(s):
    """Return `s` with its first character lower-cased; a falsy `s` is returned unchanged."""
    return s[0].lower() + s[1:] if s else s

In [None]:
def describe(node):
    """Render the summary properties of `node` as one line of text.

    Each property that is present in `node['properties']` contributes
    `**<key>**: <value>. `, in this fixed order: description, reason,
    howToUse, howItWorks, assertions, roleStereotype, layer. A node with none
    of them yields an empty string.
    """
    ordered_keys = ('description', 'reason', 'howToUse', 'howItWorks',
                    'assertions', 'roleStereotype', 'layer')
    return ''.join(
        f"**{name}**: {str(node['properties'][name])}. "
        for name in ordered_keys
        if name in node['properties']
    )

In [None]:
import os
from tqdm.notebook import tqdm

current_pkg = None
current_cls = None


class _StopRequested(Exception):
    """Raised to leave the nested loops once a file named `stop` exists."""


def _check_stop():
    """Abort the run when a file named `stop` exists in the working directory."""
    if os.path.exists('stop'):
        raise _StopRequested


def _log_block(log, indent, text):
    """Write `text` to `log` with every line prefixed by `indent`, followed by a blank line."""
    log.write(indent + text.replace('\n', '\n' + indent) + "\n")
    log.write("\n")


def _ask_llm(prompt, log, indent):
    """Send `prompt` as a single user message and return the answer as a dict.

    The request asks for a JSON object (`response_format`), at most 1024
    tokens, temperature 0. A failed request or an answer that is not a JSON
    object is reported in `log` and results in an empty dict, so one bad
    answer does not end the whole run. The parsed answer is logged as well.
    """
    response = None
    try:
        response = client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1024,
            temperature=0)
        summary = json.loads(response.choices[0].message.content)
    except Exception as error:
        # `Exception` instead of a bare `except`, so that interrupting the
        # kernel (KeyboardInterrupt) still stops the run.
        log.write(f"{indent}LLM request failed ({error!r}); response: {response}\n")
        summary = {}
    if not isinstance(summary, dict):
        summary = {}
    _log_block(log, indent, prettify_json(summary))
    return summary


def _summarize(node, prompt, log, indent):
    """Log `prompt` and, unless `only_print_prompt` is set, store the LLM answer in `node['properties']`.

    The first letter of every answer key is lower-cased before it is stored.
    """
    _log_block(log, indent, prompt)
    if only_print_prompt:
        return
    for key, value in _ask_llm(prompt, log, indent).items():
        node['properties'][lower1(key)] = value


with open(f'arcana-{timestr}.log', 'a', encoding="utf-8") as file:

    try:
        for pkg_id, pkg_data in tqdm(hierarchy.items(), desc="Processing packages", position=0):
            file.write('# ' + pkg_id + "\n")
            package = nodes[pkg_id]

            for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", position=1, leave=False):
                file.write('\t* ' + cls_id + "\n")
                clasz = nodes[cls_id]

                class_name = clasz['properties']['qualifiedName']
                class_kind = clasz['properties']['kind']
                if class_kind == 'enumeration':
                    class_kind = 'enum'
                elif class_kind == 'abstract':
                    class_kind = 'abstract class'

                # Methods first: the class prompt below embeds their summaries.
                for met_id in tqdm(cls_data, desc='Processing methods', position=2, leave=False):
                    method = nodes[met_id]

                    # Methods that already have a non-empty description are skipped.
                    if not method['properties'].get('description'):
                        file.write('\t\t- ' + met_id + "\n")

                        method_name = method['properties']['simpleName']
                        method_src = method['properties']['sourceText']

                        prompt = method_prompt_template.format(
                            op_name=method_name,
                            struct_kind=class_kind,
                            struct_name=class_name,
                            op_src=method_src)
                        _summarize(method, prompt, file, '\t\t\t')

                    _check_stop()

                # `.get(..., [])` tolerates graphs without these edge labels;
                # sorting keeps the prompt text identical between runs.
                ancestors = sorted({edge['target'] for edge in edges.get('specializes', [])
                                    if edge['source'] == cls_id})
                field_ids = sorted({edge['target'] for edge in edges.get('hasVariable', [])
                                    if edge['source'] == cls_id})
                fields = [remove_java_comments(nodes[field]['properties']['sourceText'])
                          for field in field_ids]

                # The "(none)" fallback belongs to the methods list only. It
                # used to sit outside `format(...)`, where it would have
                # replaced the entire prompt.
                methods_text = "\n".join(
                    f"- `{nodes[met_id]['properties']['simpleName']}`: {describe(nodes[met_id])}"
                    for met_id in cls_data) if cls_data else "(none)"

                prompt = class_prompt_template.format(
                    struct_type=class_kind,
                    struct_name=class_name,
                    ancestors="\n".join([f"- `{ancestor}`" for ancestor in ancestors]) if ancestors else "(none)",
                    fields="\n".join([f"- `{field}`" for field in fields]) if fields else "(none)",
                    methods=methods_text)
                _summarize(clasz, prompt, file, '\t\t')

                file.flush()
                _check_stop()

            prompt = package_prompt_template.format(
                pkg_name=package['properties']['qualifiedName'],
                classes="\n".join([f"- {nodes[cls_id]['properties']['kind']} `{nodes[cls_id]['properties']['qualifiedName']}`: {describe(nodes[cls_id])}"
                                   for cls_id in pkg_data])
            )
            _summarize(package, prompt, file, '\t')
            _check_stop()

    except _StopRequested:
        # A `stop` file ends the run early; summaries gathered so far are kept.
        pass

print(prettify_json(nodes))


In [None]:
# from statistics import median, quantiles

# num_classes_per_pkg = [len(pkg_desc['classes']) for _, pkg_desc in hierarchy.items()]
# num_methods_per_class =  [len(class_desc['methods']) for _, pkg_desc in hierarchy.items() for _, class_desc in pkg_desc['classes'].items()]
# len(hierarchy), sum(num_classes_per_pkg), min(num_classes_per_pkg), max(num_classes_per_pkg), median(num_classes_per_pkg), quantiles(num_classes_per_pkg, n=4), sum(
#     num_methods_per_class), min(num_methods_per_class), max(num_methods_per_class), median(num_methods_per_class), quantiles(num_methods_per_class, n=4)


## Which methods/classes/packages could not be summarized by the LLM?

In [None]:
# method_no_desc = [
#     (pkg_name, class_name, method_name)
#     for pkg_name, pkg_desc in hierarchy.items()
#     for class_name, class_desc in pkg_desc['classes'].items()
#     for method_name in class_desc['methods']
#     if class_desc['methods'][method_name]['description'] == "(no description)"
# ]

# print(prettify_json(method_no_desc))


In [None]:
# class_no_desc = [
#     (pkg_name, class_name)
#     for pkg_name, pkg_desc in hierarchy.items()
#     for class_name in pkg_desc['classes']
#     if pkg_desc['classes'][class_name]['description'] == "(no description)"
# ]

# print(prettify_json(class_no_desc))


In [None]:
# package_no_desc = [
#     pkg_name
#     for pkg_name in hierarchy
#     if hierarchy[pkg_name]['description'] == "(no description)"
# ]

# print(prettify_json(package_no_desc))


## Add the summaries back to the graph nodes

In [None]:
# Display the node dictionary, including any summaries stored in the node properties.
nodes

In [None]:
# Replace the node list of `graph` with the node data held in `nodes`,
# wrapping each entry in a `{'data': ...}` item again.
graph['elements']['nodes'] = [{'data':node_data} for node_data in nodes.values()]

In [None]:
# Save the graph as `<project_name>-with-summaries.json` in the working directory.
write_to_json_file(graph,f'{project_name}-with-summaries.json')