## Imports & helper functions

In [24]:
# %pip install --upgrade openai
# %pip install --upgrade tqdm
# %pip install --upgrade ipywidgets

In [25]:
import json
from tqdm.notebook import tqdm

def read_json_file(file_path):
    """Load the JSON document stored at `file_path` and return the decoded object."""
    with open(file_path, 'r') as handle:
        return json.load(handle)


def parse_json(json_string):
    """Decode a JSON-formatted string and return the resulting Python object."""
    return json.loads(json_string)


def prettify_json(obj):
    """Serialize `obj` to a JSON string indented by two spaces per level."""
    return json.dumps(obj, indent=2)


def write_to_json_file(obj, file_path):
    """Write `obj` to `file_path` as JSON indented by two spaces, replacing any existing file."""
    serialized = json.dumps(obj, indent=2)
    with open(file_path, 'w') as sink:
        sink.write(serialized)


In [26]:
import configparser

def read_ini_file(file_path):
    """Parse the INI file at `file_path` into a plain `{section: {option: value}}` dict."""
    parser = configparser.ConfigParser()
    parser.read(file_path)

    ini_dict = {}
    for section_name in parser.sections():
        ini_dict[section_name] = dict(parser.items(section_name))
    return ini_dict


In [27]:
from openai import OpenAI

In [28]:
import re


def remove_java_comments(java_source):
    """Strip `//` and `/* ... */` comments from Java source text.

    Comment markers that appear inside string literals, character literals
    or text blocks (for example the `//` in `"http://host"`) are left alone.

    Args:
        java_source: Java source code as a string.

    Returns:
        The source without comments, with leading and trailing whitespace
        removed.
    """
    # Literals are listed first and captured in group 1, so the scanner
    # consumes them whole and never sees a comment marker inside them.
    token = re.compile(
        r'("""[\s\S]*?"""'           # text block
        r'|"(?:\\.|[^"\\\n])*"'      # string literal
        r"|'(?:\\.|[^'\\\n])*')"     # character literal
        r'|//[^\n]*'                 # single-line comment
        r'|/\*[\s\S]*?\*/'           # multi-line comment
    )

    # Keep literals (group 1); replace comments (no group) with nothing.
    java_source_without_comments = token.sub(
        lambda match: match.group(1) or "", java_source)

    return java_source_without_comments.strip()


In [29]:
def sentence(s):
    '''
    Capitalize the first letter of a string `s` and ensure that the string
    ends with a period (unless it already ends with sentence-ending
    punctuation).

    Surrounding whitespace is stripped first. An empty or whitespace-only
    input yields an empty string instead of raising an IndexError.
    '''
    t = s.strip()
    if not t:
        # Nothing to capitalize or terminate.
        return t

    capitalized = f'{t[0].upper()}{t[1:]}'
    if t[-1] in '.?!…~–—':
        return capitalized
    return f'{capitalized}.'

In [30]:
def transform_graph(graph):
    """Split a graph document into a node lookup and edges grouped by label.

    Returns a `(nodes, edges)` tuple: `nodes` maps each node id to its `data`
    dict, and `edges` maps each label to the list of edge `data` dicts that
    carry it. An edge without a `label` receives one (written into its
    `data` dict) made of its `labels` joined by commas.
    """
    nodes = {}
    for node in graph['elements']['nodes']:
        nodes[node['data']['id']] = node['data']

    edges = {}
    for edge in graph['elements']['edges']:
        edge_data = edge['data']
        if 'label' not in edge_data:
            # Derive the missing label and store it on the edge itself.
            edge_data['label'] = ','.join(edge_data['labels'])
        edges.setdefault(edge_data['label'], []).append(edge_data)

    return (nodes, edges)
	

In [31]:
def invert(edgeList):
    """Return new edges with `source` and `target` swapped and the label prefixed with `inv_`.

    All other keys are copied unchanged; the input edges are not modified.
    An edge without a `label` gets the label `inv_`.
    """
    prefix = "inv_"
    swapped_keys = ('source', 'target', 'label')

    def flip(edge):
        flipped = dict(
            source=edge['target'],
            target=edge['source'],
            label=prefix + edge.get('label', ''),
        )
        for key, value in edge.items():
            if key not in swapped_keys:
                flipped[key] = value
        return flipped

    return [flip(edge) for edge in edgeList]

def find_paths(edgeList1, edgeList2):
    """Join two edge lists into two-step paths.

    For every edge in `edgeList2` whose source is the target of some edge in
    `edgeList1`, yield the triple `(first source, middle node, final target)`.
    When several edges in `edgeList1` share a target, the last one wins.
    Returns a set of tuples.
    """
    # target -> source of the first hop; later edges overwrite earlier ones.
    predecessor = {edge['target']: edge['source'] for edge in edgeList1}

    return {
        (predecessor[edge['source']], edge['source'], edge['target'])
        for edge in edgeList2
        if edge['source'] in predecessor
    }

## Parameters

In [32]:
# If True: do not call the API; the summarization cells only write the prompts to the log file.
only_print_prompt = False

In [33]:
# Load the settings from `config.ini`; the [project] section supplies `name` and `ifile`.
config = read_ini_file('config.ini')
project_name = config['project']['name']
# NOTE: a bare expression that is not the last line of a cell is not displayed.
project_name
# Path of the input graph JSON file, loaded in the next section.
ifile = config['project']['ifile']

## Read graph file

Load the knowledge graph that was extracted using javapers.

In [34]:
# `graph` keeps the document as loaded; it is written back out at the end of the notebook.
graph = read_json_file(ifile)
# The file is read a second time, so `nodes`/`edges` come from a separate copy
# (transform_graph adds a `label` key to edges that only have `labels`).
# NOTE(review): presumably intentional, to leave `graph`'s own edges untouched — confirm.
nodes,edges = transform_graph(read_json_file(ifile))

## Connect to OpenAI

In [35]:
# Translate the optional [openai] settings into keyword arguments for the client.
_openai_cfg = config['openai']
_ini_to_client_arg = (('apikey', 'api_key'), ('apibase', 'base_url'))

cliet_args = {
    client_arg: _openai_cfg[ini_key]
    for ini_key, client_arg in _ini_to_client_arg
    if ini_key in _openai_cfg
}

# Fall back to a default model name when the config does not provide one.
model = _openai_cfg.get('model', "gpt-3.5-turbo")

(list(cliet_args.keys()), model)

(['api_key', 'base_url'], 'llama3')

In [36]:
# Create the API client from the collected keyword arguments.
client = OpenAI(**cliet_args)
# Display the endpoint the client will talk to.
client.base_url

URL('http://145.38.188.85/v1/')

In [37]:
# Smoke-test the LLM server: request one chat completion for a trivial prompt,
# using the configured `model` and temperature 0.
completion = client.chat.completions.create(
    model=model,
    messages=[{"role":"user","content":"Hello, world!"}],
    temperature=0
)
# Print the text of the first returned choice.
print(completion.choices[0].message.content)

Hello there! It's great to meet you. I'm your friendly AI assistant, here to help with any questions or topics you'd like to discuss. What brings you to this corner of the internet today?


## Elements to be inspected

In [38]:
# Sorted triples joining a `contains` edge with a `hasScript` edge;
# later cells unpack each triple as (package id, class id, method id).
methods = sorted(find_paths(edges['contains'], edges['hasScript']))
len(methods)

4653

In [39]:
# Distinct (package id, class id) pairs taken from the method triples, sorted.
classes = sorted({(pkg,clz) for pkg,clz,_ in methods})
len(classes)

604

In [40]:
# Distinct package ids taken from the class pairs, sorted.
packages = sorted({pkg for pkg,_ in classes})
len(packages),packages

(38,
 ['com.fsck.k9',
  'com.fsck.k9.account',
  'com.fsck.k9.activity',
  'com.fsck.k9.activity.compose',
  'com.fsck.k9.activity.loader',
  'com.fsck.k9.activity.misc',
  'com.fsck.k9.activity.setup',
  'com.fsck.k9.autocrypt',
  'com.fsck.k9.cache',
  'com.fsck.k9.controller',
  'com.fsck.k9.crypto',
  'com.fsck.k9.fragment',
  'com.fsck.k9.helper',
  'com.fsck.k9.helper.jsoup',
  'com.fsck.k9.mailstore',
  'com.fsck.k9.mailstore.migrations',
  'com.fsck.k9.mailstore.util',
  'com.fsck.k9.message',
  'com.fsck.k9.message.extractors',
  'com.fsck.k9.message.html',
  'com.fsck.k9.message.quote',
  'com.fsck.k9.message.signature',
  'com.fsck.k9.notification',
  'com.fsck.k9.power',
  'com.fsck.k9.preferences',
  'com.fsck.k9.provider',
  'com.fsck.k9.remotecontrol',
  'com.fsck.k9.search',
  'com.fsck.k9.service',
  'com.fsck.k9.setup',
  'com.fsck.k9.ui',
  'com.fsck.k9.ui.compose',
  'com.fsck.k9.ui.crypto',
  'com.fsck.k9.ui.dialog',
  'com.fsck.k9.ui.message',
  'com.fsck.k9.ui.me

In [41]:
def _index_methods_by_class(method_paths):
    """Group method ids by class id, keeping the order of `method_paths`."""
    index = {}
    for _pkg_id, class_id, method_id in method_paths:
        index.setdefault(class_id, []).append(method_id)
    return index


# Index the methods once instead of rescanning the whole method list for
# every class; the resulting structure is the same:
#   { package id: { class id: [method id, ...] } }
_methods_by_class = _index_methods_by_class(methods)

hierarchy = {
    pkg_id: {
        cls_id: list(_methods_by_class.get(cls_id, []))
        for p, cls_id in classes if p == pkg_id
    } for pkg_id in packages
}

hierarchy

{'com.fsck.k9': {'com.fsck.k9.Account': ['com.fsck.k9.Account.addCertificate(com.fsck.k9.activity.setup.AccountSetupCheckSettings$CheckDirection,java.security.cert.X509Certificate)',
   'com.fsck.k9.Account.allowRemoteSearch()',
   'com.fsck.k9.Account.cacheChips()',
   'com.fsck.k9.Account.com.fsck.k9.Account(android.content.Context)',
   'com.fsck.k9.Account.com.fsck.k9.Account(com.fsck.k9.Preferences,java.lang.String)',
   'com.fsck.k9.Account.delete(com.fsck.k9.Preferences)',
   'com.fsck.k9.Account.deleteCertificate(java.lang.String,int,com.fsck.k9.activity.setup.AccountSetupCheckSettings$CheckDirection)',
   'com.fsck.k9.Account.deleteCertificates()',
   'com.fsck.k9.Account.deleteIdentities(com.fsck.k9.preferences.Storage,com.fsck.k9.preferences.StorageEditor)',
   'com.fsck.k9.Account.equals(java.lang.Object)',
   'com.fsck.k9.Account.excludeSpecialFolder(com.fsck.k9.search.LocalSearch,java.lang.String)',
   'com.fsck.k9.Account.excludeSpecialFolders(com.fsck.k9.search.LocalSea

In [42]:
import time

# Timestamp of this run (local time, YYYYMMDD-HHMMSS); used in the log file names below.
timestr = time.strftime("%Y%m%d-%H%M%S")
timestr

'20240606-220730'

## Ask LLM to summarize methods, classes, and packages

In [43]:
# Prompt for summarizing one method. Placeholders: {op_name}, {struct_kind},
# {struct_name}, {op_src}. Doubled braces are literal braces after `.format()`.
# Fixes: `postConditions` is now described as post-conditions (it used to say
# "pre-conditions"), and the `howToUse` / `howItWorks` entries end with a comma
# like the other entries of the requested object.
method_prompt_template = '''This is method `{op_name}` of {struct_kind} `{struct_name}`:

```java
{op_src}
```

Explain the above method on the following aspects:

{{ what: "Describe the functionality of the method in one sentence.",
  parameters: [ {{ name:..., type:..., description:... }}, ... ],
  returns: {{ type:..., description: ... }}, // In case of a constructor, consider the constructed class as the return type.
  why: "Explain, in one sentence, the reason why the method is provided or the design rationale of the method.",
  howToUse: "Describe the usage or the expected set-up of using the method in less than 3 sentences.",
  howItWorks: "Describe the implementation details of the method in less than 5 sentences.",
  assertions: {{ preConditions: ["pre-conditions of the method", ...], postConditions: ["post-conditions of the method", ...] }}
}}

Respond with a well-formatted JSON object. Do not use any quote marks ("'`) within the JSON values.'''


In [44]:
# Prompt for summarizing one class. Placeholders: {struct_type}, {struct_name},
# {fields}, {methods}. Doubled braces are literal braces after `.format()`.
class_prompt_template = '''A Java {struct_type} `{struct_name}` contains the following field(s) and method(s):

Fields:

{fields}

Methods:

{methods}

Describe the responsibilities of this {struct_type}. Frame the class in one of the following role stereotypes:

- **Information Holder** is responsible for knowing facts and maintaining consistency of its information.

- **Service Provider** is responsible for handling requests and performing specific services.

- **Structurer** is responsible for managing connections and constraints among related things.

- **Controller** is responsible for making decisions, directing the work of others, and handling important events.

- **Coordinator** is responsible for managing the actions of a group of workers and facilitating communication and work of other objects.

- **User Interfacer** is responsible for transmitting user requests for action or display information that can be updated.

- **External Interfacer** is responsible for handling faulty conditions in other systems they interface to, relieving their clients of having to know about lower-level details and recovery strategies.

- **Internal Interfacer** is responsible for delegating external requests to objects in its neighborhood.

And in one of the following architectural layer:

- **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers.

- **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers.

- **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers.

- **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers.

Answer in well-formatted JSON {{ "roleStereotype": ..., "layer": ..., "responsibility": ... }}'''


In [45]:
# Prompt for summarizing one package. Placeholders: {pkg_name}, {classes}.
# Doubled braces are literal braces after `.format()`.
package_prompt_template = '''Given a Java package `{pkg_name}` containing the following classes:

{classes}

What is the purpose of this package? Frame the package in one of the following architectural layer:

- **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers.

- **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers.

- **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers.

- **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers.

Answer in well-formatted JSON {{ "layer": ..., "purpose": ... }}'''


In [46]:
import os

# Kept so that any later cell relying on these notebook-level names still finds them.
current_pkg = None
current_cls = None


class _StopRequested(Exception):
    """Raised when a file named `stop` exists, to end the run early."""


def _prompt_kind(kind):
    """Map the graph's `kind` value to the wording used in the prompts."""
    return {'enumeration': 'enum', 'abstract': 'abstract class'}.get(kind, kind)


def _ask_llm(prompt, log):
    """Send `prompt` to the chat model and return `(raw_text, parsed_json)`.

    A failed request is recorded in `log` and treated as an empty JSON
    object, so one bad call does not abort the whole run. A reply that is
    not valid JSON is returned as raw text together with an empty dict.
    Only `Exception` is caught, so Ctrl-C / kernel interrupt still works.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1024,
            temperature=0)
        raw = response.choices[0].message.content or '{}'
    except Exception as error:
        log.write(f'[LLM request failed: {error!r}]\n')
        raw = '{}'
    try:
        parsed = json.loads(raw)
    except ValueError:
        parsed = dict()
    return raw, parsed


# Index field ids per class once instead of rescanning every edge for every class.
_fields_by_class = {}
for _edge in edges.get('hasVariable', []):
    _fields_by_class.setdefault(_edge['source'], set()).add(_edge['target'])

with open(f'arcana-{timestr}.log', 'a', encoding='utf-8') as file:
    try:
        for pkg_id, pkg_data in tqdm(hierarchy.items(), desc="Processing packages"):
            file.write('# ' + pkg_id + "\n")
            package = nodes[pkg_id]

            for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", leave=True):
                file.write('\t* ' + cls_id + "\n")
                clasz = nodes[cls_id]

                class_name = clasz['properties']['qualifiedName']
                class_kind = _prompt_kind(clasz['properties']['kind'])

                # --- methods -------------------------------------------------
                for met_id in tqdm(cls_data, desc='Processing methods', leave=True):

                    # Creating a file named `stop` in the working directory ends the run.
                    if os.path.exists('stop'):
                        raise _StopRequested

                    method = nodes[met_id]
                    if method['properties'].get('description'):
                        continue  # already summarized

                    file.write('\t\t- ' + met_id + "\n")

                    prompt = method_prompt_template.format(
                        op_name=method['properties']['simpleName'],
                        struct_kind=class_kind,
                        struct_name=class_name,
                        op_src=remove_java_comments(method['properties']['sourceText']))

                    if only_print_prompt:
                        file.write(prompt + "\n")
                        file.write("\n")
                    else:
                        description, parsed = _ask_llm(prompt, file)
                        file.write('\t\t\t ' + description.replace('\n', '\n\t\t\t') + "\n")
                        method['properties']['description'] = parsed
                file.flush()

                # --- class ---------------------------------------------------
                # Sorted so the prompt does not depend on set iteration order.
                fields = [remove_java_comments(nodes[field_id]['properties']['sourceText'])
                          for field_id in sorted(_fields_by_class.get(cls_id, ()))]
                method_lines = [
                    f"- `{nodes[met_id]['properties']['simpleName']}`: {str(nodes[met_id]['properties'].get('description', '(no description)'))}"
                    for met_id in cls_data]

                # The "(no ...)" fallbacks apply to the individual arguments,
                # not to the prompt as a whole.
                prompt = class_prompt_template.format(
                    struct_type=class_kind,
                    struct_name=class_name,
                    fields="\n".join(f"- `{field}`" for field in fields) if fields else "(no fields)",
                    methods="\n".join(method_lines) if method_lines else "(no methods)")

                file.write(prompt + "\n")
                file.write("\n")
                if not only_print_prompt:
                    description, parsed = _ask_llm(prompt, file)
                    file.write('\t\t ' + description + "\n")
                    nodes[cls_id]['properties']['description'] = parsed

            # --- package -----------------------------------------------------
            class_lines = [
                f"- {nodes[cls_id]['properties']['kind']} `{nodes[cls_id]['properties']['qualifiedName']}`: {str(nodes[cls_id]['properties'].get('description', '(no description)'))}"
                for cls_id in pkg_data]
            prompt = package_prompt_template.format(
                pkg_name=package['properties']['qualifiedName'],
                classes="\n".join(class_lines))

            file.write(prompt + "\n")
            file.write("\n")
            if not only_print_prompt:
                description, parsed = _ask_llm(prompt, file)
                file.write('\t' + description + "\n")
                nodes[pkg_id]['properties']['description'] = parsed
    except _StopRequested:
        # Stopping on request is a normal way to end; summaries collected so far stay in `nodes`.
        pass

print(prettify_json(hierarchy))


Processing packages:   0%|          | 0/38 [00:00<?, ?it/s]

Processing classes:   0%|          | 0/35 [00:00<?, ?it/s]

Processing methods:   0%|          | 0/173 [00:00<?, ?it/s]

In [None]:
# from statistics import median, quantiles

# num_classes_per_pkg = [len(pkg_desc['classes']) for _, pkg_desc in hierarchy.items()]
# num_methods_per_class =  [len(class_desc['methods']) for _, pkg_desc in hierarchy.items() for _, class_desc in pkg_desc['classes'].items()]
# len(hierarchy), sum(num_classes_per_pkg), min(num_classes_per_pkg), max(num_classes_per_pkg), median(num_classes_per_pkg), quantiles(num_classes_per_pkg, n=4), sum(
#     num_methods_per_class), min(num_methods_per_class), max(num_methods_per_class), median(num_methods_per_class), quantiles(num_methods_per_class, n=4)


In [None]:
# if not only_print_prompt:
#   write_to_json_file(hierarchy, f"{project_name}-goals_1-{model}.json")

In [None]:
# goals = read_json_file(f"{project_name}-goals_1-{model}.json")
# goals

## Ask LLM to summarize classes (based on methods)

In [None]:

current_pkg = None

# Index field ids per class once instead of rescanning every edge for every class.
_class_fields = {}
for _edge in edges.get('hasVariable', []):
    _class_fields.setdefault(_edge['source'], set()).add(_edge['target'])

with open(f'summarizer-{timestr}.log', 'a', encoding='utf-8') as file:
    for pkg_id, cls_id in tqdm(classes, desc='Processing classes'):
        clasz = nodes[cls_id]

        # Only classes that still lack a (non-empty) description are processed.
        if clasz['properties'].get('description'):
            continue

        if current_pkg != pkg_id:
            file.flush()
            current_pkg = pkg_id
            file.write('# ' + current_pkg + "\n")

        file.write('\t* ' + cls_id + "\n")

        class_name = clasz['properties']['qualifiedName']
        class_kind = clasz['properties']['kind']
        if class_kind == 'enumeration':
            class_kind = 'enum'
        elif class_kind == 'abstract':
            class_kind = 'abstract class'

        # Sorted so the prompt does not depend on set iteration order.
        fields = [remove_java_comments(nodes[field_id]['properties']['sourceText'])
                  for field_id in sorted(_class_fields.get(cls_id, ()))]

        # `hierarchy[pkg_id][cls_id]` is a list of method ids; the method
        # descriptions themselves are stored on the graph nodes.
        method_lines = [
            f"- `{nodes[met_id]['properties']['simpleName']}`: {str(nodes[met_id]['properties'].get('description', '(no description)'))}"
            for met_id in hierarchy[pkg_id][cls_id]]

        prompt = class_prompt_template.format(
            struct_type=class_kind,
            struct_name=class_name,
            fields="\n".join(f"- `{field}`" for field in fields) if fields else "(no fields)",
            methods="\n".join(method_lines) if method_lines else "(no methods)")

        if only_print_prompt:
            file.write(prompt + "\n")
            file.write("\n")
        else:
            try:
                response = client.chat.completions.create(
                    model=model,
                    response_format={"type": "json_object"},
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=1024,
                    temperature=0)
                description = response.choices[0].message.content or "{}"
            except Exception as error:
                # Record the failure and continue with an empty description.
                description = "{}"
                file.write(f'[LLM request failed: {error!r}]\n')
            file.write('\t\t ' + description + "\n")
            try:
                nodes[cls_id]['properties']['description'] = json.loads(description)
            except ValueError:
                # The reply was not valid JSON; store an empty description.
                nodes[cls_id]['properties']['description'] = dict()

print(prettify_json(hierarchy))


In [None]:
# if not only_print_prompt:
#   write_to_json_file(hierarchy, f"{project_name}-goals_2-{model}.json")


In [None]:
# goals = read_json_file(f"{project_name}-goals_2-{model}.json")
# goals.keys()

## Ask LLM to summarize packages (based on classes)

In [None]:

nl = '\n'

with open(f'summarizer-{timestr}.log', 'a', encoding='utf-8') as file:
    for pkg_id in tqdm(packages, desc='Processing packages'):

        # Only packages that still lack a (non-empty) description are processed.
        if nodes[pkg_id]['properties'].get('description'):
            continue

        file.write('# ' + pkg_id + "\n")

        # `hierarchy[pkg_id]` maps class ids to method-id lists; class
        # names, kinds and descriptions are stored on the graph nodes.
        package = hierarchy[pkg_id]
        package_name = nodes[pkg_id]['properties']['qualifiedName']

        if not package:
            continue  # nothing to summarize for a package without classes

        class_lines = [
            f"- {nodes[cls_id]['properties']['kind']} `{nodes[cls_id]['properties']['qualifiedName']}`: {str(nodes[cls_id]['properties'].get('description', '(no description)'))}"
            for cls_id in package]
        prompt = package_prompt_template.format(
            pkg_name=package_name,
            classes="\n".join(class_lines))

        if only_print_prompt:
            file.write(prompt + "\n")
        else:
            try:
                response = client.chat.completions.create(
                    model=model,
                    response_format={"type": "json_object"},
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=1024,
                    temperature=0)
                description = response.choices[0].message.content or '{}'
            except Exception as error:
                # Record the failure and continue with an empty description.
                description = '{}'
                file.write(f'[LLM request failed: {error!r}]\n')
            file.write('\t' + description + "\n")
            try:
                nodes[pkg_id]['properties']['description'] = json.loads(description)
            except ValueError:
                # The reply was not valid JSON; store an empty description.
                nodes[pkg_id]['properties']['description'] = dict()

print(prettify_json(hierarchy))


In [None]:
# if not only_print_prompt:
#   write_to_json_file(hierarchy, f"{project_name}-goals_3-{model}.json")


In [None]:
# hierarchy = read_json_file(f"{project_name}-goals_3-{model}.json")
# hierarchy.keys()

## Which methods/classes/packages could not be summarized by the LLM?

In [None]:
# method_no_desc = [
#     (pkg_name, class_name, method_name)
#     for pkg_name, pkg_desc in hierarchy.items()
#     for class_name, class_desc in pkg_desc['classes'].items()
#     for method_name in class_desc['methods']
#     if class_desc['methods'][method_name]['description'] == "(no description)"
# ]

# print(prettify_json(method_no_desc))


In [None]:
# class_no_desc = [
#     (pkg_name, class_name)
#     for pkg_name, pkg_desc in hierarchy.items()
#     for class_name in pkg_desc['classes']
#     if pkg_desc['classes'][class_name]['description'] == "(no description)"
# ]

# print(prettify_json(class_no_desc))


In [None]:
# package_no_desc = [
#     pkg_name
#     for pkg_name in hierarchy
#     if hierarchy[pkg_name]['description'] == "(no description)"
# ]

# print(prettify_json(package_no_desc))


## Add the summaries back to the graph nodes

In [None]:
# for pkg_id,package in hierarchy.items():
# 	nodes[pkg_id]['properties']['description'] = package['description']
	
# 	for cls_id,clasz in package['classes'].items():
# 		nodes[cls_id]['properties']['description'] = clasz['description']

# 		for met_id,method in clasz['methods'].items():
# 			nodes[met_id]['properties']['description'] = method['description']

In [None]:
# Display the node lookup, including any `description` values stored by the cells above.
nodes

In [None]:
# Replace the graph's node list with the node data held in `nodes`; the graph's edges are left as loaded.
graph['elements']['nodes'] = [{'data':node_data} for node_data in nodes.values()]

In [None]:
# Save the updated graph next to the notebook as `<project_name>-with-summaries.json`.
write_to_json_file(graph,f'{project_name}-with-summaries.json')