## Imports & helper functions

In [None]:
%pip install --upgrade openai
%pip install --upgrade tqdm

In [None]:
import json
from tqdm import tqdm

def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's default encoding, so non-ASCII content is read the same way
    on every machine.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def parse_json(json_string):
    """Decode the JSON text ``json_string`` into the matching Python object."""
    return json.loads(json_string)


def prettify_json(obj):
    """Serialize ``obj`` to a JSON string indented by two spaces."""
    return json.dumps(obj, indent=2)


def write_to_json_file(obj, file_path):
    """Write ``obj`` as two-space-indented JSON to ``file_path``.

    An existing file at that path is overwritten.
    """
    with open(file_path, 'w') as target:
        json.dump(obj, fp=target, indent=2)


In [None]:
import configparser

def read_ini_file(file_path):
    """Read an INI file into a plain ``{section: {option: value}}`` dict.

    Parsing is done by ``configparser.ConfigParser``; each section's options
    are copied into an ordinary dictionary.
    """
    parser = configparser.ConfigParser()
    parser.read(file_path)

    sections = {}
    for name in parser.sections():
        sections[name] = dict(parser.items(name))
    return sections


In [None]:
from openai import OpenAI

In [None]:
import re


# One alternation that recognises literals *and* comments. Literals are listed
# first so that comment markers occurring inside them (e.g. "http://host") are
# consumed as part of the literal instead of being mistaken for a comment.
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r'"""(?:\\[\s\S]|[^\\])*?"""'   # text block
    r'|"(?:\\.|[^"\\\n])*"'         # string literal
    r"|'(?:\\.|[^'\\\n])*'"         # character literal
    r'|//[^\n]*'                    # single-line comment (up to end of line)
    r'|/\*[\s\S]*?\*/'              # multi-line comment
)


def remove_java_comments(java_source):
    """Return ``java_source`` with ``//`` and ``/* */`` comments removed.

    String, character and text-block literals are left untouched, even when
    they contain ``//`` or ``/*``. Leading and trailing whitespace of the
    result is stripped.
    """
    def _drop_comment(match):
        token = match.group(0)
        # Only comment tokens start with a slash; literals start with a quote.
        return '' if token.startswith('/') else token

    return _JAVA_LITERAL_OR_COMMENT.sub(_drop_comment, java_source).strip()


In [None]:
def sentence(s):
    '''
    Capitalize the first letter of a string `s` and ensure that the string
    ends with a period (unless it already ends with sentence-ending
    punctuation). Surrounding whitespace is stripped first.

    An empty or whitespace-only input yields an empty string.
    '''
    t = s.strip()
    if not t:
        # Nothing to capitalize or punctuate; indexing t[0] would fail.
        return ''
    capitalized = f'{t[0].upper()}{t[1:]}'
    if t[-1] in '.?!…~–—':
        return capitalized
    return f'{capitalized}.'

In [None]:
def transform_graph(graph):
    """Index a graph's nodes by id and group its edges by label.

    Reads ``graph['elements']['nodes']`` and ``graph['elements']['edges']``.
    An edge whose data has no ``'label'`` gets one, built by joining its
    ``'labels'`` entries with commas; that label is written back into the
    edge data in place.

    Returns a ``(nodes, edges)`` tuple: ``nodes`` maps node id to node data,
    ``edges`` maps label to the list of edge data carrying that label.
    """
    elements = graph['elements']

    nodes = {}
    for node in elements['nodes']:
        data = node['data']
        nodes[data['id']] = data

    edges = {}
    for edge in elements['edges']:
        data = edge['data']
        if 'label' not in data:
            data['label'] = ','.join(data['labels'])
        edges.setdefault(data['label'], []).append(data)

    return (nodes, edges)
	

In [None]:
def invert(edgeList):
    """Return reversed copies of the edges in ``edgeList``.

    Each result swaps ``'source'`` and ``'target'``, prefixes the label with
    ``inv_`` (a missing label counts as the empty string), and carries over
    every other key unchanged. The input edges are not modified.
    """
    replaced_keys = ('source', 'target', 'label')
    flipped_edges = []
    for edge in edgeList:
        flipped = {
            'source': edge['target'],
            'target': edge['source'],
            'label': "inv_" + edge.get('label', ''),
        }
        flipped.update(
            (key, value) for key, value in edge.items()
            if key not in replaced_keys
        )
        flipped_edges.append(flipped)
    return flipped_edges

def find_paths(edgeList1, edgeList2):
    """Join two edge lists into two-step paths.

    For every edge ``b -> c`` in ``edgeList2`` whose source ``b`` is the
    target of an edge ``a -> b`` in ``edgeList1``, the tuple ``(a, b, c)`` is
    produced. When several edges in ``edgeList1`` share a target, the last
    one listed supplies ``a``.

    Returns the set of such tuples.
    """
    predecessor = {edge['target']: edge['source'] for edge in edgeList1}
    return {
        (predecessor[edge['source']], edge['source'], edge['target'])
        for edge in edgeList2
        if edge['source'] in predecessor
    }

## Parameters

In [None]:
# If True: do not call the API, just print the prompts.
# The summarization loops below check this flag before each request, and the
# intermediate result files are only written when it is False.
only_print_prompt = False

In [None]:
# Load settings from config.ini in the current working directory.
config = read_ini_file('config.ini')
# [project] name: used as the prefix of every output file name below.
project_name = config['project']['name']
project_name
# [project] ifile: path of the input graph JSON file.
ifile = config['project']['ifile']

## Read graph file

Load the knowledge graph that was extracted using javapers.

In [None]:
# `graph` keeps the file contents as loaded; it is written back out at the end
# of the notebook with the summarized nodes substituted in.
graph = read_json_file(ifile)
# The file is read a second time so that transform_graph works on an
# independent copy: it adds a 'label' key to edges in place, and that change
# therefore does not reach `graph`.
nodes,edges = transform_graph(read_json_file(ifile))

## Connect to OpenAI

In [None]:
# Collect the optional [openai] settings from config.ini.
# A missing [openai] section is treated like an empty one, so the client then
# falls back to its own defaults (e.g. credentials from the environment).
openai_config = config.get('openai', {})

# Keyword arguments for the OpenAI client constructor (name kept as-is because
# the next cell refers to it).
cliet_args = dict()

if 'apikey' in openai_config:
    cliet_args['api_key'] = openai_config['apikey']
if 'apibase' in openai_config:
    cliet_args['base_url'] = openai_config['apibase']

# Model used for every request; defaults to gpt-3.5-turbo when not configured.
model = openai_config.get('model', "gpt-3.5-turbo")

# Show which client options were set (keys only, so the API key is not echoed).
(list(cliet_args.keys()), model)

In [None]:
# Create the API client from the options gathered from config.ini.
client = OpenAI(**cliet_args)
# Display the endpoint the client will talk to.
client.base_url

In [None]:
# Test the LLM server by requesting one trivial completion.
# Skipped in print-only mode, where the notebook must not call the API.
if not only_print_prompt:
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role":"user","content":"What is your name?"}],
        temperature=0
    )
    # print the completion
    print(completion.choices[0].message.content)

## Elements to be inspected

In [None]:
# Every (package id, class id, method id) triple obtained by following a
# 'contains' edge and then a 'hasScript' edge, in sorted order.
# NOTE(review): reading the first hop as package -> class is taken from how the
# triples are unpacked in later cells; confirm against the graph schema.
methods = sorted(find_paths(edges['contains'], edges['hasScript']))
len(methods)

In [None]:
# Distinct (package id, class id) pairs that own at least one method, sorted.
classes = sorted({(pkg,clz) for pkg,clz,_ in methods})
len(classes)

In [None]:
# Distinct package ids appearing in `classes`, sorted.
packages = sorted({pkg for pkg,_ in classes})
len(packages),packages

In [None]:
# Build the nested result structure that the summaries are stored in:
#   goals[pkg_id]['classes'][cls_id]['methods'][met_id]
# `methods` is sorted, so one pass over it inserts packages, classes and
# methods in sorted order without rescanning the lists for every entry.
goals = {}
for pkg_id, cls_id, met_id in methods:
    if pkg_id not in goals:
        goals[pkg_id] = {
            'qualifiedName': nodes[pkg_id]['properties']['qualifiedName'],
            'classes': {},
        }
    pkg_classes = goals[pkg_id]['classes']

    if cls_id not in pkg_classes:
        pkg_classes[cls_id] = {
            'qualifiedName': nodes[cls_id]['properties']['qualifiedName'],
            'kind': nodes[cls_id]['properties']['kind'],
            'methods': {},
        }

    pkg_classes[cls_id]['methods'][met_id] = {
        'qualifiedName': nodes[met_id]['properties']['qualifiedName']
    }

# goals

## Ask LLM to summarize methods

In [None]:
# Prompt sent once per method; the placeholders are filled with str.format.
method_prompt_template = '''This is method `{op_name}` of {struct_kind} `{struct_name}`:

```java
{op_src}
```

Explain the above method on the following aspects:

* What: Describe the functionality of the method in one sentence.

* Parameters: A list of parameter names, types, and descriptions.

* Returns: The return type and description. In case of a constructor, consider the constructed class as the return type.

* Why: Explain, in one sentence, the reason why the method is provided or the design rationale of the method.

* How-to-use: Describe the usage or the expected set-up of using the method in less than 3 sentences.

* How-it-is-done: Describe the implementation details of the method in less than 5 sentences.

* Property: Assert properties of the method including (list of) pre-conditions and/or (list of) post-conditions of the method.

Respond with a well-formatted JSON object. Do not use any quote marks ("'`) within the JSON values.
'''

# Track the package/class printed last so each header is shown only once.
current_pkg = None
current_cls = None

for pkg_id, cls_id, met_id in methods:

    if current_pkg != pkg_id:
        current_pkg = pkg_id
        print('#', current_pkg)

    if current_cls != cls_id:
        current_cls = cls_id
        print('\t*', current_cls)

    print('\t\t-', met_id)

    clasz = nodes[cls_id]
    method = nodes[met_id]

    class_name = clasz['properties']['qualifiedName']
    class_kind = clasz['properties']['kind']
    # Map the graph's kind values to the wording used in the prompt.
    if class_kind == 'enumeration':
        class_kind = 'enum'
    elif class_kind == 'abstract':
        class_kind = 'abstract class'

    method_name = method['properties']['simpleName']
    method_src = method['properties']['sourceText']

    prompt = method_prompt_template.format(
        op_name=method_name,
        struct_kind=class_kind,
        struct_name=class_name,
        op_src=remove_java_comments(method_src))

    if only_print_prompt:
        print(prompt)
        print()
        continue

    response = None
    try:
        response = client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},
            messages=[
                {"role": "user", "content": prompt},
            ],
            max_tokens=1024,
            temperature=0)
        # The content can be None (e.g. a refusal); treat that as "no answer".
        description = response.choices[0].message.content or '{}'
    except Exception as error:
        # Keep going with the remaining methods; Ctrl-C still interrupts
        # because KeyboardInterrupt is not an Exception subclass.
        description = '{}'
        print(f'\t\t\t! request failed: {type(error).__name__}: {error}')
        print(response)

    print('\t\t\t', description.replace('\n', '\n\t\t\t'))

    try:
        parsed_description = json.loads(description)
    except json.JSONDecodeError as error:
        # A malformed or truncated reply must not abort the whole run and
        # throw away the summaries gathered so far.
        print(f'\t\t\t! invalid JSON in response: {error}')
        parsed_description = {}

    goals[pkg_id]['classes'][cls_id]['methods'][met_id]['description'] = parsed_description

print(prettify_json(goals))


In [None]:
from statistics import median, quantiles


def _distribution(values):
    """Return ``(sum, min, max, median, quartiles)`` for a list of numbers.

    ``statistics.quantiles`` can reject inputs with fewer than two data
    points, and ``min``/``max``/``median`` reject empty input, so short lists
    are handled here: a single value is its own quartile three times over,
    and an empty list gives ``(0, None, None, None, [])``.
    """
    if not values:
        return 0, None, None, None, []
    if len(values) == 1:
        quartiles = [values[0]] * 3
    else:
        quartiles = quantiles(values, n=4)
    return sum(values), min(values), max(values), median(values), quartiles


# Size statistics of what is being summarized.
num_classes_per_pkg = [len(pkg_desc['classes']) for pkg_desc in goals.values()]
num_methods_per_class = [
    len(class_desc['methods'])
    for pkg_desc in goals.values()
    for class_desc in pkg_desc['classes'].values()
]

# (package count, then sum/min/max/median/quartiles of classes per package,
#  then sum/min/max/median/quartiles of methods per class)
(len(goals), *_distribution(num_classes_per_pkg), *_distribution(num_methods_per_class))


In [None]:
# Checkpoint: save the method-level summaries (skipped in print-only mode,
# where no summaries were produced).
if not only_print_prompt:
  write_to_json_file(goals, f"{project_name}-goals_1-{model}.json")

In [None]:
# goals = read_json_file(f"{project_name}-goals_1-{model}.json")
# goals

## Ask LLM to summarize classes (based on methods)

In [None]:
# Prompt sent once per class; the placeholders are filled with str.format.
# The braces of the JSON example on the last line are doubled so that
# str.format emits them literally instead of parsing them as a placeholder.
class_prompt_template = '''A Java {struct_type} `{struct_name}` contains the following field(s) and method(s):

Fields:

{fields}

Methods:

{methods}

Describe the responsibilities of this {struct_type}. Frame the class in one of the following role stereotypes:

- **Information Holder** is responsible for knowing facts and maintaining consistency of its information.

- **Service Provider** is responsible for handling requests and performing specific services.

- **Structurer** is responsible for managing connections and constraints among related things.

- **Controller** is responsible for making decisions, directing the work of others, and handling important events.

- **Coordinator** is responsible for managing the actions of a group of workers and facilitating communication and work of other objects.

- **User Interfacer** is responsible for transmitting user requests for action or display information that can be updated.

- **External Interfacer** is responsible for handling faulty conditions in other systems they interface to, relieving their clients of having to know about lower-level details and recovery strategies.

- **Internal Interfacer** is responsible for delegating external requests to objects in its neighborhood.

And in one of the following architectural layer:

- **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers.

- **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers.

- **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers.

- **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers.

Answer in well-formatted JSON {{ "Role-Stereotype": ..., "Layer": ..., "Responsibility": ... }}'''

# Track the package printed last so its header is shown only once.
current_pkg = None

for pkg_id, cls_id in tqdm(classes, desc='Processing classes'):

    class_goal = goals[pkg_id]['classes'][cls_id]

    # Skip classes that already carry a real description.
    if class_goal.get('description', '(no description)') != '(no description)':
        continue

    if current_pkg != pkg_id:
        current_pkg = pkg_id
        print('#', current_pkg)

    print('\t*', cls_id)

    clasz = nodes[cls_id]

    class_name = clasz['properties']['qualifiedName']
    class_kind = clasz['properties']['kind']
    # Map the graph's kind values to the wording used in the prompt.
    if class_kind == 'enumeration':
        class_kind = 'enum'
    elif class_kind == 'abstract':
        class_kind = 'abstract class'

    # Field ids are de-duplicated and sorted so the prompt (and therefore the
    # temperature-0 answer) is reproducible between runs; a graph without any
    # 'hasVariable' edges simply yields no fields.
    field_ids = sorted({edge['target']
                        for edge in edges.get('hasVariable', [])
                        if edge['source'] == cls_id})
    field_lines = [
        f"- `{remove_java_comments(nodes[field_id]['properties']['sourceText'])}`"
        for field_id in field_ids
    ]

    # A method without a stored summary (e.g. in print-only mode) is shown
    # with a placeholder instead of raising KeyError.
    method_lines = [
        f"- `{nodes[met_id]['properties']['simpleName']}`: "
        f"{method_goal.get('description', '(no description)')}"
        for met_id, method_goal in class_goal['methods'].items()
    ]

    prompt = class_prompt_template.format(
        struct_type=class_kind,
        struct_name=class_name,
        fields="\n".join(field_lines) if field_lines else "(no fields)",
        methods="\n".join(method_lines) if method_lines else "(no methods)")

    if only_print_prompt:
        print(prompt)
        print()
        continue

    response = None
    try:
        response = client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},
            messages=[
                {"role": "user", "content": prompt},
            ],
            max_tokens=1024,
            temperature=0)
        # The content can be None (e.g. a refusal); treat that as "no answer".
        description = response.choices[0].message.content or "{}"
    except Exception as error:
        # Keep going with the remaining classes; Ctrl-C still interrupts.
        description = "{}"
        print(f'\t\t! request failed: {type(error).__name__}: {error}')
        print(response)

    print('\t\t', description)

    try:
        parsed_description = json.loads(description)
    except json.JSONDecodeError as error:
        # A malformed or truncated reply must not abort the whole run.
        print(f'\t\t! invalid JSON in response: {error}')
        parsed_description = {}

    class_goal['description'] = parsed_description

print(prettify_json(goals))


In [None]:
# Checkpoint: save the method- and class-level summaries (skipped in
# print-only mode).
if not only_print_prompt:
  write_to_json_file(goals, f"{project_name}-goals_2-{model}.json")


In [None]:
# goals = read_json_file(f"{project_name}-goals_2-{model}.json")
# goals.keys()

## Ask LLM to summarize packages (based on classes)

In [None]:
# Prompt sent once per package; the placeholders are filled with str.format.
# The braces of the JSON example on the last line are doubled so that
# str.format emits them literally instead of parsing them as a placeholder.
package_prompt_template = '''Given a Java package `{pkg_name}` containing the following classes:

{classes}

What is the purpose of this package? Frame the package in one of the following architectural layer:

- **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers.

- **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers.

- **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers.

- **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers.

Answer in well-formatted JSON {{ "Layer": ..., "Purpose": ... }}'''

nl = '\n'


def _one_line(description):
    """Render a stored description on a single line.

    Class descriptions are stored as parsed JSON (usually a dict), while the
    placeholder for a missing one is a plain string; both are accepted.
    """
    text = description if isinstance(description, str) else json.dumps(description)
    return text.replace(nl, '')


for pkg_id in tqdm(packages, desc='Processing packages'):

    package = goals[pkg_id]

    # Skip packages that already carry a real description.
    if package.get('description', '(no description)') != '(no description)':
        continue

    print('#', pkg_id)

    # Nothing to summarize for a package without classes.
    if not package['classes']:
        continue

    package_name = nodes[pkg_id]['properties']['qualifiedName']

    class_lines = [
        f"- {nodes[cls_id]['properties']['kind']} `{clasz['qualifiedName']}`: "
        f"{_one_line(clasz.get('description', '(no description)'))}"
        for cls_id, clasz in package['classes'].items()
    ]

    prompt = package_prompt_template.format(
        pkg_name=package_name,
        classes="\n".join(class_lines))

    if only_print_prompt:
        print(prompt)
        continue

    response = None
    try:
        response = client.chat.completions.create(
            model=model,
            # Request JSON output, as the method and class prompts do; the
            # reply is parsed with json.loads below.
            response_format={"type": "json_object"},
            messages=[
                {"role": "user", "content": prompt},
            ],
            max_tokens=1024,
            temperature=0)
        # The content can be None (e.g. a refusal); treat that as "no answer".
        description = response.choices[0].message.content or '{}'
    except Exception as error:
        # Keep going with the remaining packages; Ctrl-C still interrupts.
        description = '{}'
        print(f'\t! request failed: {type(error).__name__}: {error}')
        print(response)

    print('\t', description)

    try:
        parsed_description = json.loads(description)
    except json.JSONDecodeError as error:
        # A malformed or truncated reply must not abort the whole run.
        print(f'\t! invalid JSON in response: {error}')
        parsed_description = {}

    package['description'] = parsed_description

print(prettify_json(goals))


In [None]:
# Checkpoint: save the method-, class- and package-level summaries (skipped
# in print-only mode).
if not only_print_prompt:
  write_to_json_file(goals, f"{project_name}-goals_3-{model}.json")


In [None]:
# Reload the final summaries from disk, replacing the in-memory `goals`.
# NOTE(review): this runs even when only_print_prompt is True, in which case
# the file was not written by this run (it may be missing or stale) — confirm
# that this is intended.
goals = read_json_file(f"{project_name}-goals_3-{model}.json")
goals.keys()

## Which methods/classes/packages could not be summarized by the LLM?

In [None]:
# Methods without a usable summary. A failed request or unparsable reply is
# stored as an empty object, and a method that was never processed has no
# 'description' key at all, so those cases are reported alongside the
# "(no description)" placeholder.
method_no_desc = [
    (pkg_name, class_name, method_name)
    for pkg_name, pkg_desc in goals.items()
    for class_name, class_desc in pkg_desc['classes'].items()
    for method_name, method_desc in class_desc['methods'].items()
    if method_desc.get('description') in (None, '', {}, "(no description)")
]

print(prettify_json(method_no_desc))


In [None]:
# Classes without a usable summary: a missing key, an empty object (stored
# after a failed request or unparsable reply) or the "(no description)"
# placeholder.
class_no_desc = [
    (pkg_name, class_name)
    for pkg_name, pkg_desc in goals.items()
    for class_name, class_desc in pkg_desc['classes'].items()
    if class_desc.get('description') in (None, '', {}, "(no description)")
]

print(prettify_json(class_no_desc))


In [None]:
# Packages without a usable summary: a missing key, an empty object (stored
# after a failed request or unparsable reply) or the "(no description)"
# placeholder.
package_no_desc = [
    pkg_name
    for pkg_name, pkg_desc in goals.items()
    if pkg_desc.get('description') in (None, '', {}, "(no description)")
]

print(prettify_json(package_no_desc))


## Add the summaries back to the graph nodes

In [None]:
# Copy every available summary onto the matching graph node's properties.
# An element that was never summarized has no 'description' key; it is left
# untouched instead of stopping the copy with a KeyError.
for pkg_id, package in goals.items():
    if 'description' in package:
        nodes[pkg_id]['properties']['description'] = package['description']

    for cls_id, clasz in package['classes'].items():
        if 'description' in clasz:
            nodes[cls_id]['properties']['description'] = clasz['description']

        for met_id, method in clasz['methods'].items():
            if 'description' in method:
                nodes[met_id]['properties']['description'] = method['description']

In [None]:
# Display the node table, now including the copied descriptions.
nodes

In [None]:
# Replace the graph's node list with the enriched node data, re-wrapped in the
# {'data': ...} envelope the input file used.
graph['elements']['nodes'] = [{'data':node_data} for node_data in nodes.values()]

In [None]:
# Save the graph, including the summaries, next to the other output files.
write_to_json_file(graph,f'{project_name}-with-summaries.json')