## Imports & helper functions

In [None]:
%pip install --upgrade py2neo
%pip install --upgrade openai

In [None]:
from py2neo import Graph

In [None]:
from itertools import groupby

In [None]:
import json


def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded explicitly as UTF-8 so the result does not depend
    on the platform's default locale encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (for example a dict or a list).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def parse_json(json_string):
    """Decode a JSON-formatted string and return the resulting Python value."""
    return json.loads(json_string)


def dict_to_pretty_json(dictionary):
    """Serialize ``dictionary`` to a JSON string indented by two spaces."""
    return json.dumps(dictionary, indent=2)


def write_dict_to_json(dictionary, file_path):
    """Write ``dictionary`` to ``file_path`` as JSON indented by two spaces.

    An existing file at ``file_path`` is overwritten.
    """
    with open(file_path, 'w') as output:
        json.dump(dictionary, fp=output, indent=2)


In [None]:
import configparser

def read_ini_file(file_path):
    """Read an INI file into a plain nested dictionary.

    Args:
        file_path: Path of the INI file.

    Returns:
        A dict mapping each section name to a dict of that section's
        option names and values. The dict is empty when the parser finds
        no sections.
    """
    parser = configparser.ConfigParser()
    parser.read(file_path)
    sections = {}
    for section_name in parser.sections():
        sections[section_name] = dict(parser.items(section_name))
    return sections


In [None]:
import openai

## Parameters

In [None]:
# Dry-run switch: when True, the summarization cells only print the prompts
# they build and never call the OpenAI API (and no result files are written).
only_print_prompt = False # set to True to skip the API calls and just print the prompts

In [None]:
# Credentials are read from a local INI file; this notebook uses the [neo4j]
# section (url, username, password) and the [openai] section (apikey).
secrets = read_ini_file('secrets.ini')
# Prefix of the JSON files in which the generated summaries are saved.
project_name = 'k9mail'

## Connect to Neo4j

In [None]:
# Graph handle used by the Cypher queries below; the connection URL and the
# (username, password) pair come from the [neo4j] section of the secrets file.
graph = Graph(
    secrets['neo4j']['url'],
    auth=(
        secrets['neo4j']['username'],
        secrets['neo4j']['password'],
    ),
)

## Connect to OpenAI

In [None]:
# API key for the openai client, taken from the [openai] section of the secrets file.
openai.api_key = secrets['openai']['apikey']
# Chat model name passed to the completion requests in this notebook.
model = "gpt-3.5-turbo"


## Packages to be inspected

In [None]:
# For k9mail
# Qualified names of the packages to summarize. Each name must appear only
# once: every entry triggers a graph query and one LLM request per method.
packages = [
    "com.fsck.k9",
    "com.fsck.k9.account",
    "com.fsck.k9.activity",
    "com.fsck.k9.activity.compose",
    "com.fsck.k9.activity.loader",
    "com.fsck.k9.activity.misc",
    "com.fsck.k9.activity.setup",

    "com.fsck.k9.autocrypt",
    "com.fsck.k9.cache",
    "com.fsck.k9.controller",
    "com.fsck.k9.crypto",
    "com.fsck.k9.fragment",
    "com.fsck.k9.helper",
    "com.fsck.k9.helper.jsoup",

    "com.fsck.k9.mailstore",
    "com.fsck.k9.mailstore.migrations",
    "com.fsck.k9.mailstore.util",
    "com.fsck.k9.message",
    "com.fsck.k9.message.extractors",
    "com.fsck.k9.message.html",
    "com.fsck.k9.message.quote",
    "com.fsck.k9.message.signature",

    "com.fsck.k9.ui",
    "com.fsck.k9.ui.compose",
    "com.fsck.k9.ui.crypto",
    "com.fsck.k9.ui.dialog",
    "com.fsck.k9.ui.message",
    "com.fsck.k9.ui.messageview",

    "com.fsck.k9.service",
    "com.fsck.k9.setup",
    "com.fsck.k9.search",
    "com.fsck.k9.view",
    "com.fsck.k9.widget.list",

    "com.fsck.k9.notification",
    "com.fsck.k9.power",
    "com.fsck.k9.preferences",
    "com.fsck.k9.provider",
    "com.fsck.k9.remotecontrol",
]

# For k9mail-library
# packages = [
#     "com.fsck.k9.mail",
    
#     "com.fsck.k9.mail.filter",
#     "com.fsck.k9.mail.helper",
#     "com.fsck.k9.mail.internet",
#     "com.fsck.k9.mail.message",
    
#     "com.fsck.k9.mail.oauth",
#     "com.fsck.k9.mail.power",
#     "com.fsck.k9.mail.ssl",
#     "com.fsck.k9.mail.store.imap",
#     "com.fsck.k9.mail.store.pop3",
#     "com.fsck.k9.mail.store.webdav",
    
#     "com.fsck.k9.mail.store",
# ]

## Ask LLM to summarize methods

In [None]:
# Ask the LLM for a one-sentence summary of every public method of every
# class contained in the selected packages.
# Result shape: method_goals[package]['<kind> <class qualified name>'] is a
# list of (method simple name, description) tuples.
method_goals = dict()
method_prompt_template = '''This is method `{}` of a Java {} `{}`:

```java
{}
```

In one sentence, what this method does is it'''
# The package name is passed as a query parameter instead of being spliced
# into the Cypher text. ORDER BY keeps all rows of one class adjacent:
# itertools.groupby only merges consecutive rows with equal keys, so without
# it a class could be split into several groups and its method list reset.
method_query = (
    'MATCH (p:Container)-[:contains]->(c:Structure)-[:hasScript]->(m:Operation) '
    'WHERE p.qualifiedName = $package AND m.visibility = "public" AND m.kind = "method" '
    'RETURN p.qualifiedName, c.qualifiedName, c.kind, m.simpleName, m.sourceText '
    'ORDER BY c.qualifiedName, c.kind'
)
for package in packages:
  print(package)
  node_data = graph.run(method_query, package=package)
  method_goals[package] = dict()
  grouped_node_data = groupby(node_data, lambda x: (x['c.kind'], x['c.qualifiedName']))
  for (kind, class_name), methods_data in grouped_node_data:
    print("  " + class_name)
    # Rename these two kinds so they read naturally in the prompt
    # ("a Java enum ...", "a Java abstract class ...").
    if kind == 'enumeration':
      kind = 'enum'
    elif kind == 'abstract':
      kind = 'abstract class'
    class_key = f'{kind} {class_name}'
    method_goals[package][class_key] = []
    for row in methods_data:
      prompt = method_prompt_template.format(
          row["m.simpleName"],
          kind,
          row["c.qualifiedName"],
          row["m.sourceText"])
      if only_print_prompt:
        # Dry run: show the prompt and record nothing.
        print(prompt)
        continue
      try:
        # NOTE(review): openai.ChatCompletion is the pre-1.0 interface of the
        # openai package - confirm the installed version still provides it.
        response = openai.ChatCompletion.create(
          model=model,
          messages=[{
            "role": "user",
            "content": prompt
          }])
        description = response['choices'][0]['message']['content']
      except Exception as error:
        # Best effort: one failed request must not abort the whole run, but
        # the reason is reported. KeyboardInterrupt is deliberately not
        # caught, so a long run can still be stopped by the user.
        print(f"    could not summarize {row['m.simpleName']}: {error!r}")
        description = "(no description)"
      method_goals[package][class_key].append((row['m.simpleName'], description))
    print("    " + str(method_goals[package][class_key]))
  print("  " + str(method_goals[package]))
  print()
print(method_goals)


In [None]:
# Number of packages that have an entry in method_goals.
len(method_goals)

In [None]:
# Save the method summaries to '<project_name>-method_goals.json'; skipped in
# dry-run mode, where no summaries were collected.
if not only_print_prompt:
  write_dict_to_json(method_goals, f"{project_name}-method_goals.json")

In [None]:
# method_goals = read_json_file(f"{project_name}-method_goals.json")


## Ask LLM to summarize classes (based on methods)

In [None]:
# Ask the LLM for a one-sentence summary of each class, based on the
# one-sentence summaries of its public methods.
# Result shape: class_goals[package] is a list of
# ((kind, class name), description) tuples.
class_goals = dict()
class_prompt_template = '''A Java {} `{}` contains the following public method(s):

{}

In one sentence, what the {} `{}` does is it'''
for package,classes in method_goals.items():
  print(package)
  class_goals[package] = []
  for key,methods in classes.items():
    # Keys look like '<kind> <class name>'. The kind may itself contain a
    # space ('abstract class'), so only the last token is the class name.
    tokens = key.split(' ')
    kind = ' '.join(tokens[:-1])
    class_name = tokens[-1]
    if methods:
      prompt = class_prompt_template.format(
        kind,
        class_name,
        "\n".join([f"- `{name}`: {desc}" for name, desc in methods]),
        kind,
        class_name)
    else:
      prompt = f'A Java {kind} `{class_name}` contains no public methods. In one sentence, based on the name alone, what the {kind} `{class_name}` does is it'
    if only_print_prompt:
      # Dry run: show the prompt and record nothing.
      print(prompt)
      continue
    try:
      # NOTE(review): openai.ChatCompletion is the pre-1.0 interface of the
      # openai package - confirm the installed version still provides it.
      response = openai.ChatCompletion.create(
        model=model,
        messages=[{
            "role": "user",
            "content": prompt
        }])
      description = response['choices'][0]['message']['content']
    except Exception as error:
      # Best effort: keep going after a failed request, but report why it
      # failed. KeyboardInterrupt is deliberately not caught.
      print(f"  could not summarize {class_name}: {error!r}")
      description = '(no description)'
    class_goals[package].append(((kind, class_name), description))
  print("  " + str(class_goals[package]))
print()
print(class_goals)


In [None]:
# Save the class summaries to '<project_name>-class_goals.json'; skipped in
# dry-run mode, where no summaries were collected.
if not only_print_prompt:
  write_dict_to_json(class_goals, f"{project_name}-class_goals.json")


In [None]:
# class_goals = read_json_file(f"{project_name}-class_goals.json")


## Ask LLM to summarize packages (based on classes)

In [None]:
# Ask the LLM for a one-sentence summary of each package, based on the
# one-sentence summaries of its classes.
# Result shape: package_goals is a list of (package, description) tuples.
package_goals = []
package_prompt_template = '''Given a Java package `{}` containing the following classes:

{}

In one sentence, what the package `{}` does is it'''
for package, classes in class_goals.items():
  prompt = package_prompt_template.format(
    package,
    "\n".join([f"- {kind} `{name}`: {desc}" for (kind, name), desc in classes]),
    package)
  if only_print_prompt:
    # Dry run: show the prompt and record nothing.
    print(prompt)
    continue
  try:
    # NOTE(review): openai.ChatCompletion is the pre-1.0 interface of the
    # openai package - confirm the installed version still provides it.
    response = openai.ChatCompletion.create(
      model=model,
      messages=[{
          "role": "user",
          "content": prompt
      }])
    package_goal = (package, response['choices'][0]['message']['content'])
  except Exception as error:
    # Best effort: keep going after a failed request, but report why it
    # failed. KeyboardInterrupt is deliberately not caught.
    print(f"could not summarize {package}: {error!r}")
    package_goal = (package, '(no description)')
  print(package_goal)
  package_goals.append(package_goal)
print()
print(package_goals)


In [None]:
# Save the package summaries (a list of (package, description) pairs) to
# '<project_name>-package_goals.json'; skipped in dry-run mode.
if not only_print_prompt:
  write_dict_to_json(package_goals, f"{project_name}-package_goals.json")


In [None]:
# package_goals = read_json_file(f"{project_name}-package_goals.json")


## Which methods/classes/packages could not be summarized by the LLM?

In [None]:
# Collect (package, class key, method name) for every method whose stored
# description is the "(no description)" placeholder.
method_no_desc = [
    (package_name, class_key, method_name)
    for package_name, classes_in_package in method_goals.items()
    for class_key, method_entries in classes_in_package.items()
    for method_name, description in method_entries
    if description == "(no description)"
]

print(dict_to_pretty_json(method_no_desc))


In [None]:
# Collect (package, class identifier) for every class whose stored
# description is the "(no description)" placeholder.
class_no_desc = [
    (package_name, class_id)
    for package_name, class_entries in class_goals.items()
    for class_id, description in class_entries
    if description == "(no description)"
]

print(dict_to_pretty_json(class_no_desc))


In [None]:
# Collect the packages whose stored description is the "(no description)"
# placeholder. package_goals is a flat list of (package, description) pairs,
# so each pair is unpacked directly; iterating into the pair would compare
# single characters of the strings and never match.
package_no_desc = [
    package_name
    for package_name, description in package_goals
    if description == "(no description)"
]

print(dict_to_pretty_json(package_no_desc))


## Print everything

In [None]:
# Print a Markdown-style report: one section per package, one subsection per
# class, and one bullet per public method, each with its generated summary.
for package, package_desc in package_goals:
  print(f'# package `{package}`\n')
  print(f'This package {package_desc}\n')
  print('This package contains the following class(es):\n')
  for (kind, class_name), class_desc in class_goals[package]:
    print(f'## {kind} `{class_name}`\n')
    print(f'This {kind} {class_desc}\n')
    print('This class contains the following public method(s):\n')
    # method_goals is keyed by '<kind> <class name>' within each package.
    class_key = f'{kind} {class_name}'
    for method, method_desc in method_goals[package][class_key]:
      print(f'- `{method}` {method_desc}')
    print()
  print()

## Generate prompts for decomposing package goals into subgoals

This part requires manual tuning of prompts and "conditioning" the LLM to get the desired results.

In [None]:
# For each package, print a prompt that lists its classes with their
# summaries and ends with an open numbered list for the LLM to continue.
for package, package_desc in package_goals:
  print(f'Java package `{package}` contains the following class(es):\n')
  for (kind, class_name), class_desc in class_goals[package]:
    class_line = f'- {kind} `{class_name}`: {class_desc}'
    print(class_line)
  print()
  subgoal_request = f'The package `{package}` {package_desc} Its subgoals are (a subgoal encompasses several classes with common or similar goals):\n\n1.\n\n'
  print(subgoal_request)
