## Project Parameters

In [1]:
# Dataset identity and file locations used by the rest of the notebook.
project_name = "k9mail-5.304"
project_root = "Datasets/k9mail/"
src_path = "k9mail-5.304/k9mail/src/main/java/"
graph_path = "k9mail-5.304.json"

# Echo the configuration so it is recorded in the cell output.
project_name, project_root, src_path, graph_path

('k9mail-5.304',
 'Datasets/k9mail/',
 'k9mail-5.304/k9mail/src/main/java/',
 'k9mail-5.304.json')

## Imports

In [2]:
import random
import json

## Relationship Operations

In [3]:
def transitive_closure(tuples1, tuples2):
    """Compose two binary relations.

    Returns the set of pairs (a, c) for which some b exists such that
    (a, b) is in tuples1 and (b, c) is in tuples2. Despite the name, this is
    a single composition step, not an iterated closure.
    """
    # Index the second relation by its left element: b -> [c, ...]
    successors = {}
    for left, right in tuples2:
        successors.setdefault(left, []).append(right)

    # Join each (a, b) of the first relation with the targets recorded for b.
    return {
        (a, c)
        for a, b in tuples1
        for c in successors.get(b, ())
    }

# Example usage:
t1 = [(1, 2), (2, 3), (3, 4)]
t2 = [(2, 'a'), (3, 'b'), (4, 'c'), (4, 'd'), (3, 'c')]

result = transitive_closure(t1, t2)
print(result)


{(3, 'c'), (1, 'a'), (2, 'b'), (3, 'd'), (2, 'c')}


In [4]:
def flip_tuples(tuples):
    """Return the inverse relation: every pair (a, b) becomes (b, a)."""
    flipped_pairs = set()
    for first, second in tuples:
        flipped_pairs.add((second, first))
    return flipped_pairs

# Example usage:
t = [(1, 2), (3, 4), ('a', 'b')]
flipped = flip_tuples(t)

print(flipped)


{('b', 'a'), (2, 1), (4, 3)}


## Load javapers output

This is the "detailed" graph, i.e., it contains packages, classes, methods, variables. This is a labeled property graph, so a node has id, label, and properties, while an edge has source, target, type, and properties. In both, a property is an arbitrary key-value pair.

In [5]:
graph_file = project_root + graph_path
with open(graph_file, encoding="utf8") as handle:
    data = json.load(handle)

# Samples: the first node and the first edge, to illustrate what the records look like.
elements = data['elements']
elements['nodes'][0], elements['edges'][0]

({'data': {'id': 'com.fsck.k9.activity.compose.RecipientLoader.PROJECTION_CRYPTO_ADDRESSES',
   'properties': {'sourceText': 'private static final java.lang.String[] PROJECTION_CRYPTO_ADDRESSES = new java.lang.String[]{ "address", "uid_address" };',
    'visibility': 'private',
    'simpleName': 'PROJECTION_CRYPTO_ADDRESSES',
    'kind': 'field',
    'metaSrc': 'source code'},
   'labels': ['Variable']}},
 {'data': {'id': 'ac7cba3e978bef0c69a74a9b1b4a1e08',
   'source': 'com.fsck.k9.notification.NotificationController.getAccountName(com.fsck.k9.Account).account',
   'label': 'type',
   'properties': {'weight': 1, 'metaSrc': 'source code'},
   'target': 'com.fsck.k9.Account'}})

Put the data in dictionaries with simpler structure for the sake of further processing.

The nodes are now represented as a dictionary where the key is a tuple of (node_id, node_label) and the value is the node properties.

Similarly, the edges are now represented as a dictionary where the key is a tuple of (source, target, edge_type) and the value is the edge properties.

In [6]:
# Flatten the raw JSON elements into two lookup dictionaries:
#   nodes: (node_id, first_label)       -> node properties
#   edges: (source, target, edge_label) -> edge properties
# Elements that share a key collapse into a single entry (the last one wins).
nodes = {
    (node['data']['id'], node['data']['labels'][0]): node['data']['properties']
    for node in data['elements']['nodes']
}
edges = {
    (edge['data']['source'], edge['data']['target'], edge['data']['label']): edge['data']['properties']
    for edge in data['elements']['edges']
}

# random.sample() requires a sequence; passing a dict view directly raises
# TypeError on Python 3.11+, so the items are copied into lists first.
(len(nodes), len(edges)), random.sample(list(nodes.items()), 5), random.sample(list(edges.items()), 5)

((13432, 28061),
 [(('com.fsck.k9.notification.CertificateErrorNotifications.com.fsck.k9.notification.CertificateErrorNotifications(com.fsck.k9.notification.NotificationController).controller',
    'Variable'),
   {'simpleName': 'controller',
    'kind': 'parameter',
    'metaSrc': 'source code'}),
  (('com.fsck.k9.activity.compose.RecipientPresenter.onToFocused()',
    'Operation'),
   {'sourceText': 'void onToFocused() {\r\n    lastFocusedType = com.fsck.k9.mail.Message.RecipientType.TO;\r\n}',
    'docComment': '',
    'visibility': 'default',
    'simpleName': 'onToFocused()',
    'kind': 'method',
    'metaSrc': 'source code'}),
  (('com.fsck.k9.activity.MessageCompose.INVALID_DRAFT_ID', 'Variable'),
   {'sourceText': 'private static final long INVALID_DRAFT_ID = com.fsck.k9.controller.MessagingController.INVALID_MESSAGE_ID;',
    'visibility': 'private',
    'simpleName': 'INVALID_DRAFT_ID',
    'kind': 'field',
    'metaSrc': 'source code'}),
  (('com.fsck.k9.provider.EmailProvi

Now I'm going to convert the data (yet again) into "relationships". A relationship R is a set of tuples (A,B) in which A and B are nodes in the graph, and there is an edge of type R from A to B.

First, let's see which relationship types exist:

In [7]:
# Each edge key is (source, target, type); collect the distinct types.
rel_types = {edge_key[2] for edge_key in edges}
rel_types

{'contains',
 'hasParameter',
 'hasScript',
 'hasVariable',
 'instantiates',
 'invokes',
 'returnType',
 'specializes',
 'type'}

This is the real conversion:

In [8]:
# One (initially empty) set of (source, target) pairs per relationship type.
detailed_rels = {rel_type: set() for rel_type in rel_types}

# Distribute every edge into the set of its own type; edges whose type is not
# listed in rel_types are ignored.
for source, target, typ in edges:
    if typ in detailed_rels:
        detailed_rels[typ].add((source, target))

[(rel_type, len(pairs)) for rel_type, pairs in detailed_rels.items()]

[('contains', 925),
 ('type', 5238),
 ('hasParameter', 5132),
 ('hasVariable', 2984),
 ('specializes', 159),
 ('invokes', 6266),
 ('returnType', 1983),
 ('instantiates', 721),
 ('hasScript', 4653)]

Some samples of the relationships we get from the dataset:

In [9]:
# Draw up to two example pairs per relationship. Each set is copied to a list
# because random.sample() only accepts sequences (TypeError on Python 3.11+),
# and the sample size is capped so a relationship with fewer than two pairs
# does not raise ValueError.
{rel_type: random.sample(list(rel), min(2, len(rel))) for rel_type, rel in detailed_rels.items()}

{'contains': [('com.fsck.k9.activity.setup.OpenPgpAppSelectDialog',
   'com.fsck.k9.activity.setup.OpenPgpAppSelectDialog$ApgDeprecationDialogFragment'),
  ('com.fsck.k9.notification', 'com.fsck.k9.notification.NotificationIds')],
 'type': [('com.fsck.k9.preferences.TimePickerPreference.defaultValue',
   'java.lang.String'),
  ('com.fsck.k9.fragment.ConfirmationDialogFragment.newInstance(int,java.lang.String,java.lang.String,java.lang.String,java.lang.String).title',
   'java.lang.String')],
 'hasParameter': [('com.fsck.k9.message.MessageBuilder.setAttachments(java.util.List)',
   'com.fsck.k9.message.MessageBuilder.setAttachments(java.util.List).attachments'),
  ('com.fsck.k9.notification.NotificationActionCreator.createViewMessagePendingIntent(com.fsck.k9.activity.MessageReference,int)',
   'com.fsck.k9.notification.NotificationActionCreator.createViewMessagePendingIntent(com.fsck.k9.activity.MessageReference,int).messageReference')],
 'hasVariable': [('com.fsck.k9.activity.MessageIn

I'm going to generate the abstract graph from the detailed graph above.

For the nodes, I'm going to only take the packages (Containers) and classes (Structures).

In [10]:
# Node keys are (node_id, label); keep only the ids with the wanted label.
containers = {node_id for (node_id, label) in nodes if label == 'Container'}
structures = {node_id for (node_id, label) in nodes if label == 'Structure'}

# random.sample() requires a sequence; sampling a set directly raises
# TypeError on Python 3.11+, so each set is copied into a list first.
random.sample(list(containers), 5), random.sample(list(structures), 5)

(['com.fsck.k9.fragment',
  'com.fsck.k9.mailstore.migrations',
  'com.fsck.k9.view',
  'com.fsck.k9.service',
  'com.fsck.k9.power'],
 ['com.fsck.k9.mailstore.AttachmentViewInfo',
  'com.fsck.k9.mailstore.StorageManager',
  'com.fsck.k9.preferences.Settings$V',
  'com.fsck.k9.service.ShutdownReceiver',
  'com.fsck.k9.ui.compose.QuotedMessagePresenter'])

For the edges, I compose the detailed relationships. For example, C1 "calls" C2 means: the class C1 "hasScript" S1, S1 "invokes" the script S2, and the class C2 "hasScript" S2.

In [11]:
# Shorthand for the relation that is reused by most of the compositions below.
has_script = detailed_rels['hasScript']

# 'contains' and 'specializes' are reused as-is (the same set objects, not
# copies); every other relationship is obtained by composing detailed ones.
abstract_rels = {
    'contains': detailed_rels['contains'],
    'specializes': detailed_rels['specializes'],
    # hasVariable, then type
    'holds': transitive_closure(detailed_rels['hasVariable'], detailed_rels['type']),
    # hasScript, then hasParameter, then type
    'accepts': transitive_closure(
        transitive_closure(has_script, detailed_rels['hasParameter']),
        detailed_rels['type'],
    ),
    # hasScript, then returnType
    'returns': transitive_closure(has_script, detailed_rels['returnType']),
    # hasScript, then instantiates
    'constructs': transitive_closure(has_script, detailed_rels['instantiates']),
    # hasScript, then invokes, then hasScript reversed (script back to its owner)
    'calls': transitive_closure(
        transitive_closure(has_script, detailed_rels['invokes']),
        flip_tuples(has_script),
    ),
}

[(rel_type, len(pairs)) for rel_type, pairs in abstract_rels.items()]

[('contains', 925),
 ('specializes', 159),
 ('holds', 848),
 ('accepts', 1107),
 ('returns', 1129),
 ('constructs', 502),
 ('calls', 1351)]

Some samples of the abstracted relationships:

In [12]:
# Draw up to two example pairs per abstract relationship. Each set is copied to
# a list because random.sample() only accepts sequences (TypeError on Python
# 3.11+); the size is capped so a relationship with fewer than two pairs does
# not raise ValueError.
{rel_type: random.sample(list(rel), min(2, len(rel))) for rel_type, rel in abstract_rels.items()}

{'contains': [('com.fsck.k9.activity.Accounts',
   'com.fsck.k9.activity.Accounts$ExportAsyncTask'),
  ('com.fsck.k9.service.SleepService',
   'com.fsck.k9.service.SleepService$SleepDatum')],
 'specializes': [('com.fsck.k9.activity.Accounts$ImportSelectionDialog',
   'com.fsck.k9.activity.misc.NonConfigurationInstance'),
  ('com.fsck.k9.activity.ChooseAccount', 'com.fsck.k9.activity.AccountList')],
 'holds': [('com.fsck.k9.service.PollService', 'java.lang.String'),
  ('com.fsck.k9.ui.messageview.MessageViewFragment',
   'com.fsck.k9.activity.MessageLoaderHelper$MessageLoaderCallbacks')],
 'accepts': [('com.fsck.k9.preferences.SettingsImporter',
   'com.fsck.k9.preferences.Storage'),
  ('com.fsck.k9.preferences.IdentitySettings$SignatureSetting',
   'java.lang.String')],
 'returns': [('com.fsck.k9.message.signature.HtmlSignatureRemover$StripSignatureFilter',
   'com.fsck.k9.message.signature.HtmlSignatureRemover$StripSignatureFilter'),
  ('com.fsck.k9.activity.compose.IdentityAdapter$Id

Now moving on to an example of call lifting; for that we focus on relationships "contains" and "calls".

In [13]:
# random.sample() requires a sequence (a set raises TypeError on Python 3.11+).
random.sample(list(abstract_rels['contains']), 10)

[('com.fsck.k9.message.extractors',
  'com.fsck.k9.message.extractors.AttachmentCounter'),
 ('com.fsck.k9.mailstore', 'com.fsck.k9.mailstore.MessageHelper'),
 ('com.fsck.k9.mailstore', 'com.fsck.k9.mailstore.LocalFolder'),
 ('com.fsck.k9.activity.setup', 'com.fsck.k9.activity.setup.Prefs'),
 ('com.fsck.k9.activity', 'com.fsck.k9.activity.ColorPickerDialog'),
 ('com.fsck.k9.message', 'com.fsck.k9.message.PgpMessageBuilder'),
 ('com.fsck.k9', 'com.fsck.k9.Globals'),
 ('com.fsck.k9.provider.EmailProvider',
  'com.fsck.k9.provider.EmailProvider$MessageColumns'),
 ('com.fsck.k9', 'com.fsck.k9.Throttle$MyTimerTask$HandlerRunnable'),
 ('com.fsck.k9.preferences',
  'com.fsck.k9.preferences.GlobalSettings$TimeSetting')]

In [14]:
# random.sample() requires a sequence (a set raises TypeError on Python 3.11+).
random.sample(list(abstract_rels['calls']), 10)

[('com.fsck.k9.activity.setup.FontSizeSettings',
  'com.fsck.k9.preferences.StorageEditor'),
 ('com.fsck.k9.mailstore.migrations.Migrations',
  'com.fsck.k9.mailstore.migrations.MigrationTo54'),
 ('com.fsck.k9.helper.K9AlarmManager', 'com.fsck.k9.helper.K9AlarmManager'),
 ('com.fsck.k9.activity.compose.RecipientLoader',
  'com.fsck.k9.activity.compose.RecipientLoader'),
 ('com.fsck.k9.preferences.GlobalSettings$DirectorySetting',
  'com.fsck.k9.preferences.Settings$SettingsDescription'),
 ('com.fsck.k9.message.extractors.MessageFulltextCreator',
  'com.fsck.k9.message.extractors.EncryptionDetector'),
 ('com.fsck.k9.mailstore.migrations.Migrations',
  'com.fsck.k9.mailstore.migrations.MigrationTo34'),
 ('com.fsck.k9.helper.UnreadWidgetProperties', 'com.fsck.k9.Preferences'),
 ('com.fsck.k9.provider.DecryptedFileProvider$DecryptedFileProviderCleanupReceiver',
  'com.fsck.k9.provider.DecryptedFileProvider'),
 ('com.fsck.k9.activity.loader.AttachmentInfoLoader',
  'com.fsck.k9.activity.mis

Here's the code to lift the calls.

In [15]:
# Example: lift calls one level up the containment hierarchy.
contains = abstract_rels['contains']

# Step 1: (parent, callee) -- the parent contains something that calls callee.
calls_from_parent = transitive_closure(contains, abstract_rels['calls'])

# Step 2: (parent, callee_parent) -- replace each callee with what contains it.
lifted_calls = transitive_closure(calls_from_parent, flip_tuples(contains))
lifted_calls

{('com.fsck.k9', 'com.fsck.k9'),
 ('com.fsck.k9', 'com.fsck.k9.Account'),
 ('com.fsck.k9', 'com.fsck.k9.K9'),
 ('com.fsck.k9', 'com.fsck.k9.PRNGFixes'),
 ('com.fsck.k9', 'com.fsck.k9.Throttle'),
 ('com.fsck.k9', 'com.fsck.k9.controller'),
 ('com.fsck.k9', 'com.fsck.k9.helper'),
 ('com.fsck.k9', 'com.fsck.k9.mailstore'),
 ('com.fsck.k9', 'com.fsck.k9.power'),
 ('com.fsck.k9', 'com.fsck.k9.preferences'),
 ('com.fsck.k9', 'com.fsck.k9.preferences.GlobalSettings'),
 ('com.fsck.k9', 'com.fsck.k9.provider'),
 ('com.fsck.k9', 'com.fsck.k9.search'),
 ('com.fsck.k9', 'com.fsck.k9.service'),
 ('com.fsck.k9', 'com.fsck.k9.widget.list'),
 ('com.fsck.k9.PRNGFixes', 'com.fsck.k9'),
 ('com.fsck.k9.PRNGFixes', 'com.fsck.k9.PRNGFixes'),
 ('com.fsck.k9.activity', 'com.fsck.k9'),
 ('com.fsck.k9.activity', 'com.fsck.k9.activity'),
 ('com.fsck.k9.activity', 'com.fsck.k9.activity.Accounts'),
 ('com.fsck.k9.activity', 'com.fsck.k9.activity.AlternateRecipientAdapter'),
 ('com.fsck.k9.activity', 'com.fsck.k9.a

In [16]:
def find_nodes(nodes, id):
    """Return every (node_id, label) key in `nodes` whose node_id equals `id`.

    `nodes` is only iterated, so it may be the nodes dictionary or any
    iterable of (node_id, label) pairs. The result is a list of tuples in
    iteration order.
    """
    matches = []
    for key in nodes:
        node_id, label = key
        if node_id == id:
            matches.append((node_id, label))
    return matches

## networkx Stuff

In [None]:
import networkx as nx

# Mirror the labelled property graph in networkx. A MultiDiGraph is used so
# that several edges of different types can connect the same pair of nodes.
g = nx.MultiDiGraph()

# The loop variable is deliberately not called `id`: a for-loop at cell top
# level binds its variables globally, which would shadow the builtin id() for
# the rest of the kernel session.
for (node_id, label), properties in nodes.items():
    g.add_node(node_id, label=label, **properties)

for (source, target, typ), properties in edges.items():
    g.add_edge(source, target, type=typ, **properties)

g.number_of_nodes(), g.number_of_edges()

In [None]:
# Peek at the first ten adjacency entries; each one is a (node, neighbours) pair.
for adjacency_entry in list(g.adj.items())[:10]:
    print(adjacency_entry)
    

In [None]:
# Map each method/constructor id to its source text. Node properties are
# optional (some nodes carry no 'sourceText'), so nodes without it are
# skipped instead of raising KeyError.
method_sources = {
    node_id: properties['sourceText']
    for (node_id, label), properties in nodes.items()
    if label in ('Operation', 'Constructor') and 'sourceText' in properties
}
method_sources

## LLM Stuff

In [None]:
from txtai.pipeline import LLM

# llm = LLM("Open-Orca/Mistral-7B-OpenOrca")
# trust_remote_code is a boolean flag; the string 'True' only worked because
# any non-empty string is truthy (so 'False' would have enabled it too).
llm = LLM("microsoft/phi-2", trust_remote_code=True)
# llm = LLM("lmsys/vicuna-7b-v1.5")

# random.sample() requires a sequence (a dict view raises TypeError on Python
# 3.11+); the size is capped in case fewer than ten methods are available.
sampled_methods = random.sample(list(method_sources.items()), min(10, len(method_sources)))

for method_name, method_src in sampled_methods:
	answer = llm(
	f"""
	Answer the following question using the provided context.

	Question:
	What does this Java method do?

	Context:
	{method_src}
	"""
	)
	# A bare call inside a loop displays nothing in a notebook, so print each
	# answer together with the method it describes.
	print(f"### {method_name}\n{answer}\n")