In [1]:
import json
import os
from dataclasses import dataclass
from glob import glob
from pprint import pprint

from openai import OpenAI, embeddings
from scipy import spatial

from python.mutater import DESTDIR

# Read the API key from the environment so the secret is never stored in the
# notebook source. If the variable is unset this is None, in which case the
# OpenAI client performs its own environment lookup and raises when no key is
# available, rather than sending requests with a blank key.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)
# Embedding model used by default for every get_embedding() call.
MODEL = "text-embedding-ada-002"


In [2]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# index-based access (e.g. mutation_files[0]) and the order in which records
# are read are the same on every run.
mutation_files = sorted(glob(f"{DESTDIR}/*.json"))
def get_embedding(text, model=MODEL):
   """Return the embedding of ``text`` produced by the given model.

   Every newline in ``text`` is replaced by a single space, the result is
   sent as a one-element input list to ``client.embeddings.create``, and the
   ``embedding`` attribute of the first item in the response data is returned.

   Args:
      text: Source text to embed.
      model: Name of the embedding model; defaults to ``MODEL``.
   """
   # Splitting on "\n" and re-joining with " " swaps each newline for a space.
   single_line = " ".join(text.split("\n"))
   result = client.embeddings.create(input=[single_line], model=model)
   first_item = result.data[0]
   return first_item.embedding
   
@dataclass
class FuncEmbeddingRecord:
   """Container pairing one function with data about its mutated variants.

   NOTE(review): field meanings below are inferred from the field names and
   annotations only; the class is not used in the cells visible here.
   """
   # Presumably the function's name.
   func_name: str
   # Presumably the unmutated source text.
   original_str: str
   # A list of float vectors; presumably one embedding per mutated variant.
   mutated_embeddings: list[list[float]]
   # A list of strings; presumably the source text of each mutated variant.
   mutated_str: list[str]

# Observations

- The two largest functions are the closest to what our mutations would look like, and their cosine similarity is 0.68.
- For the largest function, the largest and smallest similarities to its mutations are both 0.85.
- For the smallest function, the largest and smallest similarities to its mutations are 0.92 and 0.82.

# User-specified similarity

In [3]:
# Hand-written baseline: source text of a small function that prints 0..9 and
# returns 1. It is embedded as-is and compared against `mut`.
orig = '''
def main():
   for i in range(10):
      print(i)
   return 1
'''

# Hand-written mutated variant of `orig`: the same function with an
# `if False:` block inserted at the top of the loop body. The block holds five
# numbered snippets (# 1 .. # 5) that never execute because of the guard.
# Everything between the triple quotes is data sent to the embedding model,
# so it must not be reformatted or corrected.
mut = '''
def main():
   for i in range(10):
      if False:
         # 1
         i = 10
         # 2
         while(True):
            print("Mutation")
         # 3
         n = 10
         # Compute nth fibbonacci number
         dp = [0,1]
         for i in range(2,n+1):
            dp.append(dp[i-1] + dp[i-2])
         return dp[n]
         # 4
         def dfs(node):
            if node == None:
                  return []
            left = dfs(node.left)
            right = dfs(node.right)
            return left + [node.val] + right
         return dfs(root)
         # 5
         length = 15
         if length <= 0:
            return []
         elif length == 1:
            return [0]

         sequence = [0, 1]
         while len(sequence) < length:
            next_value = sequence[-1] + sequence[-2]
            sequence.append(next_value)

         return sequence
      print(i)
   return 1
'''

# Alternative input (disabled): take the first record of the first mutation
# file instead of the hand-written strings above.
# file_path = mutation_files[0]
# with open(file_path, "r") as f:
#     func_records = json.load(f)
#     rec = func_records[0]

# Embed the original and the mutated source, then show both raw vectors.
orig_embedding = get_embedding(orig)
mut_embedding = get_embedding(mut)
for embedding in (orig_embedding, mut_embedding):
    print(embedding)

# scipy reports cosine *distance*; similarity is its complement.
cosine_distance = spatial.distance.cosine(orig_embedding, mut_embedding)
print(1 - cosine_distance)

[-0.007512002717703581, -0.008224775083363056, -0.005878823809325695, -0.005193942226469517, 0.0071525173261761665, 0.015755372121930122, -0.033097438514232635, 0.0004141053359489888, -0.016970183700323105, -0.0035855555906891823, 0.012476618401706219, 0.012303073890507221, 0.011838221922516823, 0.020193155854940414, 0.006898398511111736, -0.0013511688448488712, -0.004893338307738304, -0.0035483676474541426, 0.02382519654929638, -0.0017617876874282956, 0.000731753942091018, -0.0013318000128492713, -0.013362935744225979, 0.010834142565727234, -0.008832180872559547, 0.03002321906387806, 0.017540402710437775, -0.00578585360199213, 0.020887333899736404, -0.03027113899588585, 0.024383017793297768, -0.004003921989351511, -0.00774132926017046, -0.02724650502204895, 0.022399650886654854, 0.017788322642445564, -0.016387570649385452, -0.004626823589205742, 0.02130880020558834, -0.015817351639270782, 0.009501568041741848, -0.0008971636998467147, -0.0060368734411895275, -0.014119094237685204, -0.0

# Similarity of the two smallest functions

In [4]:
import heapq

# Min-heap of (length of original source, original source, mutated variants),
# built from every record of every mutation file; the record with the shortest
# original source ends up at index 0.
min_heap = []
for file_path in mutation_files:
    with open(file_path, "r") as f:
        func_records = json.load(f)
    # The records are fully loaded, so the file no longer needs to stay open.
    for rec in func_records:
        original_src = rec["original"]
        heapq.heappush(min_heap, (len(original_src), original_src, rec["mutated"]))


In [5]:
# A heap only guarantees that index 0 holds the minimum; index 1 is just one of
# the root's two children and is not necessarily the runner-up. Ask heapq for
# the two smallest entries explicitly (each entry is
# (length, original source, mutated variants)).
smallest_rec, second_smallest_rec = heapq.nsmallest(2, min_heap)
smallest_embedding = get_embedding(smallest_rec[1])
second_smallest_embedding = get_embedding(second_smallest_rec[1])

# Cosine similarity = 1 - cosine distance.
print(1-spatial.distance.cosine(smallest_embedding, second_smallest_embedding))

0.8893284623067418


# Similarity of the two largest functions

In [7]:
import heapq

# Originals longer than this many characters are discarded after loading.
# NOTE(review): presumably an input-size cutoff for the embedding model - confirm.
MAX_ORIGINAL_CHARS = 8192

# heapq implements a min-heap, so lengths are stored negated: the record with
# the longest original source sits at index 0.
max_heap = []
# Length of the longest original seen, including ones discarded below.
longest = 0
for file_path in mutation_files:
    with open(file_path, "r") as f:
        func_records = json.load(f)
        for rec in func_records:
            longest = max(len(rec["original"]), longest)
            heapq.heappush(max_heap, (-len(rec["original"]), rec["original"], rec["mutated"]))
# Pop over-long entries off the top. The emptiness check stops the loop cleanly
# (instead of raising IndexError) when every record exceeds the cutoff or no
# records were loaded.
while max_heap and max_heap[0][0] < -MAX_ORIGINAL_CHARS:
    heapq.heappop(max_heap)


In [8]:
# A heap only guarantees that index 0 holds the minimum; index 1 is just one of
# the root's two children and is not necessarily the runner-up. Lengths in
# max_heap are negated, so its two smallest entries are the two longest
# functions (each entry is (-length, original source, mutated variants)).
largest_rec, second_largest_rec = heapq.nsmallest(2, max_heap)
largest_embedding = get_embedding(largest_rec[1])
second_largest_embedding = get_embedding(second_largest_rec[1])

print(largest_rec[1])
print(second_largest_rec[1])

# Cosine similarity = 1 - cosine distance.
print(1-spatial.distance.cosine(largest_embedding, second_largest_embedding))

def check_alignment_rfam1(self, alignment):
    """Check the alignment obtained by parsing Rfam record BTnc005."""
    self.assertEqual(alignment.annotations['accession'], 'RF04178')
    self.assertEqual(alignment.annotations['identifier'], 'BTnc005')
    self.assertEqual(alignment.annotations['definition'], 'Bacteroides sRNA BTnc005')
    self.assertEqual(alignment.annotations['author'], ['Prezza, G', 'Ryan, D', 'Mädler, G', 'Barquist, L; 0000-0003-4732-2667', 'Westermann, A'])
    self.assertEqual(alignment.annotations['source of seed'], 'Published; PMID:32678091;')
    self.assertEqual(alignment.annotations['source of structure'], 'Published; PMID:32678091;')
    self.assertEqual(alignment.annotations['gathering method'], '174.80')
    self.assertEqual(alignment.annotations['trusted cutoff'], '179.30')
    self.assertEqual(alignment.annotations['noise cutoff'], '174.30')
    self.assertEqual(alignment.annotations['type'], 'Gene; sRNA;')
    self.assertEqual(alignment.annotations['bu

# Smallest function: Similarity with mutations

In [9]:
# Index 0 of the min-heap is the record with the shortest original source; its
# third element holds that record's mutated variants.
smallest_mutations = min_heap[0][2]
for mut in smallest_mutations:
    mut_embedding = get_embedding(mut)
    print(mut)
    # Cosine similarity = 1 - cosine distance.
    cosine_distance = spatial.distance.cosine(smallest_embedding, mut_embedding)
    print(1 - cosine_distance)

def h():
    if False:
        i = 10
    c
0.9217372162453384
def h():
    if False:
        i = 10
        while True:
            print('Mutation')
    c
0.8681091750245917
def h():
    if False:
        i = 10
        while True:
            print('Mutation')
        n = 10
        dp = [0, 1]
        for i in range(2, n + 1):
            dp.append(dp[i - 1] + dp[i - 2])
        print(dp[n])
    c
0.846585995911853
def h():
    if False:
        i = 10
        while True:
            print('Mutation')
        n = 10
        dp = [0, 1]
        for i in range(2, n + 1):
            dp.append(dp[i - 1] + dp[i - 2])
        print(dp[n])

        def dfs(node):
            if node == None:
                return []
            left = dfs(node.left)
            right = dfs(node.right)
    c
0.8344014595089915
def h():
    if False:
        i = 10
        while True:
            print('Mutation')
        dp = [0, 1]
        for i in range(2, n + 1):
            dp.append(dp[i - 1] + dp[i

# Largest function: Similarity with mutations

In [10]:
# Index 0 of the max-heap is the record with the longest retained original
# source; its third element holds that record's mutated variants.
largest_mutations = max_heap[0][2]
for mut in largest_mutations:
    mut_embedding = get_embedding(mut)
    print(mut)
    # Cosine similarity = 1 - cosine distance.
    cosine_distance = spatial.distance.cosine(largest_embedding, mut_embedding)
    print(1 - cosine_distance)

def check_alignment_rfam1(self, alignment):
    if False:
        i = 10
    'Check the alignment obtained by parsing Rfam record BTnc005.'
    self.assertEqual(alignment.annotations['accession'], 'RF04178')
    self.assertEqual(alignment.annotations['identifier'], 'BTnc005')
    self.assertEqual(alignment.annotations['definition'], 'Bacteroides sRNA BTnc005')
    self.assertEqual(alignment.annotations['author'], ['Prezza, G', 'Ryan, D', 'Mädler, G', 'Barquist, L; 0000-0003-4732-2667', 'Westermann, A'])
    self.assertEqual(alignment.annotations['source of seed'], 'Published; PMID:32678091;')
    self.assertEqual(alignment.annotations['source of structure'], 'Published; PMID:32678091;')
    self.assertEqual(alignment.annotations['gathering method'], '174.80')
    self.assertEqual(alignment.annotations['trusted cutoff'], '179.30')
    self.assertEqual(alignment.annotations['noise cutoff'], '174.30')
    self.assertEqual(alignment.annotations['type'], 'Gene; sRNA;')
    self.assertEqual(