In [1]:
from rouge import Rouge
import json
import os
import pandas as pd
from radon.complexity import cc_visit

# Test radon

In [2]:
from radon.visitors import ComplexityVisitor
v = ComplexityVisitor.from_code('''
def factorial(n):
    if n < 2: return 1
    return n * factorial(n - 1)

def foo(bar):
    return sum(i for i in range(bar ** 2) if bar % i)
''')
v.functions
for f in v.functions:
    print(f, f.complexity)

F 2:0->4 factorial - 2 2
F 6:0->7 foo - 3 3


In [3]:
v.functions[0].complexity

2

# Read files

In [6]:
# read the train/valid/test data
# DATASET = 'valid'
DATASET = 'train'

current_dir = os.getcwd()
root_dir = os.path.dirname(current_dir)
data_file = os.path.join(root_dir, "parallel-code-docstring", f"restored_data.{DATASET}.jsonl")
data_file

'/home/v-haotiancui/NL2Code/Copilot-2/dataset/parallel-code-docstring/restored_data.train.jsonl'

In [7]:
with open(data_file, "r") as f:
    data = [json.loads(line) for line in f]

codes = [d["code"] for d in data]
descs = [d["desc"] for d in data]
for i, desc in enumerate(descs):
    if len(desc) <= 0:
        descs[i] = "NA"
assert len(codes) == len(descs)

In [11]:
print(f"num of examples in {DATASET} set: {len(codes)}")
print(codes[0])

num of examples in train set: 109108
def get_flashed_messages(with_categories=False, category_filter=[]): 
    flashes = _request_ctx_stack.top.flashes 
   if (flashes is None): 
      _request_ctx_stack.top.flashes = flashes = (session.pop('_flashes') if ('_flashes' in session) else []) 
   if category_filter: 
      flashes = list(filter((lambda f: (f[0] in category_filter)), flashes)) 
   if (not with_categories): 
      return [x[1] for x in flashes] 
   return flashes


# Filer length and complexity

In [12]:
# filter out docstrings with params
def remove_param(desc):
    lines = desc.split("\n")
    new_lines = [line for line in lines if ":param" not in line]
    new_lines = [line for line in new_lines if ":arg" not in line]
    new_lines = [line for line in new_lines if not line.startswith(":")]
    new_lines = [line for line in new_lines if not line.startswith("@")]
    return new_lines

In [13]:
long_code = []
long_desc = []
complexities = []
for i, code in enumerate(codes):
    d = descs[i]
    d = d[:d.find('>>>')] if d.find('>>>') >= 0 else d
    if len(remove_param(d)) >= 4 and len(code.split("\n")) >= 6 and len(code.split("\n")) <= 30:
        # complexity messure
        code_ = code[code.find('def'):]
        idx = code_.find('\n')
        code_ = code_[:idx+1] + code_[idx+2:] # a temporary fix
        try :
            v = cc_visit(code_)
            if v[0].complexity > 3:
                print(i)
                long_code.append(code_)
                long_desc.append(d)
                complexities.append(v[0].complexity)
        except:
            continue

0
5
8
12
21
27
80
81
85
116
125
128
138
156
184
189
192
195
200
201
215
220
225
231
235
238
255
256
257
274
279
285
292
308
310
317
318
330
333
337
339
344
361
363
372
378
382
384
385
388
390
410
411
412
413
430
432
441
446
447
451
463
469
475
495
504
526
530
538
545
569
570
575
594
600
609
624
629
631
632
633
652
674
686
698
716
718
723
725
727
732
734
739
749
758
766
768
774
778
781
787
788
792
794
797
798
804
818
821
824
827
841
844
857
861
864
869
872
881
907
909
911
915
927
936
937
938
951
952
957
961
968
971
994
1005
1008
1026
1028
1032
1047
1048
1054
1057
1072
1078
1079
1081
1084
1090
1102
1143
1150
1153
1156
1161
1166
1171
1194
1203
1215
1225
1233
1243
1263
1268
1271
1272
1306
1317
1322
1336
1344
1356
1357
1361
1364
1371
1387
1389
1395
1405
1413
1424
1434
1437
1444
1463
1465
1469
1471
1476
1479
1485
1486
1507
1509
1516
1517
1524
1549
1567
1568
1581
1582
1587
1588
1591
1596
1598
1608
1610
1618
1620
1622
1630
1631
1632
1642
1643
1648
1654
1660
1671
1672
1685
1691
1698
1705
1706
1

In [14]:
print(f"num of examples in long code set: {len(long_code)}")
assert len(long_code) == len(long_desc)

num of examples in long code set: 13437


# Store data in csv

In [15]:
rouge = Rouge()
rouge_score = rouge.get_scores(long_desc, long_code)

# %%
rouge1r = [d["rouge-1"]["r"] for d in rouge_score]
rouge1f = [d["rouge-1"]["f"] for d in rouge_score]

In [16]:
df = pd.DataFrame(
    {
        "description": long_desc,
        "reference_code": long_code,
        "complexity": complexities,
        "rouge-1-r": rouge1r,
        "rouge-1-f": rouge1f
    }
)
df.to_csv(os.path.join(root_dir, "long_code_desc.train.csv"), index=True)

In [17]:
len(long_code)

13437

# Store in json for PyMT5

In [15]:
contents = []
for i in range(len(long_code)):
    contents.append(
        {
            "nl":  '# target docstring style numpydoc\n' + long_code[i],
            "code": long_desc[i],
            "url": "na",
        }
    )
output_file = os.path.join(root_dir, 
    "code2text/DeepDiv/raw/code-docstring-corpus/test.json")
with open(output_file, 'w') as f:
    json.dump(contents, f)

FileNotFoundError: [Errno 2] No such file or directory: 'code2text/DeepDiv/raw/code-docstring-corpus/test.json'

# Running Examples

In [None]:
example1 = """def shorten_paths(path_list, is_unsaved): 
   path_list = path_list[:] 
   new_path_list = [] 
   for (ii, (path, is_unsav)) in enumerate(zip(path_list, is_unsaved)): 
      if is_unsav: 
         new_path_list.append(_('unsaved   file')) 
         path_list[ii] = None 
      else: 
         (drive, path) = osp.splitdrive(osp.dirname(path)) 
         new_path_list.append((drive + osp.sep)) 
         path_list[ii] = [part for part in path.split(osp.sep) if part] 
   def recurse_level(level_idx): 
      sep = os.sep 
      if (not any(level_idx.values())): 
         return 
      sample_toks = list(level_idx.values())[0] 
      if (not sample_toks): 
         s = 0 
      else: 
         for (s, sample_val) in enumerate(sample_toks): 
            if (not all((((len(toks) > s) and (toks[s] == sample_val)) for toks in level_idx.values()))): 
               break 
      if (s == 0): 
         short_form = '' 
      else: 
         if (s == 1): 
            short_form = sample_toks[0] 
         elif (s == 2): 
            short_form = ((sample_toks[0] + sep) + sample_toks[1]) 
         else: 
            short_form = (('...' + sep) + sample_toks[(s - 1)]) 
         for idx in level_idx: 
            new_path_list[idx] += (short_form + sep) 
            level_idx[idx] = level_idx[idx][s:] 
      while level_idx: 
         (k, group) = (0, level_idx) 
         while True: 
            prospective_group = {idx: toks for (idx, toks) in group.items() if (len(toks) == k)} 
            if prospective_group: 
               if (k == 0): 
                  group = prospective_group 
               break 
            (_, sample_toks) = next(iteritems(group)) 
            prospective_group = {idx: toks for (idx, toks) in group.items() if (toks[k] == sample_toks[k])} 
            if ((len(prospective_group) == len(group)) or (k == 0)): 
               group = prospective_group 
               k += 1 
            else: 
               break 
         (_, sample_toks) = next(iteritems(group)) 
         if (k == 0): 
            short_form = '' 
         elif (k == 1): 
            short_form = sample_toks[0] 
         elif (k == 2): 
            short_form = ((sample_toks[0] + sep) + sample_toks[1]) 
         else: 
            short_form = (((sample_toks[0] + '...') + sep) + sample_toks[(k - 1)]) 
         for idx in group.keys(): 
            new_path_list[idx] += (short_form + (sep if (k > 0) else '')) 
            del level_idx[idx] 
         recurse_level({idx: toks[k:] for (idx, toks) in group.items()}) 
   recurse_level({i: pl for (i, pl) in enumerate(path_list) if pl}) 
   return [path.rstrip(os.sep) for path in new_path_list]
"""
example1_desc = """Takes a list of paths and tries to ""intelligently"" shorten them all. The 
 aim is to make it clear to the user where the paths differ, as that is 
 likely what they care about. Note that this operates on a list of paths 
 not on individual paths. 
 If the path ends in an actual file name, it will be trimmed off."""

In [None]:
"""def build_title(title_dict, canonical=None, canonicalSeries=None, canonicalEpisode=None, ptdf=0, lang=None, _doYear=1, _emptyString=u'', appendKind=True): 
   if (canonical is not None): 
      canonicalSeries = canonical 
   pre_title = _emptyString 
   kind = title_dict.get('kind') 
   episode_of = title_dict.get('episode   of') 
   if ((kind == 'episode') and (episode_of is not None)): 
      doYear = 0 
      if ptdf: 
         doYear = 1 
      if (not isinstance(episode_of, (dict, _Container))): 
         episode_of = {'title': episode_of, 'kind': 'tv   series'} 
         if ('series   year' in title_dict): 
            episode_of['year'] = title_dict['series   year'] 
      pre_title = build_title(episode_of, canonical=canonicalSeries, ptdf=0, _doYear=doYear, _emptyString=_emptyString) 
      ep_dict = {'title': title_dict.get('title', ''), 'imdbIndex': title_dict.get('imdbIndex')} 
      ep_title = ep_dict['title'] 
      if (not ptdf): 
         doYear = 1 
         ep_dict['year'] = title_dict.get('year', '????') 
         if ((ep_title[0:1] == '(') and (ep_title[(-1):] == ')') and ep_title[1:5].isdigit()): 
            ep_dict['title'] = _convertTime(ep_title, fromPTDFtoWEB=1, _emptyString=_emptyString) 
      else: 
         doYear = 0 
         if ep_title.startswith('Episode   dated'): 
            ep_dict['title'] = _convertTime(ep_title, fromPTDFtoWEB=0, _emptyString=_emptyString) 
      episode_title = build_title(ep_dict, canonical=canonicalEpisode, ptdf=ptdf, _doYear=doYear, _emptyString=_emptyString) 
      if ptdf: 
         oad = title_dict.get('original   air   date', _emptyString) 
         if ((len(oad) == 10) and (oad[4] == '-') and (oad[7] == '-') and (episode_title.find(oad) == (-1))): 
            episode_title += ('   (%s)' % oad) 
         seas = title_dict.get('season') 
         if (seas is not None): 
            episode_title += ('   (#%s' % seas) 
            episode = title_dict.get('episode') 
            if (episode is not None): 
               episode_title += ('.%s' % episode) 
            episode_title += ')' 
         episode_title = ('{%s}' % episode_title) 
      return (_emptyString + ('%s   %s' % ((_emptyString + pre_title), (_emptyString + episode_title)))) 
   title = title_dict.get('title', '') 
   imdbIndex = title_dict.get('imdbIndex', '') 
   if (not title): 
      return _emptyString 
   if (canonical is not None): 
      if canonical: 
         title = canonicalTitle(title, lang=lang, imdbIndex=imdbIndex) 
      else: 
         title = normalizeTitle(title, lang=lang) 
   if pre_title: 
      title = ('%s   %s' % (pre_title, title)) 
   if (kind in (u'tv   series', u'tv   mini   series')): 
      title = ('""%s""' % title) 
   if _doYear: 
      year = (title_dict.get('year') or '????') 
      if isinstance(_emptyString, str): 
         year = str(year) 
      imdbIndex = title_dict.get('imdbIndex') 
      if (not ptdf): 
         if (imdbIndex and ((canonical is None) or canonical)): 
            title += ('   (%s)' % imdbIndex) 
         title += ('   (%s)' % year) 
      else: 
         title += ('   (%s' % year) 
         if (imdbIndex and ((canonical is None) or canonical)): 
            title += ('/%s' % imdbIndex) 
         title += ')' 
   if (appendKind and kind): 
      if (kind == 'tv   movie'): 
         title += '   (TV)' 
      elif (kind == 'video   movie'): 
         title += '   (V)' 
      elif (kind == 'tv   mini   series'): 
         title += '   (mini)' 
      elif (kind == 'video   game'): 
         title += '   (VG)' 
   return title"""


"""Given a dictionary that represents a ""long"" IMDb title, 
 return a string. 
 If canonical is None (default), the title is returned in the stored style. 
 If canonical is True, the title is converted to canonical style. 
 If canonical is False, the title is converted to normal format. 
 lang can be used to specify the language of the title. 
 If ptdf is true, the plain text data files format is used."""

'Given a dictionary that represents a ""long"" IMDb title, \n return a string. \n If canonical is None (default), the title is returned in the stored style. \n If canonical is True, the title is converted to canonical style. \n If canonical is False, the title is converted to normal format. \n lang can be used to specify the language of the title. \n If ptdf is true, the plain text data files format is used.'

In [None]:
"""def nC(n, k=None, replacement=False): 
   from sympy.functions.combinatorial.factorials import binomial 
   from sympy.core.mul import prod 
   if isinstance(n, SYMPY_INTS): 
      if (k is None): 
         if (not replacement): 
            return (2 ** n) 
         return sum((nC(n, i, replacement) for i in range((n + 1)))) 
      if (k < 0): 
         raise ValueError('k   cannot   be   negative') 
      if replacement: 
         return binomial(((n + k) - 1), k) 
      return binomial(n, k) 
   if isinstance(n, _MultisetHistogram): 
      N = n[_N] 
      if (k is None): 
         if (not replacement): 
            return prod(((m + 1) for m in n[_M])) 
         return sum((nC(n, i, replacement) for i in range((N + 1)))) 
      elif replacement: 
         return nC(n[_ITEMS], k, replacement) 
      elif (k in (1, (N - 1))): 
         return n[_ITEMS] 
      elif (k in (0, N)): 
         return 1 
      return _AOP_product(tuple(n[_M]))[k] 
   else: 
      return nC(_multiset_histogram(n), k, replacement)"""


"""Return the number of combinations of ``n`` items taken ``k`` at a time. 
 Possible values for ``n``:: 
 integer - set of length ``n`` 
 sequence - converted to a multiset internally 
 multiset - {element: multiplicity} 
 If ``k`` is None then the total of all combinations of length 0 
 through the number of items represented in ``n`` will be returned. 
 If ``replacement`` is True then a given item can appear more than once 
 in the ``k`` items. (For example, for \'ab\' sets of 2 would include \'aa\', 
 \'ab\', and \'bb\'.) The multiplicity of elements in ``n`` is ignored when 
 ``replacement`` is True but the total number of elements is considered 
 since no element can appear more times than the number of elements in 
 ``n``. 
 Examples 
 """

"Return the number of combinations of ``n`` items taken ``k`` at a time. \n Possible values for ``n``:: \n integer - set of length ``n`` \n sequence - converted to a multiset internally \n multiset - {element: multiplicity} \n If ``k`` is None then the total of all combinations of length 0 \n through the number of items represented in ``n`` will be returned. \n If ``replacement`` is True then a given item can appear more than once \n in the ``k`` items. (For example, for 'ab' sets of 2 would include 'aa', \n 'ab', and 'bb'.) The multiplicity of elements in ``n`` is ignored when \n ``replacement`` is True but the total number of elements is considered \n since no element can appear more times than the number of elements in \n ``n``. \n Examples \n "