In [1]:
from rouge import Rouge
import json
import os
import pandas as pd
from radon.complexity import cc_visit

# Test radon

In [1]:
from radon.visitors import ComplexityVisitor
v = ComplexityVisitor.from_code('''
def factorial(n):
    if n < 2: return 1
    return n * factorial(n - 1)

def foo(bar):
    return sum(i for i in range(bar ** 2) if bar % i)
''')
v.functions
for f in v.functions:
    print(f, f.complexity)

F 2:0->4 factorial - 2 2
F 6:0->7 foo - 3 3


# Import codesearchnet from huggingface dataset

In [2]:
from datasets import load_dataset

dataset = load_dataset("code_search_net", "python")

Downloading:   0%|          | 0.00/2.60k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading and preparing dataset code_search_net/python (download: 897.32 MiB, generated: 1.62 GiB, post-processed: Unknown size, total: 2.49 GiB) to /home/v-haotiancui/.cache/huggingface/datasets/code_search_net/python/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27...


Downloading:   0%|          | 0.00/941M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset code_search_net downloaded and prepared to /home/v-haotiancui/.cache/huggingface/datasets/code_search_net/python/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27. Subsequent calls will reuse this data.


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})

In [12]:
print(
    dataset["train"]["func_code_string"][0], 
    "\n\n",
    dataset["train"]["func_documentation_string"][0]
    )

def findArgs(args, prefixes):
		"""
		Extracts the list of arguments that start with any of the specified prefix values
		"""
		return list([
			arg for arg in args
			if len([p for p in prefixes if arg.lower().startswith(p.lower())]) > 0
		]) 

 Extracts the list of arguments that start with any of the specified prefix values


# Load from raw dataset

In [16]:
current_dir = os.getcwd()
root_dir = os.path.dirname(current_dir)
data_dir = "/home/v-haotiancui/NL2Code/CodeSearchNet/resources/data/python/final/jsonl/train"

In [6]:
# load jsonl files
files = os.listdir(data_dir)
assert len(files) == 14
data = []
for i in range(14):
    file = f"python_train_{i}.jsonl"
    print(f"loading file {file} ...")
    with open(os.path.join(data_dir, file), "r") as f:
        for line in f:
            data.append(json.loads(line))

loading file python_train_0.jsonl ...
loading file python_train_1.jsonl ...
loading file python_train_2.jsonl ...
loading file python_train_3.jsonl ...
loading file python_train_4.jsonl ...
loading file python_train_5.jsonl ...
loading file python_train_6.jsonl ...
loading file python_train_7.jsonl ...
loading file python_train_8.jsonl ...
loading file python_train_9.jsonl ...
loading file python_train_10.jsonl ...
loading file python_train_11.jsonl ...
loading file python_train_12.jsonl ...
loading file python_train_13.jsonl ...


In [7]:
len(data)

412178

In [9]:
codes = [d["code"] for d in data]
descs = [d["docstring"] for d in data]
for i, desc in enumerate(descs):
    if len(desc) <= 0:
        descs[i] = "NA"
assert len(codes) == len(descs)

In [12]:
print(f"num of examples in set: {len(codes)}")
print(descs[0])

num of examples in set: 412178
Trains a k-nearest neighbors classifier for face recognition.

    :param train_dir: directory that contains a sub-directory for each known person, with its name.

     (View in source code to see train_dir example tree structure)

     Structure:
        <train_dir>/
        ├── <person1>/
        │   ├── <somename1>.jpeg
        │   ├── <somename2>.jpeg
        │   ├── ...
        ├── <person2>/
        │   ├── <somename1>.jpeg
        │   └── <somename2>.jpeg
        └── ...

    :param model_save_path: (optional) path to save model on disk
    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified
    :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree
    :param verbose: verbosity of training
    :return: returns knn classifier that was trained on the given data.


# Filer length and complexity

In [13]:
# filter out docstrings with params
def remove_param(desc):
    lines = desc.split("\n")
    new_lines = [line for line in lines if ":param" not in line]
    new_lines = [line for line in new_lines if ":arg" not in line]
    new_lines = [line for line in new_lines if not line.startswith(":")]
    new_lines = [line for line in new_lines if not line.startswith("@")]
    return new_lines

In [15]:
long_code = []
long_desc = []
complexities = []
for i, code in enumerate(codes):
    d = descs[i]
    d = d[:d.find('>>>')] if d.find('>>>') >= 0 else d
    if len(remove_param(d)) >= 3 and len(code.split("\n")) >= 6 and len(code.split("\n")) <= 30:
        # complexity messure
        code_ = code
        try :
            v = cc_visit(code_)
            if v[0].complexity > 3:
                print(i)
                long_code.append(code_)
                long_desc.append(d)
                complexities.append(v[0].complexity)
        except:
            continue

8
26
29
41
48
61
64
108
125
126
136
160
165
176
208
240
251
278
281
330
426
427
446
467
478
514
536
564
565
599
603
606
622
626
633
634
644
650
662
676
677
688
694
742
745
763
772
779
784
787
792
806
810
831
854
870
875
901
905
913
919
923
925
936
958
968
981
994
999
1035
1047
1077
1090
1097
1133
1150
1154
1168
1250
1259
1264
1274
1294
1297
1304
1328
1340
1345
1377
1396
1398
1401
1422
1438
1449
1450
1453
1458
1479
1492
1498
1499
1501
1511
1513
1517
1518
1519
1521
1528
1532
1541
1567
1571
1614
1626
1635
1650
1654
1658
1673
1679
1682
1687
1710
1713
1714
1721
1725
1732
1739
1756
1758
1763
1769
1770
1773
1797
1873
1875
1877
1880
1896
1930
1992
2004
2009
2020
2022
2024
2025
2032
2098
2105
2108
2142
2157
2166
2175
2195
2202
2206
2222
2246
2258
2259
2273
2278
2311
2331
2353
2362
2379
2388
2749
2796
2810
2811
2841
2912
2916
2918
2937
2965
3037
3152
3175
3179
3180
3183
3185
3186
3190
3191
3213
3220
3260
3261
3271
3274
3278
3279
3280
3293
3297
3305
3311
3315
3318
3319
3324
3331
3332
3338
3339
33

In [17]:
print(f"num of examples in long code set: {len(long_code)}")
assert len(long_code) == len(long_desc)

num of examples in long code set: 37610


# Store data in csv

In [18]:
rouge = Rouge()
rouge_score = rouge.get_scores(long_desc, long_code)

# %%
rouge1r = [d["rouge-1"]["r"] for d in rouge_score]
rouge1f = [d["rouge-1"]["f"] for d in rouge_score]

df = pd.DataFrame(
    {
        "description": long_desc,
        "reference_code": long_code,
        "complexity": complexities,
        "rouge-1-r": rouge1r,
        "rouge-1-f": rouge1f
    }
)
df.to_csv(os.path.join(root_dir, "filtered_codesearchnet.train.csv"), index=True)