-
Notifications
You must be signed in to change notification settings - Fork 3
/
extract_gpt3_outputs.py
59 lines (50 loc) · 1.73 KB
/
extract_gpt3_outputs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import json
from extract_text import extract_python_code, extract_text_from_comments_and_strings
import re
import os
from pathlib import Path
def extract_python_code(text, key):
    """Extract the contents of the first fenced code block in *text*.

    This is the variant written for outputs that are expected to contain a
    fenced (```) code block.

    Args:
        text: Raw model output as a single string.
        key: Identifier string; any line containing it is skipped entirely
            (this check runs before fence detection, so a fence line that
            contains the key is not treated as a fence). An empty or falsy
            key disables this filter instead of matching every line.

    Returns:
        A ``(lines, succeeded)`` tuple. When at least one line was collected
        from inside a code block, ``lines`` is that list of lines and
        ``succeeded`` is True. Otherwise ``lines`` is every line of *text*
        (unfiltered) and ``succeeded`` is False.
    """
    lines = text.split('\n')
    python_code = []
    recording = False
    for line in lines:
        # Drop lines mentioning the key. Guard against an empty key, since
        # '' is a substring of every line and would discard the whole text.
        if key and key in line:
            continue
        if line.startswith('```'):
            if recording:
                # Closing fence: only the first code block is extracted.
                break
            # Opening fence (possibly with a language tag): start recording.
            recording = True
            continue
        if recording:
            python_code.append(line)
    # A block with no closing fence simply runs to the end of the text and
    # is still returned as extracted code.
    if not python_code:
        return lines, False
    return python_code, True
if __name__ == "__main__":
    # Script entry point: read every raw output in gpt3_rawoutputs/, pull out
    # the code block, and write it under gpt3_code/ using the filename
    # recorded for that key in github_code_for_recoding_info.json.
    gpt3_files = list(Path('gpt3_rawoutputs').glob('*.txt'))
    print(f'Loaded {len(gpt3_files)} recoded files')

    outdir = 'gpt3_code'
    os.makedirs(outdir, exist_ok=True)

    with open('github_code_for_recoding_info.json', 'r') as f:
        github_info = json.load(f)

    for file in gpt3_files:
        # NOTE(review): read with the platform default encoding, as before;
        # confirm how the raw outputs were written before pinning one.
        text = file.read_text()
        # The raw-output file stem is the lookup key into the info JSON.
        key = file.stem
        original_filename = github_info[key]['filename']
        filename = original_filename.replace('github_code', outdir)
        # this was missed in the earlier filter, so we remove them here
        if 'Generated by Django' in text:
            print('Skipping Django file')
            continue
        code, succeeded = extract_python_code(text, key)
        if not succeeded:
            # On failure `code` holds every line of the raw output, so the
            # full text is written out below.
            print(f'No code found for {filename} {key}, returning full output')
        # Never overwrite the original source file. This is an explicit
        # check rather than an assert so it still runs under `python -O`.
        if filename == original_filename:
            raise ValueError(
                f'Output path {filename!r} for {key} is the same as the '
                'original file; refusing to overwrite it')
        # The mapped filename may include directories below outdir that do
        # not exist yet.
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        with open(filename, 'w') as f:
            f.write('\n'.join(code))