In [None]:
!pip install -q datasets transformers accelerate

## Load dataset

In [None]:
from datasets import load_dataset, concatenate_datasets, load_from_disk

In [None]:
# Fraction of rows that split_dataset() moves from 'train' into 'test'.
train_test_split = 0.1

# Class ids stored in the 'label' column.
label_human, label_generated = 0, 1

# 'man_python' configuration of the koutch/staqc dataset.
dataset = load_dataset("koutch/staqc", 'man_python')


def convert_item(item, label=label_human):
    """Flatten one raw record into the question_id/question/text/label layout.

    'text' is the first entry of the record's snippet texts and 'label' is the
    class id passed in (the human label unless overridden).
    """
    converted = {key: item[key] for key in ('question_id', 'question')}
    converted['text'] = item['snippet']['text'][0]
    converted['label'] = label
    return converted


def preprocess_dataset(dataset):
    """Sort by question id, convert every record with convert_item, then shuffle.

    The raw 'snippet' column is dropped during the map step; the final shuffle
    uses the fixed seed 42618.
    """
    ordered = dataset.sort('question_id')
    converted = ordered.map(convert_item, remove_columns=['snippet'])
    return converted.shuffle(42618)


def split_dataset(dataset, test_fraction=None, seed=42618):
    """Move a random fraction of ``dataset['train']`` into ``dataset['test']``.

    The train rows are shuffled before being cut. Taking the leading rows
    without shuffling yields a single-class test split whenever 'train' was
    built by concatenating all rows of one class followed by all rows of the
    other.

    NOTE(review): rows are split individually, so two rows sharing a
    question_id can end up on different sides of the split.

    Args:
        dataset: mapping whose 'train' entry supports ``len()``,
            ``shuffle(seed=...)`` and ``select(indices)``; modified in place.
        test_fraction: share of rows that go to 'test'. ``None`` falls back to
            the notebook-level ``train_test_split`` constant.
        seed: seed for the shuffle, so the split is reproducible.

    Returns:
        The same ``dataset`` object with 'train' and 'test' set.
    """
    if test_fraction is None:
        test_fraction = train_test_split

    shuffled = dataset['train'].shuffle(seed=seed)
    n = len(shuffled)
    splitsep = int(n * test_fraction)

    dataset['test'] = shuffled.select(range(splitsep))
    dataset['train'] = shuffled.select(range(splitsep, n))

    return dataset


# Run the sort/convert/shuffle pipeline on the loaded data. The result replaces
# `dataset`, so re-running this cell would feed already-converted rows back in.
dataset = preprocess_dataset(dataset)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['question_id', 'question', 'text', 'label'],
        num_rows: 2052
    })
})


In [None]:
# Peek at the first converted training row.
q0 = dataset['train'][0]
q0

{'question_id': 21219262,
 'question': 'Defining the name of a ManyToOne relationship in Django',
 'text': "class SpecificUserProfile(UserProfile):\n    referrer = models.ForeignKey('self', related_name='referred')\n",
 'label': 0}

## Generate dataset

In [None]:
from transformers import pipeline, set_seed, AutoModelForCausalLM, AutoTokenizer

# Causal LM used to produce the machine-generated answers.
checkpoint = "Salesforce/codegen-350M-mono"
# device_map="auto" delegates placement of the model weights to accelerate.
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
# Pad on the left so each prompt stays adjacent to its continuation in a batch.
# No device_map here: a tokenizer has no weights to place on a device.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side='left')
print(model.device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


cuda:0


In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
# Use the model's EOS id as the padding id so prompts of different lengths can
# be batched (presumably the tokenizer ships without a pad token — TODO confirm).
generator.tokenizer.pad_token_id = model.config.eos_token_id
# Seed the RNGs once here; any sampling in later cells continues from this state.
set_seed(42618)

In [None]:
n_batch = 8


def preproc(s):
    """Turn a question title into a prompt: a '# ' comment line plus a blank line."""
    return f'# {s}\n\n'


# Prompts for the first n_batch questions, used as a quick smoke test.
# One slice fetches the whole column chunk instead of one row lookup per index.
texts = [preproc(q) for q in dataset['train'][:n_batch]['question']]
print(''.join(texts))

# Defining the name of a ManyToOne relationship in Django

# How to parse the header files of the pcap file?

# Joining fields values

# How to avoid defining a variable to hold a function result which might be needed only once

# Parsing structured text file in python

# django-mptt get_descendants for a list of nodes

# dict of internal keys

# Efficiently detect sign-changes in python




In [None]:
# Smoke test: request one generated sequence per prompt for the small batch.
results = generator(texts, max_length=256, num_return_sequences=1, batch_size=n_batch)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
def preproc_answer(s):
    """Drop everything before the first newline of ``s`` and strip the remainder.

    Raises ValueError when ``s`` contains no newline.
    """
    first_newline = s.index('\n')
    remainder = s[first_newline:]
    return remainder.strip()


# Show each smoke-test prompt next to its cleaned continuation.
for i, (prompt, candidates) in enumerate(zip(texts, results)):
    answer = preproc_answer(candidates[0]['generated_text'])

    print(f'{i}. Question:', prompt.strip())
    print(f'{i}. Answer:\n' + answer)

0. Question: # Defining the name of a ManyToOne relationship in Django
0. Answer:
class Book(models.Model):
    title = models.CharField(max_length=255)
    desc = models.TextField()
    uploaded_by = models.ForeignKey(User, related_name="books_uploaded", on_delete=models.CASCADE)
    uploaded_on = models.DateTimeField(auto_now_add=True)


class Author(models.Model):
    name = models.CharField(max_length=255)
    birth = models.DateField(null=True, blank=True)
    description = models.TextField()
1. Question: # How to parse the header files of the pcap file?
1. Answer:
from scapy.config import conf
from scapy.packet import *
from random import randrange
from sys import argv

# Choose the file of the data.
mode = 'w' if len(argv) > 1 else 'r'
if len(argv) > 1:
    dir_ = argv[1]
    fname = argv[2] + '.pcap'
else:
    dir_ = '.'
    fname = './' + argv[0].lower()

def read_lines(file, mode=mode):
    """
    Read lines from a log file and convert them to a list of packets.

    The par

In [None]:
# Generate answers, this will take some time
batch_size = 64
max_length = 256

# Re-seed right before the real run so the generated data does not depend on
# how many exploratory generations were executed earlier in the session.
set_seed(42618)

questions = [preproc(q) for q in dataset['train']['question']]
results = generator(
    questions,
    max_length=max_length,
    num_return_sequences=1,
    batch_size=batch_size,
    # Passing the pad id explicitly avoids the per-batch
    # "Setting `pad_token_id` to `eos_token_id`" log line.
    pad_token_id=generator.tokenizer.pad_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [None]:
def assign_generated(item, idx):
    """Return ``item`` relabelled as generated, with the model's answer as text.

    Intended for ``Dataset.map(..., with_indices=True)``: ``idx`` is the row
    index and selects the matching entry of the notebook-level ``results``
    list, so ``results`` must be in the same order as the mapped dataset.
    """
    # Use the shared constant rather than a literal 1 so the label scheme is
    # defined in one place.
    item['label'] = label_generated
    item['text'] = preproc_answer(results[idx][0]['generated_text'])

    return item

# Build a generated twin of every human row: same question, generated text, label 1.
train_generated = dataset['train'].map(assign_generated, with_indices=True)
# Human rows come first, generated rows second. Re-running this cell would
# append the generated rows again because 'train' is rebound to the result.
dataset['train'] = concatenate_datasets([dataset['train'], train_generated])

print(dataset['train'][0])
print(train_generated[0])

# Sanity check: the combined set is exactly twice the size of the generated set.
assert len(train_generated) * 2 == len(dataset['train'])

# Labels of the first and the last row of the combined set.
print(dataset['train'][0]['label'])
print(dataset['train'][-1]['label'])

{'question_id': 21219262, 'question': 'Defining the name of a ManyToOne relationship in Django', 'text': "class SpecificUserProfile(UserProfile):\n    referrer = models.ForeignKey('self', related_name='referred')\n", 'label': 0}
{'question_id': 21219262, 'question': 'Defining the name of a ManyToOne relationship in Django', 'text': 'def M2M_field_m2m(instance, m2m):\n    class M2MField(ForeignObject):\n        related_model = instance.__class__\n        # https://docs.python.org/3/tutorial/datastructures.html#string-and-unicode-string\n        # https://docs.python.org/2/reference/lexical_analysis.html#string-and-unicode-string-repr\n        # https://docs.python.org/2/howto/descriptor-hints.html#dynamic-and-static-class-attributes\n        # https://blog.miguelgrinberg.com/post/using-precursors-to-pythonic-interpreter.html\n\n        def __init__(self, related_model, to, from_fields, to_fields, **kwargs):\n            # to_fields[0] is the object, not a field, as we were just dealing 

In [None]:
# Carve the 'test' split out of 'train' and show the resulting row counts.
dataset = split_dataset(dataset)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['question_id', 'question', 'text', 'label'],
        num_rows: 3694
    })
    test: Dataset({
        features: ['question_id', 'question', 'text', 'label'],
        num_rows: 410
    })
})


In [None]:
# Print the column schema of each split for comparison.
print(dataset['train'].features)
print(dataset['test'].features)

{'question_id': Value(dtype='int32', id=None), 'question': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}
{'question_id': Value(dtype='int32', id=None), 'question': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}


In [None]:
# Save
# Remove any previous export first so stale files cannot linger in the folder.
!rm -rf staqc_man_python_codegen.hf
dataset.save_to_disk("staqc_man_python_codegen.hf")
# Rebuild the zip archive of the saved folder from scratch.
!rm -f staqc_man_python_codegen.hf.zip
!zip -r staqc_man_python_codegen.hf.zip staqc_man_python_codegen.hf

Saving the dataset (0/1 shards):   0%|          | 0/3694 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/410 [00:00<?, ? examples/s]

  adding: staqc_man_python_codegen.hf/ (stored 0%)
  adding: staqc_man_python_codegen.hf/dataset_dict.json (stored 0%)
  adding: staqc_man_python_codegen.hf/train/ (stored 0%)
  adding: staqc_man_python_codegen.hf/train/data-00000-of-00001.arrow (deflated 64%)
  adding: staqc_man_python_codegen.hf/train/state.json (deflated 41%)
  adding: staqc_man_python_codegen.hf/train/dataset_info.json (deflated 54%)
  adding: staqc_man_python_codegen.hf/test/ (stored 0%)
  adding: staqc_man_python_codegen.hf/test/data-00000-of-00001.arrow (deflated 61%)
  adding: staqc_man_python_codegen.hf/test/state.json (deflated 42%)
  adding: staqc_man_python_codegen.hf/test/dataset_info.json (deflated 54%)


In [None]:
# Test save
# Reload from disk and print the splits plus one training text as a round-trip check.
ds = load_from_disk('staqc_man_python_codegen.hf')
print(ds)
print(ds['train'][0]['text'])

DatasetDict({
    train: Dataset({
        features: ['question_id', 'question', 'text', 'label'],
        num_rows: 3694
    })
    test: Dataset({
        features: ['question_id', 'question', 'text', 'label'],
        num_rows: 410
    })
})
offset = 0
data1 = np.memmap('tmp', dtype='i', mode='r+', order='F',
                  offset=0, shape=(size1))
offset += size1*byte_size
data2 = np.memmap('tmp', dtype='i', mode='r+', order='F',
                  offset=offset, shape=(size2))
offset += size1*byte_size
data3 = np.memmap('tmp', dtype='i', mode='r+', order='F',
                  offset=offset, shape=(size3))

