In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, PretrainedConfig

In [52]:
# model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")
# tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
tokenizer = AutoTokenizer.from_pretrained("microsoft/CodeGPT-small-py-adaptedGPT2")
model = AutoModelForCausalLM.from_pretrained("microsoft/CodeGPT-small-py-adaptedGPT2", pad_token_id=tokenizer.eos_token_id)

# config = PretrainedConfig(model="distilgpt2", pad_token_id=50256)
# text_generator = pipeline("text-generation", model="distilgpt2", config=config)

Downloading:   0%|          | 0.00/720 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/358 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/177 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Downloading:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [53]:
def read_samples():
    with open("samples.txt") as f:
        content = f.readlines()
    data = {
        "input": [],
        "output": []
    }
    last = "output"
    for i, line in enumerate(content):
        if line.strip() == "":
            continue
        if line.startswith("Input:"):
            if last != "output":
                raise Exception("Input on line %i does not match a previous output" % i)
            last = "input"
        elif line.startswith("Output:"):
            if last != "input":
                raise Exception("Input on line %i does not match a previous output" % i)
            last = "output"
        else:
            data[last].append(line.strip())
    
    return list(zip(data["input"], data["output"]))
samples = read_samples()
samples[1]

('pandas.DataFrame.groupby¶ ... Group DataFrame using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, ...',
 'pandas.DataFrame.groupby')

In [49]:
def build_context(samples):
    context = ""
    for (input_, output) in samples:
        context += "Input: " + input_ + "\n"
        context += "Output: " + output + "\n"
        context += "\n"
    return context
print(build_context(samples))

Input: For DataFrame objects, a string indicating either a column name or an index level name to be used to group. df.groupby('A') is just syntactic sugar for df.groupby(df
Output: df.groupby('A') df.groupby(df

Input: pandas.DataFrame.groupby¶ ... Group DataFrame using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, ...
Output: pandas.DataFrame.groupby

Input: In this tutorial, you'll learn how to work adeptly with the Pandas GroupBy facility while mastering ways to manipulate, transform, and summarize data. You'll work ...
Output: None

Input: Pandas dataframe.groupby() function is used to split the data into groups based on some criteria. pandas objects can be split on any of their axes.
Output: dataframe.groupby()

Input: There's probably a slicker way to do this but this works: def reindex_by_date(df): dates = pd.date_range(df.index.min(), df.index.max()) return
Output: def reindex_by_date(df): dates = pd.date_range(df.ind

In [54]:
PADDING_TEXT = build_context(samples[0:10])

prompt = r"""Input: Series. For data-only list. By passing a list type object to the first argument of each constructor pandas.DataFrame() and
Output: """

inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
outputs = model.generate(inputs, max_length=len(inputs[0]) + 20, do_sample=True, num_return_sequences=5)

prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))

print("Output:\n" + 100 * '-')
for i, output in enumerate(outputs):
    generated = tokenizer.decode(output)[prompt_length:]
    print("{}: {}".format(i, generated))

# print(text_generator(PADDING_TEXT + prompt, max_length=len(PADDING_TEXT + prompt) + 50, do_sample=False))
# %time print(text_generator(PADDING_TEXT + prompt, max_length=1024, truncation=True))


Output:
----------------------------------------------------------------------------------------------------
0: type is "list". For example: if an int, you can simply give "int
1: d.Series([["i_id1", "i_id2"]])
2:  : 1}**, 'gender' : 2, 'cant'
3: tr(item,k)] for item in item} | cat for item in item
4: p.arange(df.shape[0], dtype=int) """ #


In [58]:
nlp = pipeline("question-answering")

In [65]:
def build_questions_context(samples):
    context = ""
    for (input_, output) in samples:
        context += "The piece of in \"" + input_ + "\" is " + output + "\n"
        context += "\n"
    return context
print(build_questions_context(samples))

The piece of in "For DataFrame objects, a string indicating either a column name or an index level name to be used to group. df.groupby('A') is just syntactic sugar for df.groupby(df" is df.groupby('A') df.groupby(df

The piece of in "pandas.DataFrame.groupby¶ ... Group DataFrame using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, ..." is pandas.DataFrame.groupby

The piece of in "In this tutorial, you'll learn how to work adeptly with the Pandas GroupBy facility while mastering ways to manipulate, transform, and summarize data. You'll work ..." is None

The piece of in "Pandas dataframe.groupby() function is used to split the data into groups based on some criteria. pandas objects can be split on any of their axes." is dataframe.groupby()

The piece of in "There's probably a slicker way to do this but this works: def reindex_by_date(df): dates = pd.date_range(df.index.min(), df.index.max()) return" is def reindex_by_date(df)

In [68]:
result = nlp(
    question="What is the piece of code in \"Series. For data-only list. By passing a list type object to the first argument of each constructor pandas.DataFrame().foo() and\"?",
    context=build_questions_context(samples)
)
result

{'score': 0.24871328473091125,
 'start': 3368,
 'end': 3418,
 'answer': '"What is the best way to filter a Java Collection?'}

In [70]:
from transformers import AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base-mlm")
model = AutoModelForMaskedLM.from_pretrained("microsoft/codebert-base-mlm")



Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [82]:
code_example = build_context(samples[0:1]) + prompt + "<mask>"
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

outputs = fill_mask(code_example)
print("\n".join([ str(x) for x in outputs ]))

{'sequence': "Input: For DataFrame objects, a string indicating either a column name or an index level name to be used to group. df.groupby('A') is just syntactic sugar for df.groupby(df\nOutput: df.groupby('A') df.groupby(df\n\nInput: Series. For data-only list. By passing a list type object to the first argument of each constructor pandas.DataFrame() and\nOutput: List", 'score': 0.40711578726768494, 'token': 9527, 'token_str': ' List'}
{'sequence': "Input: For DataFrame objects, a string indicating either a column name or an index level name to be used to group. df.groupby('A') is just syntactic sugar for df.groupby(df\nOutput: df.groupby('A') df.groupby(df\n\nInput: Series. For data-only list. By passing a list type object to the first argument of each constructor pandas.DataFrame() and\nOutput: Series", 'score': 0.22264915704727173, 'token': 3265, 'token_str': ' Series'}
{'sequence': "Input: For DataFrame objects, a string indicating either a column name or an index level name to b

In [83]:
from transformers import pipeline

nlp = pipeline("ner", model="mrm8488/codebert-base-finetuned-stackoverflow-ner")

Downloading:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

In [88]:
sequence = "The easiest way is to use sed (or perl): sed -i -e 's/abc/XYZ/g' /tmp/file.txt. Which will invoke sed to do an in-place edit due to the -i option"

%time nlp(sequence)

CPU times: user 240 ms, sys: 8.99 ms, total: 249 ms
Wall time: 259 ms


[{'word': 'Ġsed',
  'score': 0.7651958465576172,
  'entity': 'B-Code_Block',
  'index': 7,
  'start': 26,
  'end': 29},
 {'word': 'Ġperl',
  'score': 0.9975582361221313,
  'entity': 'B-Language',
  'index': 10,
  'start': 34,
  'end': 38},
 {'word': 'Ġsed',
  'score': 0.9983819723129272,
  'entity': 'B-Code_Block',
  'index': 12,
  'start': 41,
  'end': 44},
 {'word': 'Ġ-',
  'score': 0.9970406293869019,
  'entity': 'I-Code_Block',
  'index': 13,
  'start': 45,
  'end': 46},
 {'word': 'i',
  'score': 0.997124195098877,
  'entity': 'I-Code_Block',
  'index': 14,
  'start': 46,
  'end': 47},
 {'word': 'Ġ-',
  'score': 0.997551679611206,
  'entity': 'I-Code_Block',
  'index': 15,
  'start': 48,
  'end': 49},
 {'word': 'e',
  'score': 0.9970588684082031,
  'entity': 'I-Code_Block',
  'index': 16,
  'start': 49,
  'end': 50},
 {'word': "Ġ'",
  'score': 0.9104116559028625,
  'entity': 'I-Code_Block',
  'index': 17,
  'start': 51,
  'end': 52},
 {'word': 's',
  'score': 0.988348126411438,
  '