# Setup

In [6]:
!nvidia-smi

Fri Apr  7 19:48:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:8A:00.0 Off |                    0 |
| N/A   60C    P0   287W / 300W |  19208MiB / 32480MiB |    100%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [3]:
from datasets import load_dataset
from itertools import islice
from tqdm import tqdm
import pandas as pd
import ast
import astunparse

## Parsing python code functions (name, args, code)

we are using AST lib here, extracting all of the function definitions form the abstract syntax tree

In [4]:
filename = "code_sample/code_sample.py"
with open(filename) as file:
    node = ast.parse(file.read())
    
def get_function_info(functionNode):
    functionName = functionNode.name
    functionArgs = [arg.arg for arg in functionNode.args.args]
    functionCode = astunparse.unparse(functionNode)
    return [functionName, functionArgs, functionCode]

def code_to_functions_df(code):
    node = ast.parse(code)
    functions = [n for n in node.body if isinstance(n, ast.FunctionDef)]
    classes = [n for n in node.body if isinstance(n, ast.ClassDef)]

    standalone_functions = [get_function_info(function) for function in functions]
    
    class_functions = []
        
    for class_ in classes:
        methods = [n for n in class_.body if isinstance(n, ast.FunctionDef)]
        cur_class_functions = [get_function_info(method) for method in methods]
        class_functions.extend(cur_class_functions)
    
    return pd.DataFrame(standalone_functions + class_functions,
                      columns =['functionName', 'functionArgs', 'functionCode'])

## Loading datasets (going for 100000 examples now)

In [44]:
ds = iter(load_dataset("codeparrot/github-code", streaming=True,
                  split="train", languages=["Python"]))
pycode_sample = []
for i in tqdm(range(100000)):
    pycode_sample.append(next(ds)['code'])

100%|██████████| 100000/100000 [13:45<00:00, 121.11it/s]


Now we extract function data from each file and concat it all into giant df, which we will then store as a parquet 

p.s. yes this is a dumb skip of all the files it can't parse, got too much data to care

In [54]:
pycode_sample_dfs = []
for i in tqdm(range(100000)):
    try:
        pycode_sample_dfs.append(code_to_functions_df(pycode_sample[i]))
    except Exception: 
        pass

100%|██████████| 100000/100000 [07:36<00:00, 218.99it/s]


In [55]:
len(pycode_sample_dfs)

86279

In [56]:
giga_df = pd.concat(pycode_sample_dfs)
len(giga_df)

746787

In [57]:
giga_df.to_parquet("pyfunc_86k.parquet")

100k py files => 86k readable => 746k functions => 136 MB of data

In [4]:
giga_df_2 = pd.read_parquet("pyfunc_86k.parquet")

In [6]:
len(giga_df_2)

746787

In [5]:
giga_df_2.sample(10)

Unnamed: 0,functionName,functionArgs,functionCode
5,_json_reddit_objecter,"[self, json_data]","\n\ndef _json_reddit_objecter(self, json_data)..."
17,test_get_repositories_with_username_many,[self],\n\ndef test_get_repositories_with_username_ma...
11,train_mnist_cnn,"[DIST, local_rank, world_size, nccl_id, spars,...","\n\ndef train_mnist_cnn(DIST=False, local_rank..."
2,post_run,[self],\n\ndef post_run(self):\n pass\n
2,init_jinja,[app],\n\ndef init_jinja(app):\n _jinja_filters =...
10,pack,[self],\n\ndef pack(self):\n raise NotImplementedE...
15,__init__,"[self, n]","\n\ndef __init__(self, n):\n if (abs((n - i..."
111,test_index_to_string_with_default_value,[self],\n\ndef test_index_to_string_with_default_valu...
140,_clear_batch,"[self, pp]","\n\ndef _clear_batch(self, pp: PrePrepare):\n ..."
58,filter,"[self, values]","\n\n@debug()\ndef filter(self, values):\n r..."


100k py files => 86k readable => 764k functions => 136 MB of data

320k py files => 272k readable => 2.33M functions => 429 MB of data

## Let's amp it up!!!! (3 million python files)
(spoiler alert - I used only 3% of it ::)

In [None]:
ds = iter(load_dataset("codeparrot/github-code", streaming=True,
                  split="train", languages=["Python"]))
pycode_sample = []
for i in tqdm(range(3000000)):
    try:
        pycode_sample.append(next(ds)['code'])
    except Exception: 
        pass

In [None]:
len(pycode_sample)

In [None]:
pycode_sample_dfs = []
for i in tqdm(range(3000000)):
    try:
        pycode_sample_dfs.append(code_to_functions_df(pycode_sample[i]))
    except Exception: 
        pass

In [None]:
len(pycode_sample_dfs)

In [None]:
giga_df_3 = pd.concat(pycode_sample_dfs)
len(giga_df_3)

In [None]:
giga_df_3.to_parquet("pyfunc_3M.parquet")

3M py files => 22M functions => 4 GB of data