# Processing of Jupyter Notebooks from Github

Starting from the raw 1.2 million Jupyter notebooks, we convert the notebooks to scripts with `nbconvert`, then take the middle 50% of notebooks by size as our sample. From the resulting set of scripts we do a single token search for "pandas" to yield the resulting `converted_scripts.zip` files.

Using the `converted_scripts/` set of around ~250k files, which can be downloaded at [here](https://drive.google.com/file/d/12M3n_gsejc1xmrFoAGwHUgFTpHIlfa2i/view?usp=sharing), we perform token search, then post-process the data for further analysis. This notebook begins at this step.

Our end result should be a DataFrame where each row corresponds to a script, and there is a column for each token, indicating the number of times a token appears in a script.

In [1]:
import ast
import os
from collections import Counter, defaultdict
from inspect import isclass, isfunction, ismodule

import matplotlib.pyplot as plt
import modin.pandas as mpd
import pandas as pd
import ray
import regex

ray.init()

2022-02-09 10:37:07,977	INFO services.py:1338 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m


{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': '127.0.0.1:17991',
 'object_store_address': '/tmp/ray/session_2022-02-09_10-37-04_107820_61570/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-02-09_10-37-04_107820_61570/sockets/raylet',
 'webui_url': '127.0.0.1:8266',
 'session_dir': '/tmp/ray/session_2022-02-09_10-37-04_107820_61570',
 'metrics_export_port': 65533,
 'node_id': '135e2e2adac2c7cafd111186fda3b116a099ea6196c087b24bf4e532'}

## Generate the set of search tokens

In [2]:
"""
Compile a list of pandas functions.
"""

# Get all the possible functions from these pandas classes and their subclasses.
allowed_classes = [
    pd,
    pd.DataFrame,
    pd.Series,
    pd.io,
    pd.core,
    pd.Index,
    pd.RangeIndex,
    pd.CategoricalIndex,
    pd.IntervalIndex,
    pd.MultiIndex,
    pd.IndexSlice,
    pd.DatetimeIndex,
    pd.TimedeltaIndex,
    pd.PeriodIndex,
    pd.Timestamp,
    pd.Timedelta,
    pd.DatetimeTZDtype,
    pd.Period,
    pd.Interval,
    pd.Categorical,
    pd.arrays,
    pd.tseries,
    pd.plotting,
    pd.api,
]
classes = [(pd, "pandas")]

functions = set()
indexers = ["iloc", "iat", "ix", "loc", "at"]

while classes:
    obj, prefix = classes.pop()
    for token, t in vars(obj).items():
        # We do not consider unders, duners, or properties.
        if token[0] == "_" or token[:2] == "__":
            continue
        elif isfunction(t):
            if prefix.split(".")[-1] == "io":
                print(type(obj), prefix)
            functions.add(f"{prefix}.{token}")
        elif isclass(t) or ismodule(t):
            if (
                prefix.count(".") > 5  # Prune search tree depth.
                or t not in allowed_classes
                or prefix.split(".")[-1] == token
            ):
                continue
            classes.append((t, f"{prefix}.{token}"))
        else:
            continue

# Compute the set of unique function names.
function_set = set([f.split(".")[-1] for f in functions])
len(function_set)

301

In [3]:
"""
To reduce false positives, we block the following prefixes:
    - numpy and matplotlib prefixes
    - ' and " for string functions, like format
    - ] and } for list and dict types
    
Then we convert function names to regex tokens.
"""

blocked_prefixes = "(?<!numpy|np|plt|matplotlib|\"|'|]|})"
function_token_set = {f"{blocked_prefixes}\.{f}\(" for f in function_set}
indexer_token_set = {f"\.{indexer}\[" for indexer in indexers}
pandas_token_set = {"pd", "pandas"}
search_tokens_set = function_token_set | indexer_token_set | pandas_token_set

## Search each script file for search tokens

In [5]:
# Create a DataFrame of the python scripts.

python_scripts = []
scripts_dir = "../../pandas-api-analysis-private/data/big_dataset/converted_scripts/"

for f in os.listdir(scripts_dir):
    if not f.startswith(".") and not f.endswith("csv"):
        python_scripts.append(os.path.join(scripts_dir, f))

python_scripts_df = mpd.DataFrame(python_scripts, columns=["script_path"])
python_scripts_df



Unnamed: 0,script_path
0,../../pandas-api-analysis-private/data/big_dat...
1,../../pandas-api-analysis-private/data/big_dat...
2,../../pandas-api-analysis-private/data/big_dat...
3,../../pandas-api-analysis-private/data/big_dat...
4,../../pandas-api-analysis-private/data/big_dat...
...,...
250042,../../pandas-api-analysis-private/data/big_dat...
250043,../../pandas-api-analysis-private/data/big_dat...
250044,../../pandas-api-analysis-private/data/big_dat...
250045,../../pandas-api-analysis-private/data/big_dat...


In [6]:
# Define the helper we apply to the DataFrame to parse a script for its tokens.


def parse_file_tokens(file_path, search_tokens):
    """Parse the file and search for the desired regex expressions.

    Parameters
    ----------
    file_path: str
        File path to search for.
    search_tokens: iterable
        Regex expressions as an iterable.
    """
    with open(file_path) as f:
        contents = f.read()
    search_tokens = regex.compile("|".join(search_tokens))
    return Counter(regex.findall(search_tokens, contents))

In [7]:
# Warning: this step will take > 10 min.
# Use parse script to get a Counter of the tokens in each script.

script_tokens_ser = python_scripts_df.apply(
    lambda row: parse_file_tokens(row["script_path"], search_tokens_set),
    axis="columns",
)

script_tokens_ser



0                   {'pandas': 2, 'pd': 2, '.read_csv(': 1}
1         {'pandas': 3, 'pd': 1, '.append(': 2, '.min(':...
2         {'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack...
3         {'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf...
4         {'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ...
                                ...                        
250042    {'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro...
250043                  {'pandas': 1, 'pd': 5, '.mean(': 3}
250044                         {'pandas': 1, '.fillna(': 1}
250045                {'pd': 2, 'pandas': 1, '.append(': 2}
250046    {'pandas': 2, 'pd': 23, '.read_csv(': 2, '.tol...
Length: 250047, dtype: object

In [8]:
# Combine the scripts DataFrame and the new series.
python_script_tokens_df = mpd.concat([python_scripts_df, script_tokens_ser], axis=1)
python_script_tokens_df.rename({0: "script_tokens"}, axis=1, inplace=True)
python_script_tokens_df

Unnamed: 0,script_path,script_tokens
0,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 2, '.read_csv(': 1}"
1,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 3, 'pd': 1, '.append(': 2, '.min(':..."
2,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack..."
3,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf..."
4,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ..."
...,...,...
250042,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro..."
250043,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 5, '.mean(': 3}"
250044,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, '.fillna(': 1}"
250045,../../pandas-api-analysis-private/data/big_dat...,"{'pd': 2, 'pandas': 1, '.append(': 2}"


## Transform the token counters to DF columns

In [9]:
# Get the display names of all search tokens.
function_token_dnames = {f".{f}(" for f in function_set}
indexer_token_dnames = {f".{indexer}[" for indexer in indexers}
pandas_token_dnames = {"pd", "pandas"}
all_tokens = function_token_dnames | indexer_token_dnames | pandas_token_dnames

In [10]:
# Define helper to get the count for a token.


def get_token(row: str, token):
    try:
        return row[token]
    except Exception:
        return 0

In [11]:
# Warning: this step will take > 10 min.

for token in all_tokens:
    python_script_tokens_df[token] = python_script_tokens_df["script_tokens"].apply(
        lambda row: get_token(row, token)
    )
python_script_tokens_df

[2m[36m(apply_list_of_funcs pid=61599)[0m 
[2m[36m(apply_list_of_funcs pid=61591)[0m 


[2m[36m(apply_list_of_funcs pid=61601)[0m 
[2m[36m(apply_list_of_funcs pid=61596)[0m 


[2m[36m(apply_list_of_funcs pid=61597)[0m 


Unnamed: 0,script_path,script_tokens,.isnull(,.cov(,.cumprod(,.to_native_types(,.to_timedelta(,.to_numpy(,.dropna(,.is_(,...,.scatter_matrix(,.clip(,.read_spss(,.le(,.interval_range(,.to_numeric(,.mode(,.applymap(,.as_ordered(,.cut(
0,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 2, '.read_csv(': 1}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 3, 'pd': 1, '.append(': 2, '.min(':...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf...",1,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
4,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250042,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro...",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
250043,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 5, '.mean(': 3}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250044,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, '.fillna(': 1}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250045,../../pandas-api-analysis-private/data/big_dat...,"{'pd': 2, 'pandas': 1, '.append(': 2}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Perform post-processing

In [12]:
# Confirm all column names are unique
assert all(
    [count == 1 for f, count in Counter(python_script_tokens_df.columns).items()]
), "Column names should be unique."

### Remove irrevelant tokens

In [13]:
# Additional tokens to exclude.
exclude = set(["pandas", "pd"])
exclude = exclude.intersection(set(python_script_tokens_df.columns))
print(len(exclude))
exclude

2


{'pandas', 'pd'}

In [14]:
# Remove these tokens from the df.
python_script_tokens_df.drop(labels=exclude, inplace=True, axis=1)
python_script_tokens_df

Unnamed: 0,script_path,script_tokens,.isnull(,.cov(,.cumprod(,.to_native_types(,.to_timedelta(,.to_numpy(,.dropna(,.is_(,...,.scatter_matrix(,.clip(,.read_spss(,.le(,.interval_range(,.to_numeric(,.mode(,.applymap(,.as_ordered(,.cut(
0,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 2, '.read_csv(': 1}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 3, 'pd': 1, '.append(': 2, '.min(':...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf...",1,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
4,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250042,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro...",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
250043,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 5, '.mean(': 3}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250044,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, '.fillna(': 1}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250045,../../pandas-api-analysis-private/data/big_dat...,"{'pd': 2, 'pandas': 1, '.append(': 2}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Remove rows with no tokens.
token_count = python_script_tokens_df.iloc[:, 2:].sum(axis=1)
token_count = token_count[token_count == 0].index
token_count

Int64Index([    24,     66,     69,     94,    105,    108,    113,    121,
               144,    149,
            ...
            249837, 249844, 249853, 249859, 249865, 249948, 249970, 249974,
            250007, 250022],
           dtype='int64', length=15772)

In [16]:
python_script_tokens_df = python_script_tokens_df.drop(labels=token_count, axis=0)
python_script_tokens_df

Unnamed: 0,script_path,script_tokens,.isnull(,.cov(,.cumprod(,.to_native_types(,.to_timedelta(,.to_numpy(,.dropna(,.is_(,...,.scatter_matrix(,.clip(,.read_spss(,.le(,.interval_range(,.to_numeric(,.mode(,.applymap(,.as_ordered(,.cut(
0,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 2, '.read_csv(': 1}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 3, 'pd': 1, '.append(': 2, '.min(':...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf...",1,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
4,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250042,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro...",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
250043,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 5, '.mean(': 3}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250044,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, '.fillna(': 1}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250045,../../pandas-api-analysis-private/data/big_dat...,"{'pd': 2, 'pandas': 1, '.append(': 2}",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# (Optional) Save to csv.
python_script_tokens_df.to_csv("filtered_token_breakdown.csv", index=None)