# Processing of Jupyter Notebooks from Github

Starting from a set of scripts that were converted from Jupyter notebooks, we search each script for a set of tokens and then post-process the results for further analysis.

Our end result should be a DataFrame where each row corresponds to a script, and there is a column for each Token, indicating the number of times a token appears in a script.

In [28]:
import ast
import os
from collections import Counter, defaultdict
from inspect import isclass, isfunction, ismodule

import matplotlib.pyplot as plt
import modin.pandas as mpd
import pandas as pd
import ray
import regex

# Start a local Ray runtime up front, before any modin.pandas work is done.
# NOTE(review): presumably modin uses Ray as its execution engine here — confirm.
ray.init()

2022-02-09 09:10:20,448	INFO services.py:1338 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m


{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': '127.0.0.1:37852',
 'object_store_address': '/tmp/ray/session_2022-02-09_09-10-17_027754_60183/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-02-09_09-10-17_027754_60183/sockets/raylet',
 'webui_url': '127.0.0.1:8266',
 'session_dir': '/tmp/ray/session_2022-02-09_09-10-17_027754_60183',
 'metrics_export_port': 62176,
 'node_id': '1878990dd19cd292b5814d33e00db5ed3a3c5ecde37af5864b0baff2'}

## Generate the set of search tokens

In [8]:
"""
Compile a list of pandas functions.
"""

# Get all the possible functions from these pandas classes and their subclasses.
# Only classes/modules listed here are descended into by the walk below.
allowed_classes = [
    pd,
    pd.DataFrame,
    pd.Series,
    pd.io,
    pd.core,
    pd.Index,
    pd.RangeIndex,
    pd.CategoricalIndex,
    pd.IntervalIndex,
    pd.MultiIndex,
    pd.IndexSlice,
    pd.DatetimeIndex,
    pd.TimedeltaIndex,
    pd.PeriodIndex,
    pd.Timestamp,
    pd.Timedelta,
    pd.DatetimeTZDtype,
    pd.Period,
    pd.Interval,
    pd.Categorical,
    pd.arrays,
    pd.tseries,
    pd.plotting,
    pd.api,
]
# Work stack of (object, dotted prefix) pairs that still have to be inspected.
classes = [(pd, "pandas")]

# Fully qualified names ("pandas.DataFrame.merge") of every function found.
functions = set()
# Indexer attributes are used with square brackets, so they are listed by hand.
indexers = ["iloc", "iat", "ix", "loc", "at"]

while classes:
    obj, prefix = classes.pop()
    # NOTE: vars() only exposes attributes stored on the object itself, so
    # members inherited from a base class are not visited through a subclass.
    for token, t in vars(obj).items():
        # We do not consider unders, dunders, or properties.
        if token.startswith("_"):
            continue
        if isfunction(t):
            functions.add(f"{prefix}.{token}")
        elif isclass(t) or ismodule(t):
            if (
                prefix.count(".") > 5  # Prune search tree depth.
                or t not in allowed_classes
                or prefix.split(".")[-1] == token  # Avoid "x.x" self-nesting.
            ):
                continue
            classes.append((t, f"{prefix}.{token}"))

# Compute the set of unique function names.
function_set = {f.split(".")[-1] for f in functions}
len(function_set)

299

In [9]:
"""
To reduce false positives, we block the following prefixes:
    - numpy and matplotlib prefixes
    - ' and " for string functions, like format
    - ] and } for list and dict types
    
Then we convert function names to regex tokens.
"""

# Negative lookbehind listing the prefixes that disqualify a match. Its
# alternatives have different lengths, which the stdlib `re` module rejects in
# a lookbehind; these patterns are meant for the `regex` package.
blocked_prefixes = "(?<!numpy|np|plt|matplotlib|\"|'|]|})"
# Raw f-strings keep `\.`, `\(` and `\[` as literal regex escapes without
# relying on Python passing unknown string escapes through unchanged.
function_token_set = {rf"{blocked_prefixes}\.{f}\(" for f in function_set}
indexer_token_set = {rf"\.{indexer}\[" for indexer in indexers}
pandas_token_set = {"pd", "pandas"}
search_tokens_set = function_token_set | indexer_token_set | pandas_token_set

## Search each script file for search tokens

In [10]:
# Build a one-column DataFrame holding the path of every script to scan.

python_scripts = []
scripts_dir = ""

# The directory is deliberately left blank; fill it in before running.
assert scripts_dir, "Set the script directory with scripts_dir."

# Keep every directory entry except hidden files and csv files.
python_scripts.extend(
    os.path.join(scripts_dir, entry)
    for entry in os.listdir(scripts_dir)
    if not (entry.startswith(".") or entry.endswith("csv"))
)

python_scripts_df = mpd.DataFrame(python_scripts, columns=["script_path"])
python_scripts_df


    import ray
    ray.init()



Unnamed: 0,script_path
0,../../pandas-api-analysis-private/data/big_dat...
1,../../pandas-api-analysis-private/data/big_dat...
2,../../pandas-api-analysis-private/data/big_dat...
3,../../pandas-api-analysis-private/data/big_dat...
4,../../pandas-api-analysis-private/data/big_dat...
...,...
250042,../../pandas-api-analysis-private/data/big_dat...
250043,../../pandas-api-analysis-private/data/big_dat...
250044,../../pandas-api-analysis-private/data/big_dat...
250045,../../pandas-api-analysis-private/data/big_dat...


In [11]:
# Define the helper we apply to the DataFrame to parse a script for its tokens.


def parse_file_tokens(file_path, search_tokens):
    """Count how often each search pattern matches in a script file.

    Parameters
    ----------
    file_path: str
        Path of the file to scan.
    search_tokens: iterable
        Regex expressions as an iterable of pattern strings. They are joined
        with ``|`` into a single alternation.

    Returns
    -------
    collections.Counter
        Maps each matched piece of text to the number of times it was found.
    """
    # Read as UTF-8 and substitute undecodable bytes, so the result does not
    # depend on the platform default encoding and a single badly encoded
    # script cannot abort a scan over many files.
    with open(file_path, encoding="utf-8", errors="replace") as f:
        contents = f.read()
    pattern = regex.compile("|".join(search_tokens))
    return Counter(pattern.findall(contents))

In [12]:
# Warning: this step will take > 8 min.
# Scan every script and collect one Counter of matched tokens per row.


def _count_tokens_for_row(row):
    """Run the token search on the script referenced by this row."""
    return parse_file_tokens(row["script_path"], search_tokens_set)


script_tokens_ser = python_scripts_df.apply(_count_tokens_for_row, axis=1)

script_tokens_ser



Unnamed: 0,__reduced__
0,"{'pandas': 2, 'pd': 2, '.read_csv(': 1}"
1,"{'pandas': 3, 'pd': 1, '.append(': 2, '.min(':..."
2,"{'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack..."
3,"{'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf..."
4,"{'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ..."
...,...
250042,"{'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro..."
250043,"{'pandas': 1, 'pd': 5, '.mean(': 3}"
250044,"{'pandas': 1, '.fillna(': 1}"
250045,"{'pd': 2, 'pandas': 1, '.append(': 2}"


In [13]:
# Put the script paths and their token counters side by side, and give the
# counter column a descriptive name.
python_script_tokens_df = mpd.concat(
    [python_scripts_df, script_tokens_ser], axis="columns"
)
python_script_tokens_df.rename(
    columns={"__reduced__": "script_tokens"}, inplace=True
)
python_script_tokens_df

Unnamed: 0,script_path,script_tokens
0,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 2, '.read_csv(': 1}"
1,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 3, 'pd': 1, '.append(': 2, '.min(':..."
2,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack..."
3,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf..."
4,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ..."
...,...,...
250042,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro..."
250043,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 5, '.mean(': 3}"
250044,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, '.fillna(': 1}"
250045,../../pandas-api-analysis-private/data/big_dat...,"{'pd': 2, 'pandas': 1, '.append(': 2}"


In [19]:
# (Optional) Save to csv.
# Leave the row index out of the file: writing it makes every save/load round
# trip add another "Unnamed: 0" column in front of the real columns.
python_script_tokens_df.to_csv("python_script_tokens_df.csv", index=False)

[2m[36m(apply_func pid=60205)[0m 
[2m[36m(apply_func pid=60303)[0m 


## Transform the token counters to DF columns

In [29]:
# (Optional) Read from csv.
# NOTE(review): a CSV stores the "script_tokens" values as text, so they
# presumably come back as strings rather than Counter objects — confirm
# before indexing into them.
python_script_tokens_df = mpd.read_csv("python_script_tokens_df.csv")
python_script_tokens_df

Unnamed: 0.1,Unnamed: 0,script_path,script_tokens,.stack(,.timedelta_range(,.mask(,.is_dtype_equal(,.is_interval(,.pow(,.isocalendar(,...,.merge(,.is_type_compatible(,.nunique(,.pop(,.is_boolean(,.replace(,.check_for_ordered(,.is_mixed(,.repeat(,.ge(
0,0,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 2, '.read_csv(': 1}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 3, 'pd': 1, '.append(': 2, '.min(':...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack...",3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf...",0,0,0,0,0,0,0,...,1,0,0,0,0,2,0,0,0,0
4,4,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250042,250042,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro...",0,0,0,0,0,0,0,...,0,0,0,5,0,1,0,0,0,0
250043,250043,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 5, '.mean(': 3}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250044,250044,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, '.fillna(': 1}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250045,250045,../../pandas-api-analysis-private/data/big_dat...,"{'pd': 2, 'pandas': 1, '.append(': 2}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Build the display form of every search token, i.e. the text a match
# produces: ".name(" for functions, ".name[" for indexers, plus the two
# bare pandas names.
function_token_dnames = {f".{name}(" for name in function_set}
indexer_token_dnames = {f".{name}[" for name in indexers}
pandas_token_dnames = {"pd", "pandas"}
all_tokens = function_token_dnames.union(indexer_token_dnames, pandas_token_dnames)

In [32]:
# Define helper to get the count for a token.


def get_token(row, token):
    """Return the count stored for ``token`` in one ``script_tokens`` value.

    Parameters
    ----------
    row: dict, collections.Counter or str
        The token counts of one script. A string is accepted as well, because
        that is what the value becomes after a CSV round trip; both the
        ``{...}`` and the ``Counter({...})`` renderings are understood.
    token: str
        Display name of the token to look up.

    Returns
    -------
    int
        The stored count, or 0 when the token is absent or ``row`` cannot be
        interpreted as a mapping.

    Notes
    -----
    A string ``row`` is parsed on every call. When looking up many tokens,
    converting the column to mappings once beforehand is cheaper.
    """
    if isinstance(row, str):
        text = row.strip()
        wrapper = "Counter("
        # str(Counter) renders as "Counter({...})"; unwrap the dict literal.
        if text.startswith(wrapper) and text.endswith(")"):
            text = text[len(wrapper) : -1] or "{}"
        try:
            row = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a parseable literal (e.g. truncated text): treat as no data.
            return 0
    try:
        return row[token]
    except (KeyError, IndexError, TypeError):
        # Missing token, or a value (such as NaN) that cannot be indexed.
        return 0

In [None]:
# Warning: this step will take >10 min.

# Add one count column per token. The tokens are sorted so the columns are
# created in a reproducible order; iterating the set directly would give an
# arbitrary order.
for token in sorted(all_tokens):
    # Bind the current token as a default argument so the callable keeps its
    # own token even if it is evaluated after the loop variable has moved on.
    python_script_tokens_df[token] = python_script_tokens_df["script_tokens"].apply(
        lambda row, token=token: get_token(row, token)
    )
python_script_tokens_df

## Perform post-processing

In [33]:
# Sanity check: no column name may occur more than once.
column_occurrences = Counter(python_script_tokens_df.columns)
assert all(
    occurrences == 1 for occurrences in column_occurrences.values()
), "Column names should be unique."

### Remove irrelevant tokens

In [34]:
# Tokens to leave out of the analysis, limited to those that actually
# exist as columns.
exclude = {"pandas", "pd"} & set(python_script_tokens_df.columns)
print(len(exclude))
exclude

2


{'pandas', 'pd'}

In [35]:
# Drop the excluded token columns from the df, in place.
python_script_tokens_df.drop(columns=exclude, inplace=True)
python_script_tokens_df

Unnamed: 0.1,Unnamed: 0,script_path,script_tokens,.stack(,.timedelta_range(,.mask(,.is_dtype_equal(,.is_interval(,.pow(,.isocalendar(,...,.merge(,.is_type_compatible(,.nunique(,.pop(,.is_boolean(,.replace(,.check_for_ordered(,.is_mixed(,.repeat(,.ge(
0,0,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 2, '.read_csv(': 1}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 3, 'pd': 1, '.append(': 2, '.min(':...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack...",3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf...",0,0,0,0,0,0,0,...,1,0,0,0,0,2,0,0,0,0
4,4,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250042,250042,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro...",0,0,0,0,0,0,0,...,0,0,0,5,0,1,0,0,0,0
250043,250043,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 5, '.mean(': 3}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250044,250044,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, '.fillna(': 1}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250045,250045,../../pandas-api-analysis-private/data/big_dat...,"{'pd': 2, 'pandas': 1, '.append(': 2}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
# Remove rows with no tokens.
# Pick the token columns by name rather than by position, so that leading
# non-token columns (paths, raw counters, index columns picked up from a CSV)
# can never end up in the sum, however many of them there are.
token_columns = [
    column for column in python_script_tokens_df.columns if column in all_tokens
]
token_count = python_script_tokens_df[token_columns].sum(axis=1)
# Keep only the index labels of the rows whose total is zero.
token_count = token_count[token_count == 0].index
token_count

Int64Index([    24,     66,     69,     94,    105,    108,    113,    121,
               144,    149,
            ...
            249837, 249844, 249853, 249859, 249865, 249948, 249970, 249974,
            250007, 250022],
           dtype='int64', length=15842)

In [67]:
# Discard the rows whose index labels were collected in token_count.
python_script_tokens_df = python_script_tokens_df.drop(index=token_count)
python_script_tokens_df

Unnamed: 0.1,Unnamed: 0,script_path,script_tokens,.stack(,.timedelta_range(,.mask(,.is_dtype_equal(,.is_interval(,.pow(,.isocalendar(,...,.merge(,.is_type_compatible(,.nunique(,.pop(,.is_boolean(,.replace(,.check_for_ordered(,.is_mixed(,.repeat(,.ge(
0,0,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 2, '.read_csv(': 1}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 3, 'pd': 1, '.append(': 2, '.min(':...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 4, '.stack(': 3, '.unstack...",3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 2, 'pd': 13, '.read_csv(': 2, '.inf...",0,0,0,0,0,0,0,...,1,0,0,0,0,2,0,0,0,0
4,4,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 4, '.join(': 1, '.iloc[': ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250042,250042,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 10, '.read_csv(': 2, '.dro...",0,0,0,0,0,0,0,...,0,0,0,5,0,1,0,0,0,0
250043,250043,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, 'pd': 5, '.mean(': 3}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250044,250044,../../pandas-api-analysis-private/data/big_dat...,"{'pandas': 1, '.fillna(': 1}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250045,250045,../../pandas-api-analysis-private/data/big_dat...,"{'pd': 2, 'pandas': 1, '.append(': 2}",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [182]:
# (Optional) Save to csv.
# python_script_tokens_df.to_csv("filtered_token_breakdown.csv", index=None)

[2m[36m(apply_func pid=59037)[0m 
