# Processing of Jupyter Notebooks on GitHub

Starting from a set of scripts that were converted from Jupyter notebooks, we search each script for pandas tokens, then post-process the results for further analysis.

Our end result should be a DataFrame where each row corresponds to a script, and there is a column for each Token, indicating the number of times a token appears in a script.

In [None]:
import ast
import json
import os
from collections import defaultdict
from inspect import isclass, isfunction, ismodule

import matplotlib.pyplot as plt
import modin.pandas as mpd
import numpy as np
import pandas as pd
import ray
import regex

%matplotlib inline

In [None]:
"""
Compile a list of pandas functions recursively.
"""

# Only these pandas modules and classes are descended into while walking the
# pandas namespace; anything else that is met along the way is not expanded.
allowed_classes = [
    pd,
    pd.DataFrame,
    pd.Series,
    pd.io,
    pd.core,
    pd.Index,
    pd.RangeIndex,
    pd.CategoricalIndex,
    pd.IntervalIndex,
    pd.MultiIndex,
    pd.IndexSlice,
    pd.DatetimeIndex,
    pd.TimedeltaIndex,
    pd.PeriodIndex,
    pd.Timestamp,
    pd.Timedelta,
    pd.DatetimeTZDtype,
    pd.Period,
    pd.Interval,
    pd.Categorical,
    pd.arrays,
    pd.tseries,
    pd.plotting,
    pd.api,
]
# Work stack of (object, dotted prefix) pairs that still have to be inspected.
classes = [(pd, "pd")]

functions = set()
indexers = ["iloc", "iat", "ix", "loc", "at"]

while classes:
    obj, prefix = classes.pop()
    for token, t in vars(obj).items():
        # Names with a leading underscore (private and dunder alike) are skipped.
        if token.startswith("_"):
            continue
        qualified = f"{prefix}.{token}"
        if isfunction(t):
            functions.add(qualified)
            continue
        if not (isclass(t) or ismodule(t)):
            # Properties, constants and everything else are ignored.
            continue
        # Prune the search tree: stop below a fixed depth and never leave the
        # allow-list above.
        if prefix.count(".") <= 5 and t in allowed_classes:
            classes.append((t, qualified))

In [None]:
# Keep only the last component of each dotted name; the set removes duplicates.
f_set = {dotted.rsplit(".", 1)[-1] for dotted in functions}
len(f_set)

In [None]:
# Build the regex alternatives used for the token search.
# A method-call pattern such as ``.sum(`` would also match calls made on other
# libraries, so a negative lookbehind rejects any match that is immediately
# preceded by a numpy or matplotlib name.
# Raw f-strings keep ``\.``, ``\(`` and ``\[`` as regex escapes; in a normal
# string literal they are invalid escape sequences.

blocked_prefixes = "(?<!numpy|np|plt|matplotlib)"
function_token_set = {rf"{blocked_prefixes}\.{f}\(" for f in f_set}
indexer_token_set = {rf"\.{indexer}\[" for indexer in indexers}
pandas_token_set = {"pd", "pandas"}
search_tokens_set = function_token_set | indexer_token_set | pandas_token_set

In [None]:
# Define the helper we apply to the DataFrame to parse a script for its tokens.

from collections import Counter


def parse_file_tokens(file_path, search_tokens):
    """Count the regex matches of the search tokens in one file.

    Parameters
    ----------
    file_path: str
        Path of the file to read.
    search_tokens: str or iterable of str
        Either one complete regex pattern, or a collection of regex
        alternatives that are joined with ``|`` into a single pattern.

    Returns
    -------
    collections.Counter
        Counts of the values returned by ``regex.findall`` for the pattern.
    """
    # Decode explicitly as UTF-8 instead of the platform default, and replace
    # undecodable bytes so one odd file does not abort a scan over many files.
    with open(file_path, encoding="utf-8", errors="replace") as f:
        contents = f.read()
    # A lone string is already a pattern; joining it would insert ``|``
    # between its individual characters.
    if isinstance(search_tokens, str):
        pattern = search_tokens
    else:
        pattern = "|".join(search_tokens)
    return Counter(regex.findall(regex.compile(pattern), contents))

In [None]:
# Collect the paths of the converted scripts into a one-column DataFrame.

scripts_dir = "../data/big_dataset/converted_scripts/"

# Hidden entries and names ending in "csv" are not scripts, so they are skipped.
python_scripts = [
    os.path.join(scripts_dir, entry)
    for entry in os.listdir(scripts_dir)
    if not (entry.startswith(".") or entry.endswith("csv"))
]

python_scripts_df = mpd.DataFrame(python_scripts, columns=["script_path"])
python_scripts_df

In [None]:
# Run the token search on every script and store the resulting Counter per row.


def _count_row_tokens(row):
    """Return the token Counter for the script whose path is stored in ``row``."""
    return parse_file_tokens(row["script_path"], search_tokens_set)


python_scripts_df["script_tokens"] = python_scripts_df.apply(
    _count_row_tokens, axis="columns"
)

python_scripts_df

In [None]:
# Save the per-script token counts; the next section reads this file back in.
python_scripts_df.to_csv("../python_script_tokens_df.csv")

## Process the token counters to DF columns

In [None]:
# Reload the saved results. The "script_tokens" column holds the token counts
# of each script (as text after the CSV round trip); the cells below expand
# them into one column per token.
python_scripts_df = mpd.read_csv(
    "../python_script_tokens_df.csv"
)
python_scripts_df

In [None]:
# Display names of the search tokens: the plain text forms ".name(" and
# ".name[" instead of the escaped regex patterns used for searching.
function_token_set = {"." + name + "(" for name in f_set}
indexer_token_set = {"." + name + "[" for name in indexers}
pandas_token_set = {"pd", "pandas"}
all_tokens = set().union(function_token_set, indexer_token_set, pandas_token_set)

In [None]:
def get_token(row: str, token) -> int:
    """Return the count stored for ``token`` in one script's token counts.

    Parameters
    ----------
    row: str
        Text form of the counts as read back from CSV: either a dict literal
        or the ``Counter({...})`` / ``Counter()`` repr of a Counter. An
        already parsed dict (or Counter) is accepted as well.
    token:
        Key to look up.

    Returns
    -------
    int
        The stored count, or 0 when the lookup fails.
    """
    if isinstance(row, str):
        text = row.strip()
        # ``str(Counter)`` wraps the dict literal in ``Counter(...)``, which
        # ``ast.literal_eval`` rejects because it is a call; peel the wrapper
        # off first. An empty Counter prints as ``Counter()``.
        if text.startswith("Counter(") and text.endswith(")"):
            text = text[len("Counter("):-1] or "{}"
        row = ast.literal_eval(text)
    try:
        return row[token]
    except (KeyError, IndexError, TypeError):
        # Missing token, or a value that cannot be looked up by this token.
        return 0

In [None]:
# Add one column per token holding that token's count for every script.
for token in all_tokens:
    # Bind the current token as a default argument. A plain closure looks
    # ``token`` up only when it is called, which gives the wrong column if the
    # apply is evaluated after the loop variable has moved on.
    python_scripts_df[token] = python_scripts_df["script_tokens"].apply(
        lambda row, token=token: get_token(row, token)
    )
python_scripts_df

In [None]:
# Save the expanded frame, which now has one count column per token.
python_scripts_df.to_csv("../token_breakdown.csv")

## Perform post-processing

In [None]:
# Every column label must occur exactly once.
assert all(
    occurrences == 1
    for occurrences in Counter(python_scripts_df.columns).values()
), "Column names should be unique."

### Remove irrelevant tokens

For example, `Int16Dtype`.

In [None]:
# Build the exclusion set: every attribute of the base pandas module,
# DataFrame and Series that is neither a function nor an indexer. Names with
# a single leading underscore are skipped entirely.

# NOTE: this rebinds ``search_tokens_set`` to an empty set, exactly as the
# original cell did.
search_tokens_set = set()
exclude = set()
objects = [pd, pd.DataFrame, pd.Series]
indexers = ["iloc", "iat", "ix", "loc", "at"]
for obj in objects:
    for token in dir(obj):
        # Skip private (single-underscore) names; dunder names are still examined.
        if token[0] == "_" and token[:2] != "__":
            continue
        # ``isfunction`` is the name brought in by the file's
        # ``from inspect import ...`` line; the ``inspect`` module itself is
        # never imported, so ``inspect.isfunction`` would raise NameError.
        if isfunction(getattr(obj, token)) or token in indexers:
            # Functions are searched for as ".function_name(" and indexers as
            # ".indexing_function[", so neither is added to the exclusion set.
            continue
        # Everything else (for example properties) is excluded, written with
        # a leading period.
        exclude.add(f".{token}")

In [None]:
# Add a few hand-picked tokens to the exclusion set, then keep only the
# entries that actually exist as columns of the DataFrame.
exclude = exclude | {
    ".datetime",
    ".base",
    "pandas",
    "pd",
    "pd.DataFrame",
    "pd.dataframe",
}
exclude = exclude & set(python_scripts_df.columns)
print(len(exclude))
exclude

In [None]:
# Drop the excluded token columns from the DataFrame in place.
python_scripts_df.drop(columns=exclude, inplace=True)
python_scripts_df

In [None]:
# Total number of tokens per script, used below to remove rows with no tokens.
# Only the columns from position 3 onward are summed; the first three are
# expected to be the saved index, the script path and the raw token counts.
# The column-wise sum replaces a Python-level apply over every row.
python_scripts_df["token_count"] = python_scripts_df.iloc[:, 3:].sum(axis=1)
python_scripts_df

In [None]:
# Drop, in place, every script whose total token count is zero.
empty_scripts = python_scripts_df.loc[python_scripts_df["token_count"] == 0]
python_scripts_df.drop(index=empty_scripts.index, inplace=True)

In [None]:
# The helper column is no longer needed once it has been used for filtering.
python_scripts_df.drop(columns=["token_count"], inplace=True)

In [None]:
# Remove the "Unnamed: 0" column (presumably the index saved by the earlier
# to_csv call and read back as a regular column).
python_scripts_df = python_scripts_df.drop(columns=["Unnamed: 0"])

In [None]:
# Write the filtered token breakdown without the index column; ``index`` is a
# boolean flag, so pass False explicitly instead of None.
python_scripts_df.to_csv("../data/filtered_token_breakdown.csv", index=False)