In [98]:
import pandas as pd
import subprocess
import re
import json

In [2]:
df = pd.read_csv("gh_rust_5+.csv")

In [231]:
def get_parse_tree(code):
    parse_tree = subprocess.run(["../bin/rust-analyzer", "parse"], stdout=subprocess.PIPE, text=True, input=code)
    return parse_tree.stdout

def get_symbol_table(code):
    symbols = subprocess.run(["../bin/rust-analyzer", "symbols"], stdout=subprocess.PIPE, text=True, input=code)
    return symbols.stdout

def parse_token(parse_str):
    token, additional_str = tuple(parse_str.split("@", 1))
    splitted = additional_str.split(" ", 1)
    start, end = tuple(splitted[0].split(".."))
    return token, (int(start), int(end))

def parse_symbol(symbol):
    symbol = symbol[:-2].replace("StructureNode { ", "")
    d = {}
    for s in symbol.split(","):
        if len(s) == 0:
            continue
        items = s.strip().split(":", 1)
        for i in range(len(items) - 1):
            k = items[i].strip()
            v = items[i+1].strip()
            v = v.replace("\"", "")
            if k == "parent":
                if v == "None":
                    v = None
                else:
                    v = int(v[:-1].replace("Some(", ""))
            if k == "navigation_range" or k == "node_range":
                v = tuple(map(int, v.split("..")))
            if k == "kind":
                v = v[:-1].replace("SymbolKind(", "").lower()
            if k == "deprecated":
                v = True if v == "true" else False
            d[k] = v
    return d

def get_symbols_with_docstring(symbols, comments):
    comment_intervals_dict = {c[1][0] : c[1] for idx, c in enumerate(comments)}
    symbol_comments_pairs = []
    for s in symbols:
        if s["node_range"][0] in comment_intervals_dict:
            s["comment_range"] = comment_intervals_dict[s["node_range"][0]]
            symbol_comments_pairs.append(s)
    return symbol_comments_pairs

def get_code_and_comments(content, symbols):
    for s in symbols:
        comment_range = s["comment_range"]
        code_range = (s["comment_range"][1] + 1, s["node_range"][1])
        s["code"] = content[code_range[0]:code_range[1]]
        s["docstring"] = content[comment_range[0]:comment_range[1]]
    return symbols

In [252]:
tokio_df = df[df["repo_name"] == "tokio-rs/tokio"]
data = []
for i, row in df[df["stars"] > 1000].iterrows():
    # if row["path"] != "tokio/src/sync/barrier.rs":
    #     continue
    try:
        parse_tree = get_parse_tree(row["content"])
    except:
        print("Failed to parse " + row["filepath"])
        continue
    tokenized_parse_list = []
    for idx, s in enumerate(parse_tree.split("\n")):
        s = s.strip()
        if len(s) == 0:
            continue
        tokenized = parse_token(s)
        if tokenized[0] == "WHITESPACE":
            continue
        if idx > 0 and tokenized_parse_list[-1][0] == "COMMENT" and tokenized[0] == "COMMENT":
            tokenized_parse_list[-1] = (
                "COMMENT", 
                (tokenized_parse_list[-1][1][0], 
                tokenized[1][1])
            )
        else:
            tokenized_parse_list.append(tokenized)
    comments = list(filter(lambda t: t[0] == "COMMENT", tokenized_parse_list))
    symbols = get_symbol_table(row["content"]).split("\n")
    symbol_dict_list = []
    for sym in symbols:
        if len(sym) == 0:
            continue
        symbol_dict = parse_symbol(sym)
        symbol_dict["filename"] = row["path"]
        symbol_dict_list.append(symbol_dict)
    symbols = get_symbols_with_docstring(symbol_dict_list, comments)
    symbols = get_code_and_comments(row["content"], symbols)
    data.extend(symbols)
print(len(data))
    

KeyError: 'filepath'

In [247]:
data[1]


{'parent': None,
 'label': 'TiKVServer',
 'navigation_range': (4729, 4739),
 'node_range': (4694, 5509),
 'kind': 'struct',
 'detail': 'None',
 'deprecated': False,
 'filename': 'cmd/src/server.rs',
 'comment_range': (4694, 4721),
 'code': 'struct TiKVServer<ER: RaftEngine> {\n    config: TiKvConfig,\n    cfg_controller: Option<ConfigController>,\n    security_mgr: Arc<SecurityManager>,\n    pd_client: Arc<RpcClient>,\n    router: RaftRouter<RocksEngine, ER>,\n    system: Option<RaftBatchSystem<RocksEngine, ER>>,\n    resolver: resolve::PdStoreAddrResolver,\n    state: Arc<Mutex<GlobalReplicationState>>,\n    store_path: PathBuf,\n    encryption_key_manager: Option<Arc<DataKeyManager>>,\n    engines: Option<TiKVEngines<ER>>,\n    servers: Option<Servers<ER>>,\n    region_info_accessor: RegionInfoAccessor,\n    coprocessor_host: Option<CoprocessorHost<RocksEngine>>,\n    to_stop: Vec<Box<dyn Stop>>,\n    lock_files: Vec<File>,\n    concurrency_manager: ConcurrencyManager,\n    env: Arc<

In [235]:
with open("rust_1k_docstring_code.json", "w") as outfile:
    json.dump(data, outfile, indent=4)

In [243]:
len(df[df["stars"] > 1000])

15474

In [245]:
tokio_df["stars"]

63        12281
411       12281
788       12281
908       12281
1491      12281
          ...  
101193    12281
101521    12281
101565    12281
101724    12281
102163    12281
Name: stars, Length: 497, dtype: int64