In [1]:
import math
from collections import Counter
from typing import Any, Iterable

import pandas as pd
from scipy.stats import chi2
from statsmodels.miscmodels.ordinal_model import OrderedModel

In [2]:
# Reduces a raw response code to its direction only: "A", "B" or "E".
# Bare "A" / "B" codes are accepted alongside the two-letter codes.
# (The S/W prefix presumably stands for strong/weak — confirm.)
SIMPLE_VOTE_MAP = {
    "SA": "A",
    "WA": "A",
    "A": "A",
    "E": "E",
    "B": "B",
    "WB": "B",
    "SB": "B",
}

# Reduces a raw response code to its level only: "S", "W" or "E".
# NOTE(review): unlike SIMPLE_VOTE_MAP this map has no bare "A" / "B" keys, so
# a strict lookup of such a code here raises KeyError — confirm those codes
# cannot occur in the response data.
LEVEL_VOTE_MAP = {
    "SA": "S",
    "WA": "W",
    "E": "E",
    "WB": "W",
    "SB": "S",
}


def isnan(x: Any) -> bool:
    """Tell whether ``x`` is a float NaN.

    Values that are not ``float`` instances (strings, ints, ``None``, ...)
    are never NaN here, so they yield ``False`` instead of raising.
    """
    return isinstance(x, float) and math.isnan(x)


def to_vote_counter(votes: Iterable[Any]) -> Counter[str] | None:
    """Tally one group of raw response codes by direction.

    Args:
        votes: Raw response codes. Any iterable is accepted, including
            one-shot iterators such as generators.

    Returns:
        A ``Counter`` keyed by the ``SIMPLE_VOTE_MAP`` value of each code, or
        ``None`` if any entry is a float NaN.

    Raises:
        KeyError: If a non-NaN entry is not a key of ``SIMPLE_VOTE_MAP``.
    """
    # The codes are scanned twice (NaN check, then tally). Materialise them
    # once so a generator is not partly consumed by the first scan and then
    # silently under-counted by the second.
    ballots = list(votes)
    if any(isnan(v) for v in ballots):
        return None
    return Counter(SIMPLE_VOTE_MAP[v] for v in ballots)  # type: ignore


def to_majority(votes: Counter[str] | None) -> str | None:
    if votes is None:
        return None
    value, count = votes.most_common(1)[0]
    if count / votes.total() > 0.5:
        return value
    return None


def to_noconflict(votes: Counter[str] | None) -> str | None:
    if votes is None:
        return None
    value, count = votes.most_common(1)[0]
    if count / votes.total() > 0.5:
        if votes["A"] == 0 or votes["B"] == 0:
            return value
    return None


def to_unanimous(votes: Counter[str] | None) -> str | None:
    if votes is None:
        return None
    value, count = votes.most_common(1)[0]
    if count / votes.total() == 1.0:
        return value
    return None

In [3]:
# Metric columns whose names carry no parenthesised variant.
_PLAIN_METRICS = ["CDI", "LCOM1", "LCOM2", "LCOM3", "LCOM4", "Co", "TCC", "LCC", "LCOM5"]

# The remaining metric columns are named FAMILY(VARIANT); every family is
# paired with every variant, family-major.
_METRIC_FAMILIES = ["AAD", "NC3", "LCSM", "LCOSM"]
_METRIC_VARIANTS = ["LSI-768-C", "LSI-768-NC", "D2V-768-C", "D2V-768-NC", "BERT"]

# Metric columns analysed by this notebook, in reporting order.
METRIC_NAMES = _PLAIN_METRICS + [
    f"{family}({variant})"
    for family in _METRIC_FAMILIES
    for variant in _METRIC_VARIANTS
]

# Column renames applied to the metrics CSV after loading (old name -> new name).
METRIC_RENAMES = dict(
    db_path="proj",
    filename="name",
    loc="LOC",
    lloc="LLOC",
    entities="Entities",
    commits="Commits",
)

In [4]:
# Input CSV files; each is loaded with pd.read_csv in a later cell.
SEQUENCES_CSV = "_fileranker/sequences.csv"
RESPONSES_CSV = "_fileranker/responses.csv"
METRICS_CSV = "_data/metrics_test_part.csv"
# Only responses whose "username" is one of these are kept.
USERNAMES = ["chipmunk", "jackal", "wildcat"]
# Only responses whose "sequence" equals this value are kept.
SEQUENCE = "testset-ldl"

In [5]:
# Load the file pairs, drop the file contents, and switch the project /
# filename columns to the a_* / b_* naming used in the rest of the notebook.
seq_df = (
    pd.read_csv(SEQUENCES_CSV)
    .drop(columns=["content_a", "content_b"])
    .rename(
        columns={
            "project_a": "a_proj",
            "project_b": "b_proj",
            "filename_a": "a_name",
            "filename_b": "b_name",
        }
    )
)
seq_df

Unnamed: 0,sequence,position,a_proj,b_proj,a_name,b_name
0,testset-ldl,0,_data/dbs/uPortal-Project/CalendarPortlet.db,_data/dbs/elastic/support-diagnostics.db,src/main/java/org/jasig/portlet/calendar/mvc/c...,src/main/java/co/elastic/support/diagnostics/c...
1,testset-ldl,1,_data/dbs/ExtraCells/ExtraCells2.db,_data/dbs/trinodb/tempto.db,src/main/scala/extracells/item/ItemPartECBase....,tempto-core/src/main/java/io/trino/tempto/fulf...
2,testset-ldl,2,_data/dbs/deliciousblackink/Derpibooru.db,_data/dbs/SpigotMC/BungeeCord.db,app/src/main/java/derpibooru/derpy/server/pars...,proxy/src/main/java/net/md_5/bungee/Encryption...
3,testset-ldl,3,_data/dbs/raeleus/skin-composer.db,_data/dbs/tarunsinghofficial/HacktoberFest.db,core/src/com/ray3k/skincomposer/dialog/sceneco...,Cipher/BEAUFORT.java
4,testset-ldl,4,_data/dbs/JPDSousa/mongo-obj-framework.db,_data/dbs/opennetworkinglab/onos.db,src/main/java/org/smof/parsers/metadata/TypeSt...,apps/k8s-node/app/src/main/java/org/onosprojec...
...,...,...,...,...,...,...
2395,testset-ldl,2395,_data/dbs/agonyforge/arbitrader.db,_data/dbs/hyperledger/web3j-cli.db,src/main/java/com/agonyforge/arbitrader/servic...,src/main/java/org/web3j/console/project/templa...
2396,testset-ldl,2396,_data/dbs/Valandur/Web-API.db,_data/dbs/apache/servicecomb-toolkit.db,webapi-sponge/src/main/java/valandur/webapi/in...,toolkit-maven-plugin/src/main/java/org/apache/...
2397,testset-ldl,2397,_data/dbs/piranhacloud/piranha.db,_data/dbs/fair-acc/chart-fx.db,arquillian/jarcontainer/src/main/java/cloud/pi...,chartfx-chart/src/main/java/io/fair_acc/chartf...
2398,testset-ldl,2398,_data/dbs/apache/streams.db,_data/dbs/opensrp/opensrp-server-core.db,streams-config/src/main/java/org/apache/stream...,src/main/java/org/opensrp/service/multimedia/B...


In [6]:
# Load all responses, then keep only the configured sequence and raters.
responses_raw = pd.read_csv(RESPONSES_CSV).drop(columns=["responded_on"])
keep_row = (responses_raw["sequence"] == SEQUENCE) & responses_raw["username"].isin(
    USERNAMES
)
# Take an explicit copy: the columns assigned below would otherwise be written
# onto a boolean-filtered slice of the loaded frame (SettingWithCopy hazard).
res_df = responses_raw.loc[keep_row].copy()

# "U" responses are treated as "E" from here on.
res_df["value"] = res_df["value"].replace("U", "E")
# Strict lookups on purpose: a response code missing from either map raises
# KeyError here instead of propagating as a missing value.
res_df["level"] = [LEVEL_VOTE_MAP[v] for v in res_df["value"]]
res_df["sign"] = [SIMPLE_VOTE_MAP[v] for v in res_df["value"]]
res_df

Unnamed: 0,sequence,position,username,value,level,sign
0,testset-ldl,0,chipmunk,WB,W,B
1,testset-ldl,0,jackal,SA,S,A
2,testset-ldl,0,wildcat,WB,W,B
3,testset-ldl,1,chipmunk,WB,W,B
4,testset-ldl,1,jackal,SB,S,B
...,...,...,...,...,...,...
895,testset-ldl,298,jackal,E,E,E
896,testset-ldl,298,wildcat,WA,W,A
897,testset-ldl,299,chipmunk,WA,W,A
898,testset-ldl,299,jackal,WB,W,B


In [7]:
# Raw response codes grouped per file pair; each group is reduced to a single
# agreed value (or None) under three increasingly strict agreement rules.
votes_by_position = res_df.groupby("position")["value"]

majority = votes_by_position.apply(lambda codes: to_majority(to_vote_counter(codes)))
noconflict = votes_by_position.apply(
    lambda codes: to_noconflict(to_vote_counter(codes))
)
unanimous = votes_by_position.apply(lambda codes: to_unanimous(to_vote_counter(codes)))

# Number of positions for which each rule produced a value.
print(f"Majority:    {majority.notna().sum()}")
print(f"No Conflict: {noconflict.notna().sum()}")
print(f"Unanimous:   {unanimous.notna().sum()}")

Majority:    251
No Conflict: 160
Unanimous:   74


In [8]:
# Load the per-file metrics and apply the METRIC_RENAMES column renames.
metrics_df = pd.read_csv(METRICS_CSV).rename(columns=METRIC_RENAMES)
metrics_df

Unnamed: 0,proj,name,LOC,LLOC,Entities,Commits,is_large,is_change_prone,is_ldl,Members,...,LCOSM(LSI-768-NC),LCOSM(D2V-10-C),LCOSM(D2V-10-NC),LCOSM(D2V-64-C),LCOSM(D2V-64-NC),LCOSM(D2V-256-C),LCOSM(D2V-256-NC),LCOSM(D2V-768-C),LCOSM(D2V-768-NC),LCOSM(BERT)
0,_data/dbs/0999312/TofuCraftReload.db,src/main/java/cn/mcmod/tofucraft/ClientProxy.java,92,82,8,12,False,True,False,5,...,0.400000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
1,_data/dbs/0999312/TofuCraftReload.db,src/main/java/cn/mcmod/tofucraft/CommonProxy.java,87,69,9,30,False,True,False,7,...,0.200000,0.0,0.066667,0.066667,0.066667,0.0,0.066667,0.066667,0.0,0.2
2,_data/dbs/0999312/TofuCraftReload.db,src/main/java/cn/mcmod/tofucraft/CreativeTabsT...,20,13,4,1,False,False,False,2,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
3,_data/dbs/0999312/TofuCraftReload.db,src/main/java/cn/mcmod/tofucraft/RecipeLoader....,170,157,9,7,True,True,True,7,...,0.200000,0.2,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
4,_data/dbs/0999312/TofuCraftReload.db,src/main/java/cn/mcmod/tofucraft/TofuConfig.java,25,20,5,2,False,False,False,3,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149894,_data/dbs/yomguy/servestream.db,src/net/sourceforge/servestream/transport/Tran...,158,107,9,14,False,True,False,7,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
149895,_data/dbs/yomguy/servestream.db,src/net/sourceforge/servestream/utils/BackupUt...,245,188,12,12,True,True,True,9,...,0.047619,0.0,0.047619,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
149896,_data/dbs/yomguy/servestream.db,src/net/sourceforge/servestream/utils/Preferen...,42,20,17,41,False,True,False,15,...,,,,,,,,,,
149897,_data/dbs/yomguy/servestream.db,src/net/sourceforge/servestream/utils/UriBeanL...,152,64,11,4,False,False,False,9,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0


In [9]:
# One row per response, carrying the details of the file pair it refers to.
# A right join keeps every response row even without a matching sequence row.
df = pd.merge(
    left=seq_df,
    right=res_df,
    how="right",
    on=["sequence", "position"],
)
df

Unnamed: 0,sequence,position,a_proj,b_proj,a_name,b_name,username,value,level,sign
0,testset-ldl,0,_data/dbs/uPortal-Project/CalendarPortlet.db,_data/dbs/elastic/support-diagnostics.db,src/main/java/org/jasig/portlet/calendar/mvc/c...,src/main/java/co/elastic/support/diagnostics/c...,chipmunk,WB,W,B
1,testset-ldl,0,_data/dbs/uPortal-Project/CalendarPortlet.db,_data/dbs/elastic/support-diagnostics.db,src/main/java/org/jasig/portlet/calendar/mvc/c...,src/main/java/co/elastic/support/diagnostics/c...,jackal,SA,S,A
2,testset-ldl,0,_data/dbs/uPortal-Project/CalendarPortlet.db,_data/dbs/elastic/support-diagnostics.db,src/main/java/org/jasig/portlet/calendar/mvc/c...,src/main/java/co/elastic/support/diagnostics/c...,wildcat,WB,W,B
3,testset-ldl,1,_data/dbs/ExtraCells/ExtraCells2.db,_data/dbs/trinodb/tempto.db,src/main/scala/extracells/item/ItemPartECBase....,tempto-core/src/main/java/io/trino/tempto/fulf...,chipmunk,WB,W,B
4,testset-ldl,1,_data/dbs/ExtraCells/ExtraCells2.db,_data/dbs/trinodb/tempto.db,src/main/scala/extracells/item/ItemPartECBase....,tempto-core/src/main/java/io/trino/tempto/fulf...,jackal,SB,S,B
...,...,...,...,...,...,...,...,...,...,...
895,testset-ldl,298,_data/dbs/open-telemetry/opentelemetry-java.db,_data/dbs/jd-opensource/vtdriver.db,exporters/prometheus/src/main/java/io/opentele...,src/main/java/com/jd/jdbc/context/VtCancelCont...,jackal,E,E,E
896,testset-ldl,298,_data/dbs/open-telemetry/opentelemetry-java.db,_data/dbs/jd-opensource/vtdriver.db,exporters/prometheus/src/main/java/io/opentele...,src/main/java/com/jd/jdbc/context/VtCancelCont...,wildcat,WA,W,A
897,testset-ldl,299,_data/dbs/LinuxForHealth/hl7v2-fhir-converter.db,_data/dbs/rebasing-xyz/rebot.db,src/main/java/io/github/linuxforhealth/hl7/HL7...,rebot-plugins/rebot-packt-free-learning-plugin...,chipmunk,WA,W,A
898,testset-ldl,299,_data/dbs/LinuxForHealth/hl7v2-fhir-converter.db,_data/dbs/rebasing-xyz/rebot.db,src/main/java/io/github/linuxforhealth/hl7/HL7...,rebot-plugins/rebot-packt-free-learning-plugin...,jackal,WB,W,B


In [10]:
def _merge_side_metrics(side: str) -> pd.DataFrame:
    """Left-join ``metrics_df`` onto ``df`` for one file of each pair.

    Args:
        side: ``"a"`` or ``"b"``; selects the ``<side>_proj`` / ``<side>_name``
            key columns of ``df``.

    Returns:
        ``df`` with the metric columns of the selected file appended.

    Raises:
        ValueError: If the join changed the number of rows.
    """
    merged = df.merge(
        metrics_df,
        how="left",
        left_on=[f"{side}_proj", f"{side}_name"],
        right_on=["proj", "name"],
    )
    # Later cells combine these frames with ``df`` row by row, so the join must
    # not add rows; that happens when a (proj, name) key occurs more than once
    # in ``metrics_df``. Fail here rather than mis-align silently.
    if len(merged) != len(df):
        raise ValueError(
            f"merging metrics for side {side!r} changed the row count "
            f"from {len(df)} to {len(merged)}"
        )
    return merged


a_metrics_df = _merge_side_metrics("a")
b_metrics_df = _merge_side_metrics("b")


def get_vote_diffs(metric: str, vote: str = "A") -> pd.Series:
    """Return, per response row, a metric difference oriented towards ``vote``.

    Larger values indicate a stronger preference for the given vote.

    Example: file A has an LCOM1 of 3 and file B has an LCOM1 of 7, so LCOM1
    believes file B is less cohesive. With ``vote == "A"`` the result is
    positive (b - a = 7 - 3 = 4).

    Args:
        metric: Name of a column of ``a_metrics_df`` and ``b_metrics_df``.
        vote: ``"A"`` yields ``b - a``, ``"B"`` yields ``a - b`` and ``"E"``
            yields ``-|a - b|`` (largest, i.e. zero, when both files score
            the same).

    Returns:
        The element-wise difference between the two files' metric values.

    Raises:
        KeyError: If ``metric`` is not a column of the metric frames.
        ValueError: If ``vote`` is not one of ``"A"``, ``"B"`` or ``"E"``.
    """
    a = a_metrics_df[metric]
    b = b_metrics_df[metric]
    if vote == "A":
        return b - a
    if vote == "B":
        return a - b
    if vote == "E":
        return -((a - b).abs())
    raise ValueError(f"vote must be 'A', 'B' or 'E', got {vote!r}")


# True for each response row in which at least one METRIC_NAMES column is
# missing for file A or for file B.
has_nan_metric = (
    a_metrics_df[METRIC_NAMES].isna().any(axis=1)
    | b_metrics_df[METRIC_NAMES].isna().any(axis=1)
)

In [11]:
# Encode the response codes as integers 0..4, ordered SB < WB < E < WA < SA.
ordinal_map = {"SB": 0, "WB": 1, "E": 2, "WA": 3, "SA": 4}
# Strict lookup: an unmapped code raises KeyError here. With .get() it would be
# stored as None and only surface later as an obscure model-fitting failure.
df["ordinal"] = [ordinal_map[v] for v in df["value"]]

# Replace-or-add one column per metric holding the b - a difference
# (get_vote_diffs default vote "A"). Values are assigned by position.
for metric in METRIC_NAMES:
    df[metric] = get_vote_diffs(metric).to_numpy()

In [12]:
# Response rows for which every metric is available on both files.
complete_rows = df[~has_nan_metric]
print(len(complete_rows))
complete_rows

621


Unnamed: 0,sequence,position,a_proj,b_proj,a_name,b_name,username,value,level,sign,...,LCSM(LSI-768-C),LCSM(LSI-768-NC),LCSM(D2V-768-C),LCSM(D2V-768-NC),LCSM(BERT),LCOSM(LSI-768-C),LCOSM(LSI-768-NC),LCOSM(D2V-768-C),LCOSM(D2V-768-NC),LCOSM(BERT)
0,testset-ldl,0,_data/dbs/uPortal-Project/CalendarPortlet.db,_data/dbs/elastic/support-diagnostics.db,src/main/java/org/jasig/portlet/calendar/mvc/c...,src/main/java/co/elastic/support/diagnostics/c...,chipmunk,WB,W,B,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,testset-ldl,0,_data/dbs/uPortal-Project/CalendarPortlet.db,_data/dbs/elastic/support-diagnostics.db,src/main/java/org/jasig/portlet/calendar/mvc/c...,src/main/java/co/elastic/support/diagnostics/c...,jackal,SA,S,A,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,testset-ldl,0,_data/dbs/uPortal-Project/CalendarPortlet.db,_data/dbs/elastic/support-diagnostics.db,src/main/java/org/jasig/portlet/calendar/mvc/c...,src/main/java/co/elastic/support/diagnostics/c...,wildcat,WB,W,B,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
18,testset-ldl,6,_data/dbs/tebexio/BuycraftX.db,_data/dbs/rockbite/talos.db,common/src/main/java/net/buycraft/plugin/data/...,editor/src/com/talosvfx/talos/editor/utils/Cam...,chipmunk,WB,W,B,...,8.0,8.0,0.0,0.0,0.0,0.285714,0.285714,0.0,0.0,0.0
19,testset-ldl,6,_data/dbs/tebexio/BuycraftX.db,_data/dbs/rockbite/talos.db,common/src/main/java/net/buycraft/plugin/data/...,editor/src/com/talosvfx/talos/editor/utils/Cam...,jackal,E,E,E,...,8.0,8.0,0.0,0.0,0.0,0.285714,0.285714,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,testset-ldl,298,_data/dbs/open-telemetry/opentelemetry-java.db,_data/dbs/jd-opensource/vtdriver.db,exporters/prometheus/src/main/java/io/opentele...,src/main/java/com/jd/jdbc/context/VtCancelCont...,jackal,E,E,E,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
896,testset-ldl,298,_data/dbs/open-telemetry/opentelemetry-java.db,_data/dbs/jd-opensource/vtdriver.db,exporters/prometheus/src/main/java/io/opentele...,src/main/java/com/jd/jdbc/context/VtCancelCont...,wildcat,WA,W,A,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
897,testset-ldl,299,_data/dbs/LinuxForHealth/hl7v2-fhir-converter.db,_data/dbs/rebasing-xyz/rebot.db,src/main/java/io/github/linuxforhealth/hl7/HL7...,rebot-plugins/rebot-packt-free-learning-plugin...,chipmunk,WA,W,A,...,4.0,4.0,0.0,0.0,4.0,0.400000,0.400000,0.0,0.0,0.4
898,testset-ldl,299,_data/dbs/LinuxForHealth/hl7v2-fhir-converter.db,_data/dbs/rebasing-xyz/rebot.db,src/main/java/io/github/linuxforhealth/hl7/HL7...,rebot-plugins/rebot-packt-free-learning-plugin...,jackal,WB,W,B,...,4.0,4.0,0.0,0.0,4.0,0.400000,0.400000,0.0,0.0,0.4


In [13]:
# The usable rows and the response vector are the same for every metric, so
# build them once instead of re-filtering ``df`` on each iteration.
complete_df = df[~has_nan_metric]
y = complete_df["ordinal"]

# NOTE(review): the "CI" reported below is the LR statistic shifted by a fixed
# amount -- the square root of the 0.975 chi-square quantile with 1 degree of
# freedom (about 2.24) -- which is identical for every metric and does not
# depend on the fitted model. Confirm this is the intended interval before
# reporting it.
ci_half_width = chi2.ppf(0.975, df=1) ** 0.5

results = []

for metric in METRIC_NAMES:
    print(f"Running on {metric}...")
    X = complete_df[[metric]]

    # Center and scale the metric. A metric that is constant over these rows
    # has a standard deviation of 0 and would turn into an all-NaN column.
    X_scaled = (X - X.mean()) / X.std()

    # Fit the ordinal logistic regression model; the scaled metric is the only
    # regressor passed in (no constant column is added).
    model = OrderedModel(y, X_scaled, distr="logit")
    res = model.fit(method="bfgs", disp=True)
    lr_stat = res.llr
    lr_p_value = res.llr_pvalue
    lower_ci = lr_stat - ci_half_width
    upper_ci = lr_stat + ci_half_width

    results.append(
        {
            "Metric": metric,
            "LR Test": lr_stat,
            "LR Test CI Lower": lower_ci,
            "LR Test CI Upper": upper_ci,
            "LR Test (p-value)": lr_p_value,
        }
    )

results_df = pd.DataFrame(results)
results_df

Running on CDI...
Optimization terminated successfully.
         Current function value: 1.214843
         Iterations: 16
         Function evaluations: 18
         Gradient evaluations: 18
Running on LCOM1...
Optimization terminated successfully.
         Current function value: 1.228413
         Iterations: 17
         Function evaluations: 19
         Gradient evaluations: 19
Running on LCOM2...
Optimization terminated successfully.
         Current function value: 1.230778
         Iterations: 4
         Function evaluations: 6
         Gradient evaluations: 6
Running on LCOM3...
Optimization terminated successfully.
         Current function value: 1.229725
         Iterations: 14
         Function evaluations: 16
         Gradient evaluations: 16
Running on LCOM4...
Optimization terminated successfully.
         Current function value: 1.230641
         Iterations: 9
         Function evaluations: 11
         Gradient evaluations: 11
Running on Co...
Optimization terminated succe

Unnamed: 0,Metric,LR Test,LR Test CI Lower,LR Test CI Upper,LR Test (p-value)
0,CDI,19.796666,17.555263,22.038069,9e-06
1,LCOM1,2.94285,0.701447,5.184253,0.086258
2,LCOM2,0.004618,-2.236784,2.246021,0.945818
3,LCOM3,1.312325,-0.929078,3.553727,0.251974
4,LCOM4,0.175152,-2.066251,2.416555,0.675573
5,Co,0.832612,-1.40879,3.074015,0.361518
6,TCC,0.011031,-2.230372,2.252433,0.916354
7,LCC,0.061306,-2.180097,2.302708,0.804444
8,LCOM5,9.03825,6.796848,11.279653,0.002644
9,AAD(LSI-768-C),13.692542,11.45114,15.933945,0.000215


In [14]:
# Persist the per-metric test results, without the index column.
RESULTS_CSV = "_data/results.csv"
results_df.to_csv(RESULTS_CSV, index=False)