In [14]:
from typing import List, Dict
import tiktoken
import json
import pandas as pd

def get_context_length(entry: Dict, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in ``entry["context_str"]``.

    Args:
        entry: Mapping that holds the text to measure under "context_str".
        encoding_name: Name of the tiktoken encoding used to tokenize the text.

    Returns:
        Length of the token sequence produced by the chosen encoding.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(entry["context_str"])
    return len(tokens)


def get_source_types(entry: Dict) -> List[str]:
    """Collect the "index" value of every item in ``entry["context"]``.

    Args:
        entry: Mapping whose "context" value is an iterable of mappings, each
            carrying an "index" key.

    Returns:
        The "index" values, in the order the context items appear.
    """
    collected = []
    for item in entry["context"]:
        collected.append(item["index"])
    return collected


def _is_positive_verdict(is_dependency) -> bool:
    """Interpret the ``isDependency`` value of a parsed response as a boolean.

    A response may carry the verdict as a string instead of a JSON boolean.
    Any non-empty string is truthy in Python, so the string "false" (any case)
    is mapped to False explicitly; every other value keeps its truthiness.
    """
    if isinstance(is_dependency, str) and is_dependency.strip().lower() == "false":
        return False
    return bool(is_dependency)


def _classify_response(is_dependency, rating):
    """Return "TP", "FP", "TN" or "FN" for a verdict/rating pair.

    The rating is compared case-insensitively via ``str(rating).lower()``.
    A "borderline" rating counts as correct for either verdict (TP when the
    verdict is positive, TN otherwise). Returns None when the rating is none
    of "true", "false" or "borderline".
    """
    label = str(rating).lower()
    predicted = _is_positive_verdict(is_dependency)

    if label == "borderline":
        return "TP" if predicted else "TN"
    if label == "true":
        # Rated correct: confirming it is a TP, rejecting it is a FN.
        return "TP" if predicted else "FN"
    if label == "false":
        # Rated incorrect: confirming it is a FP, rejecting it is a TN.
        return "FP" if predicted else "TN"
    return None


def get_df_stats(model_name: str, config_str: str) -> pd.DataFrame:
    """Combine one model's validation responses with the baseline ratings.

    Reads ``../data/results/{config_str}/all_dependencies_all_{model_name}.json``
    and ``../data/results/all_dependencies.csv``, walks both in parallel and
    labels every row by comparing the response's ``isDependency`` value with
    the row's ``final_rating``.

    Rows that cannot be labelled get the string "None" in the response
    columns and are included in the printed skip count. That covers a response
    that is not valid JSON, is not a JSON object, misses one of the expected
    keys, or is absent, as well as a ``final_rating`` that is none of
    true/false/borderline.

    Args:
        model_name: Model identifier used in the result file name.
        config_str: Name of the results sub-directory to read from.

    Returns:
        The baseline DataFrame with the added columns ``response_rating``,
        ``classification``, ``plan``, ``rationale``, ``uncertainty``,
        ``context_length`` and ``source_types``.
    """
    data_file = f"../data/results/{config_str}/all_dependencies_all_{model_name}.json"
    baseline_file = "../data/results/all_dependencies.csv"

    with open(data_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    df = pd.read_csv(baseline_file)

    response_rating = []
    classification = []
    rationale = []
    plan = []
    uncertainty = []
    context_length = []
    source_types = []
    skipped = 0

    for entry, (index, row) in zip(data, df.iterrows()):

        assert entry["index"] == index, (
            f"Entry index {entry['index']} does not match baseline row {index}"
        )

        # Context statistics do not depend on the response, so they are
        # recorded for every row, including skipped ones.
        context_length.append(get_context_length(entry=entry))
        source_types.append(get_source_types(entry=entry))

        try:
            if "responses" in entry:
                response = entry["responses"][0]
            else:
                response = entry["response"]
            response_dict = json.loads(response, strict=False)
            is_dependency = response_dict["isDependency"]
            plan_str = response_dict["plan"]
            rationale_str = response_dict["rationale"]
            uncertainty_str = response_dict["uncertainty"]
        except (json.JSONDecodeError, KeyError, TypeError, IndexError):
            # TypeError: response is None or the JSON payload is not an object.
            # IndexError: the "responses" list is empty.
            label = None
        else:
            label = _classify_response(is_dependency, row["final_rating"])

        if label is None:
            # Always append exactly one value per list so every list stays
            # aligned with the rows of df.
            response_rating.append("None")
            classification.append("None")
            plan.append("None")
            rationale.append("None")
            uncertainty.append("None")
            skipped += 1
            continue

        response_rating.append(str(is_dependency))
        classification.append(label)
        plan.append(plan_str)
        rationale.append(rationale_str)
        uncertainty.append(uncertainty_str)

    print(f"Skipped {skipped} entries in file {data_file}.")
    df["response_rating"] = response_rating
    df["classification"] = classification
    df["plan"] = plan
    df["rationale"] = rationale
    df["uncertainty"] = uncertainty
    df["context_length"] = context_length
    df["source_types"] = source_types

    return df

# Build the per-model analysis table and store it as one CSV per model.
config_str = "config4"
model_names = [
    "gpt-3.5-turbo-0125",
    "gpt-4o-2024-05-13",
    "llama3:8b",
    "llama3:70b",
    "llama3.1:8b",
]  # "llama3.1:70b"

for name in model_names:
    df = get_df_stats(config_str=config_str, model_name=name)
    print(df.columns)
    df.to_csv(f"../data/analysis/{config_str}/{name}.csv", index=False)

Skipped 0 entries in file ../data/results/config4/all_dependencies_all_gpt-3.5-turbo-0125.json.
Index(['dependency_category', 'link_str', 'project', 'option_name',
       'option_value', 'option_type', 'option_file', 'option_technology',
       'dependent_option_name', 'dependent_option_value',
       'dependent_option_type', 'dependent_option_file',
       'dependent_option_technology', 'config_type', 'rating', 'final_rating',
       'final_category', 'category', 'explanation', 'response_rating',
       'classification', 'plan', 'rationale', 'uncertainty', 'context_length',
       'source_types'],
      dtype='object')
Skipped 0 entries in file ../data/results/config4/all_dependencies_all_gpt-4o-2024-05-13.json.
Index(['dependency_category', 'link_str', 'project', 'option_name',
       'option_value', 'option_type', 'option_file', 'option_technology',
       'dependent_option_name', 'dependent_option_value',
       'dependent_option_type', 'dependent_option_file',
       'dependent_op

In [3]:
import pandas as pd

# Add one "<model>_classification" column per model to the baseline table
# and write the combined table to all.csv.
model_names = [
    'gpt-4o-2024-05-13',
    'gpt-3.5-turbo-0125',
    'llama3:70b',
    'llama3:8b',
    'llama3.1:8b',
]  # 'llama3.1:70b'

df_base = pd.read_csv("../data/results/all_dependencies.csv")

for name in model_names:
    file_name = f"../data/analysis/{config_str}/{name}.csv"
    df_model = pd.read_csv(file_name)
    classification = df_model["classification"].to_list()
    df_base[f"{name}_classification"] = classification

df_base.to_csv(f"../data/analysis/{config_str}/all.csv", index=False)

print(df_base.shape[0])


500


In [4]:
import pandas as pd

df = pd.read_csv(f"../data/analysis/{config_str}/all.csv")

# Classification columns of the models under comparison
model_columns = [
    'gpt-4o-2024-05-13_classification',
    'gpt-3.5-turbo-0125_classification',
    'llama3:70b_classification',
    'llama3:8b_classification',
    'llama3.1:8b_classification',
]  # 'llama3.1:70b_classification'

# Per row: how many models were wrong (FP/FN) and how many were right (TP/TN)
model_classifications = df[model_columns]
df['FP_FN_count'] = model_classifications.isin(['FP', 'FN']).sum(axis=1)
df['TP_TN_count'] = model_classifications.isin(['TP', 'TN']).sum(axis=1)

# Write the table back with the two count columns added
df.to_csv(f'../data/analysis/{config_str}/all.csv', index=False)

In [5]:
import pandas as pd
from collections import Counter

df_all = pd.read_csv(f'../data/analysis/{config_str}/all.csv')

# Histogram of how many models failed / succeeded on each row
failure_counter = Counter(df_all["FP_FN_count"].to_list())
success_counter = Counter(df_all["TP_TN_count"].to_list())

print(len(df_all))

print("Failure counter: ", failure_counter)
print("Success counter: ", success_counter)

500
Failure counter:  Counter({1: 137, 2: 133, 0: 112, 3: 72, 4: 39, 5: 7})
Success counter:  Counter({3: 146, 4: 141, 2: 87, 5: 66, 1: 48, 0: 12})


**Check if failures are identical across RAG variants**

In [6]:
import pandas as pd

df = pd.read_csv(f"../data/analysis/{config_str}/all.csv")

# Classification columns of the models under comparison
model_columns = [
    'gpt-4o-2024-05-13_classification',
    'gpt-3.5-turbo-0125_classification',
    'llama3:70b_classification',
    'llama3:8b_classification',
    'llama3.1:8b_classification',
]  # 'llama3.1:70b_classification'

for column in model_columns:
    rating_counts = df[column].value_counts()

    print(column)
    print(rating_counts)
    # Total number of rows that carry a classification in this column
    print(sum(rating_counts.tolist()))

gpt-4o-2024-05-13_classification
gpt-4o-2024-05-13_classification
TN    269
TP    115
FN     88
FP     28
Name: count, dtype: int64
500
gpt-3.5-turbo-0125_classification
gpt-3.5-turbo-0125_classification
TN    170
TP    144
FP    127
FN     59
Name: count, dtype: int64
500
llama3:70b_classification
llama3:70b_classification
TN    189
TP    149
FP    108
FN     54
Name: count, dtype: int64
500
llama3:8b_classification
llama3:8b_classification
FP    139
TP    133
TN     96
FN     27
Name: count, dtype: int64
395
llama3.1:8b_classification
llama3.1:8b_classification
TN    204
FN    104
TP     85
FP     76
Name: count, dtype: int64
469


In [7]:
import pandas as pd
from collections import Counter

baseline_file = "../data/results/all_dependencies.csv"

df = pd.read_csv(baseline_file)

# Lower-case the category labels before counting them
categories = [category.lower() for category in df["final_category"].to_list()]

category_counter = Counter(categories)

category_counter

Counter({'maven dependency': 91,
         'project inheritance': 82,
         'boolean': 57,
         'others': 48,
         'version numbers': 44,
         'generic names': 29,
         'maven properties': 22,
         'independent components': 20,
         'datasource': 19,
         'port': 16,
         'maven plugin': 15,
         'project structure': 11,
         'packaging format': 11,
         'project aggregation': 10,
         'module as dependency': 10,
         'maven plugins': 9,
         'integers': 6})

In [8]:
from typing import List
import pandas as pd

def get_category_stats(category_names: List, model_name: str):
    """Print a TP/TN/FP/FN breakdown per dependency category for one model.

    Reads ``../data/analysis/{config_str}/{model_name}.csv`` (``config_str``
    is taken from the enclosing notebook scope), selects the rows of each
    category by comparing the lower-cased ``final_category`` column with the
    category name, and counts the values of ``classification``. A label that
    does not occur in a category is shown as "-".

    Args:
        category_names: Category names to report, compared against the
            lower-cased ``final_category`` values.
        model_name: Model identifier used in the analysis file name.
    """
    df_model = pd.read_csv(f"../data/analysis/{config_str}/{model_name}.csv")
    lowered_category = df_model['final_category'].str.lower()

    # One result column per label, filled in category order
    counts_by_label = {"TP": [], "TN": [], "FP": [], "FN": []}

    for category_name in category_names:
        df_category = df_model[lowered_category == str(category_name)]
        rating_counts = df_category['classification'].value_counts().to_dict()

        for label, column in counts_by_label.items():
            column.append(rating_counts.get(label, "-"))

    summary = pd.DataFrame({"category_name": category_names, **counts_by_label})

    print(summary)


# Print the per-category breakdown for every model
category_names = list(category_counter)
model_names = [
    'gpt-4o-2024-05-13',
    'gpt-3.5-turbo-0125',
    'llama3:70b',
    'llama3:8b',
    'llama3.1:8b',
]  # 'llama3.1:70b'

for model_name in model_names:
    print("Model: ", model_name)
    get_category_stats(model_name=model_name, category_names=category_names)


Model:  gpt-4o-2024-05-13
             category_name  TP  TN FP  FN
0          version numbers   6  10  9  19
1                  boolean   -  57  -   -
2      project inheritance  32  11  5  34
3         maven properties  18   -  -   4
4                   others   8  36  3   1
5      project aggregation  10   -  -   -
6         maven dependency  18  51  3  19
7        project structure   5   3  -   3
8            maven plugins   1   6  1   1
9             maven plugin   1  11  -   3
10                    port   5   8  3   -
11    module as dependency   9   -  -   1
12           generic names   -  27  2   -
13        packaging format   -  11  -   -
14              datasource   2  14  -   3
15  independent components   -  19  1   -
16                integers   -   5  1   -
Model:  gpt-3.5-turbo-0125
             category_name  TP  TN  FP  FN
0          version numbers  19   4  15   6
1                  boolean   -  53   4   -
2      project inheritance  41  11   5  25
3         maven pro

**Which technologies are involved in dependencies that are incorrectly validated?**

In [9]:
from collections import Counter
import pandas as pd

model_names = [
    'gpt-4o-2024-05-13',
    'gpt-3.5-turbo-0125',
    'llama3:70b',
    'llama3:8b',
    'llama3.1:8b',
]  # 'llama3.1:70b'

data = []

for model_name in model_names:
    print("Model: ", model_name)
    df_model = pd.read_csv(f"../data/analysis/{config_str}/{model_name}.csv")

    # Rows this model classified incorrectly
    is_failure = df_model["classification"].isin(["FP", "FN"])
    df_failures = df_model[is_failure]

    # Count the failures by the value of the option_technology column
    counter = Counter(df_failures["option_technology"].to_list())

    print("Technologies Failures: ", counter)

    # One table row per model: the model name followed by its counts
    model_data = {"model_name": model_name}
    model_data.update(counter)
    data.append(model_data)

df = pd.DataFrame(data)
df


Model:  gpt-4o-2024-05-13
Technologies Failures:  Counter({'maven': 101, 'spring': 7, 'docker-compose': 5, 'nodejs': 3})
Model:  gpt-3.5-turbo-0125
Technologies Failures:  Counter({'maven': 118, 'spring': 30, 'docker-compose': 23, 'nodejs': 8, 'docker': 7})
Model:  llama3:70b
Technologies Failures:  Counter({'maven': 117, 'docker-compose': 18, 'spring': 15, 'nodejs': 6, 'docker': 5, 'tsconfig': 1})
Model:  llama3:8b
Technologies Failures:  Counter({'maven': 103, 'spring': 27, 'docker-compose': 16, 'nodejs': 9, 'docker': 7, 'tsconfig': 4})
Model:  llama3.1:8b
Technologies Failures:  Counter({'maven': 141, 'docker-compose': 17, 'spring': 14, 'nodejs': 6, 'docker': 2})


Unnamed: 0,model_name,maven,spring,nodejs,docker-compose,docker,tsconfig
0,gpt-4o-2024-05-13,101,7,3,5,,
1,gpt-3.5-turbo-0125,118,30,8,23,7.0,
2,llama3:70b,117,15,6,18,5.0,1.0
3,llama3:8b,103,27,9,16,7.0,4.0
4,llama3.1:8b,141,14,6,17,2.0,


**Distribution of Intra- and Cross-Technology Failures**

In [10]:
import pandas as pd

model_names = [
    'gpt-4o-2024-05-13',
    'gpt-3.5-turbo-0125',
    'llama3:70b',
    'llama3:8b',
    'llama3.1:8b',
]  # 'llama3.1:70b'

for model_name in model_names:
    print("Model: ", model_name)
    df_model = pd.read_csv(f"../data/analysis/{config_str}/{model_name}.csv")

    # Rows this model classified incorrectly
    df_failures = df_model[df_model["classification"].isin(["FP", "FN"])]

    technologies_a = df_failures["option_technology"].to_list()
    technologies_b = df_failures["dependent_option_technology"].to_list()

    # A failure is "intra" when both options name the same technology
    # (case-insensitive), otherwise it is "cross".
    same_technology = [
        first.lower() == second.lower()
        for first, second in zip(technologies_a, technologies_b)
    ]
    intra = sum(same_technology)
    cross = len(same_technology) - intra

    print("Num intra-technology failures: ", intra)
    print("Num cross-technology failures: ", cross)


Model:  gpt-4o-2024-05-13
Num intra-technology failures:  116
Num cross-technology failures:  0
Model:  gpt-3.5-turbo-0125
Num intra-technology failures:  179
Num cross-technology failures:  7
Model:  llama3:70b
Num intra-technology failures:  157
Num cross-technology failures:  5
Model:  llama3:8b
Num intra-technology failures:  157
Num cross-technology failures:  9
Model:  llama3.1:8b
Num intra-technology failures:  172
Num cross-technology failures:  8


**Context length of correctly and incorrectly classified dependencies**

In [29]:
import pandas as pd

model_names = ['gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:70b', 'llama3:8b', 'llama3.1:8b'] # 'llama3.1:70b'
config_str = "config1"

# Start from an empty list. Without this the cell appends to whatever `data`
# an earlier cell left in the kernel (mixing unrelated rows into the table
# below) or fails with NameError on a fresh kernel.
data = []


def _average_context_length(lengths):
    """Return the mean of ``lengths`` rounded to two decimals.

    Returns NaN for an empty list instead of dividing by zero, so a model
    without rows in one of the two groups does not abort the cell.
    """
    if not lengths:
        return float("nan")
    return round(int(sum(lengths)) / len(lengths), 2)


for model_name in model_names:
    print("Model: ", model_name)
    file_name = f"../data/analysis/{config_str}/{model_name}.csv"

    df_model = pd.read_csv(file_name)
    # Incorrect (FP/FN) versus correct (TP/TN) classifications
    df_false = df_model[df_model["classification"].isin(["FP", "FN"])]
    df_true = df_model[df_model["classification"].isin(["TP", "TN"])]

    context_length_false = df_false["context_length"].to_list()
    context_length_true = df_true["context_length"].to_list()

    context_length_false_avg = _average_context_length(context_length_false)
    context_length_true_avg = _average_context_length(context_length_true)

    data.append({
        "model_name": model_name,
        "context_length_false": context_length_false_avg,
        "context_length_true": context_length_true_avg
    })


df = pd.DataFrame(data)
df

Model:  gpt-4o-2024-05-13
Sum context length (FP and FN):  1387.22
116
Avg. context length (TP and TN):  1361.08
384
Model:  gpt-3.5-turbo-0125
Sum context length (FP and FN):  1375.94
186
Avg. context length (TP and TN):  1361.94
314
Model:  llama3:70b
Sum context length (FP and FN):  1372.69
162
Avg. context length (TP and TN):  1364.49
338
Model:  llama3:8b
Sum context length (FP and FN):  1368.95
166
Avg. context length (TP and TN):  1364.38
229
Model:  llama3.1:8b
Sum context length (FP and FN):  1380.88
180
Avg. context length (TP and TN):  1364.31
289


Unnamed: 0,model_name,context_length_false,context_length_true
0,gpt-4o-2024-05-13,1387.22,1361.08
1,gpt-3.5-turbo-0125,1375.94,1361.94
2,llama3:70b,1372.69,1364.49
3,llama3:8b,1368.95,1364.38
4,llama3.1:8b,1380.88,1364.31


**Distribution of source types**

In [35]:
from collections import Counter
import pandas as pd
import ast

model_names = [
    'gpt-4o-2024-05-13',
    'gpt-3.5-turbo-0125',
    'llama3:70b',
    'llama3:8b',
    'llama3.1:8b',
]  # 'llama3.1:70b'
config_str = "config1"


def _source_type_stats(frame):
    """Count the source types listed in ``frame["source_types"]``.

    Each cell holds the string form of a list and is parsed with
    ``ast.literal_eval``. Returns the Counter over all listed source types
    together with each type's share of the total count.
    """
    parsed = [ast.literal_eval(item) for item in frame["source_types"].to_list()]
    counter = Counter(source for sources in parsed for source in sources)
    total = sum(counter.values())
    distribution = {k: v / total for k, v in counter.items()}
    return counter, distribution


for model_name in model_names:
    print("Config: ", config_str)
    print("Model: ", model_name)

    df_model = pd.read_csv(f"../data/analysis/{config_str}/{model_name}.csv")

    # Incorrect (FP/FN) versus correct (TP/TN) classifications
    df_false = df_model[df_model["classification"].isin(["FP", "FN"])]
    df_true = df_model[df_model["classification"].isin(["TP", "TN"])]

    counter_false, distribution_false = _source_type_stats(df_false)
    counter_true, distribution_true = _source_type_stats(df_true)

    # Show the counts and shares for both groups
    print("Source Types Counts False:", counter_false)
    print("Source Types Distribution False:", distribution_false)

    print("Source Types Counts True:", counter_true)
    print("Source Types Distribution true:", distribution_true)

    # Stop after the first model in the list
    break

#df = pd.DataFrame(data)
#df

Config:  config1
Model:  gpt-4o-2024-05-13
Source Types Counts False: Counter({'web-search': 426, 'tech-docs': 64, 'so-posts': 47, 'github': 18})
Source Types Distribution False: {'web-search': 0.7675675675675676, 'tech-docs': 0.11531531531531532, 'github': 0.032432432432432434, 'so-posts': 0.08468468468468468}
Average occurrences per data point False: {'web-search': 1.095115681233933, 'tech-docs': 0.16452442159383032, 'github': 0.04627249357326478, 'so-posts': 0.12082262210796915}
Source Types Counts True: Counter({'web-search': 1425, 'so-posts': 238, 'tech-docs': 223, 'github': 59})
Source Types Distribution true: {'github': 0.030334190231362468, 'so-posts': 0.12236503856041131, 'tech-docs': 0.11465295629820052, 'web-search': 0.7326478149100257}
Average occurrences per data point True: {'github': 0.15167095115681234, 'so-posts': 0.6118251928020566, 'tech-docs': 0.5732647814910026, 'web-search': 3.6632390745501286}
