In [1]:
from typing import List, Dict
import tiktoken
import json
import pandas as pd

def get_context_length(entry: Dict, encoding_name: str = "cl100k_base") -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(entry["context_str"]))
    return num_tokens


def get_source_types(entry: Dict) -> List[str]:
    context = entry["context"]
    source_types = [x["index"] for x in context]
    return source_types


def get_df_stats(model_name: str, config_str: str):

    data_file = f"../data/results/{config_str}/all_dependencies_all_{model_name}.json"
    baseline_file = "../data/results/all_dependencies.csv"

    with open(data_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    df = pd.read_csv(baseline_file)

    response_rating = []
    classification = []
    rationale = []
    plan = []
    uncertainty = []
    context_length = []
    source_types = []
    skipped = 0
    
    for entry, (index, row) in zip(data, df.iterrows()):

        assert entry["index"] == index
        
        response_dict = None
        isDependency = None

        context_length.append(get_context_length(entry=entry))
        source_types.append(get_source_types(entry=entry))

        try:
            if "responses" in entry:
                response = entry["responses"][0]
            else:
                response = entry["response"]
            response_dict = json.loads(response, strict=False)
            isDependency = response_dict["isDependency"]
            plan_str = response_dict["plan"]
            rationale_str = response_dict["rationale"]
            uncertainty_str = response_dict["uncertainty"]
        except (json.JSONDecodeError, KeyError):
            response_rating.append("None")
            classification.append("None")
            plan.append("None")
            rationale.append("None")
            uncertainty.append("None")
            skipped += 1
            continue

        rating = row["final_rating"]
        
        if str(rating).lower() == "borderline":
            if isDependency:
                response_rating.append(str(isDependency))
                classification.append("TP")
                plan.append(plan_str)
                rationale.append(rationale_str)
                uncertainty.append(uncertainty_str)
            if not isDependency:
                response_rating.append(str(isDependency))
                classification.append("TN")
                plan.append(plan_str)
                rationale.append(rationale_str)
                uncertainty.append(uncertainty_str)

        # FP: The LLM validates a dependency as correct, but the dependency is actually incorrect
        if isDependency and str(rating).lower() == "false":
            response_rating.append(str(isDependency))
            classification.append("FP")
            plan.append(plan_str)
            rationale.append(rationale_str)
            uncertainty.append(uncertainty_str)


        # FN: The LLM validates a dependency as incorrect, but the dependency is actually correct
        if not isDependency and  str(rating).lower() == "true":
            response_rating.append(str(isDependency))
            classification.append("FN")
            plan.append(plan_str)
            rationale.append(rationale_str)
            uncertainty.append(uncertainty_str)

        # TP: The LLM validates a dependency as correct and the dependency is correct
        if isDependency and str(rating).lower() == "true":
            response_rating.append(str(isDependency))
            classification.append("TP")
            plan.append(plan_str)
            rationale.append(rationale_str)
            uncertainty.append(uncertainty_str)

        # TN: The LLM validates a dependency as incorrect and the dependency is incorrect
        if not isDependency and str(rating).lower() == "false":
            response_rating.append(str(isDependency))
            classification.append("TN")
            plan.append(plan_str)
            rationale.append(rationale_str)
            uncertainty.append(uncertainty_str)


    print(f"Skipped {skipped} entries in file {data_file}.")
    df[f"response_rating"] = response_rating
    df["classification"] = classification
    df["plan"] = plan
    df["rationale"] = rationale
    df["uncertainty"] = uncertainty
    df["context_length"] = context_length
    df["source_types"] = source_types

    return df

model_names = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13", "llama3:8b", "llama3:70b", "llama3.1:8b"] # "llama3.1:70b"
config_str = "config2"

for name in model_names:
    df = get_df_stats(model_name=name, config_str=config_str)
    print(df.columns)
    df.to_csv(f"../data/analysis/{config_str}/{name}.csv", index=False)

Skipped 0 entries in file ../data/results/config2/all_dependencies_all_gpt-3.5-turbo-0125.json.
Index(['dependency_category', 'link_str', 'project', 'option_name',
       'option_value', 'option_type', 'option_file', 'option_technology',
       'dependent_option_name', 'dependent_option_value',
       'dependent_option_type', 'dependent_option_file',
       'dependent_option_technology', 'config_type', 'rating', 'final_rating',
       'final_category', 'sub_category', 'explanation', 'response_rating',
       'classification', 'plan', 'rationale', 'uncertainty', 'context_length',
       'source_types'],
      dtype='object')
Skipped 0 entries in file ../data/results/config2/all_dependencies_all_gpt-4o-2024-05-13.json.
Index(['dependency_category', 'link_str', 'project', 'option_name',
       'option_value', 'option_type', 'option_file', 'option_technology',
       'dependent_option_name', 'dependent_option_value',
       'dependent_option_type', 'dependent_option_file',
       'dependen

In [2]:
import pandas as pd

model_names = ['gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:70b', 'llama3:8b', 'llama3.1:8b'] # 'llama3.1:70b'

df_base = pd.read_csv("../data/results/all_dependencies.csv")

for name in model_names:
    file_name = f"../data/analysis/{config_str}/{name}.csv"
    df_model = pd.read_csv(file_name)

    classification = df_model["classification"].to_list()

    df_base[f"{name}_classification"] = classification


df_base.to_csv(f"../data/analysis/{config_str}/all.csv", index=False)

print(len(df_base))


500


In [1]:
from collections import Counter
import matplotlib.pyplot as plt

model_columns = ['gpt-4o-2024-05-13_classification', 'gpt-3.5-turbo-0125_classification', 'llama3:70b_classification', 'llama3:8b_classification']

failure_ids = set()


for name in model_names:
    file_name = f"../data/analysis/{config_str}/{name}.csv"
    df_model = pd.read_csv(file_name)

    for index, row in df_model.iterrows():
        if row["classification"] == "FP" or row["classification"] == "FN":
            failure_ids.add(index)


df_failures = df_base.iloc[list(failure_ids)]
df_failures['FP_FN_count'] = df_failures[model_columns].apply(lambda row: row.isin(['FP', 'FN']).sum(), axis=1)

print(len(df_failures))

df_failures.to_csv(f"../data/analysis/{config_str}/failures.csv", index=False)


categories = df_failures["final_category"].to_list()
sub_categories = df_failures["sub_category"].to_list()

category_counter = Counter(categories)
sub_category_counter = Counter(sub_categories)

print(category_counter)
print(sub_category_counter)

plt.figure(figsize=(25, 5)) 
plt.bar(category_counter.keys(), category_counter.values())
plt.show()

NameError: name 'model_names' is not defined

In [4]:
import pandas as pd

df = pd.read_csv(f"../data/analysis/{config_str}/all.csv")

# List of model columns to check
model_columns = ['gpt-4o-2024-05-13_classification', 'gpt-3.5-turbo-0125_classification', 'llama3:70b_classification', 'llama3:8b_classification'] # 'llama3.1:70b_classification'

# Create a boolean mask where True indicates the presence of 'FP' or 'FN' in any of the specified model columns
#mask = df[model_columns].apply(lambda x: x.isin(['FP', 'FN']), axis=1).any(axis=1)

# Filter the dataframe using the mask
#filtered_df_all = df[mask]

df['FP_FN_count'] = df[model_columns].apply(lambda row: row.isin(['FP', 'FN']).sum(), axis=1)
df['TP_TN_count'] = df[model_columns].apply(lambda row: row.isin(['TP', 'TN']).sum(), axis=1)

#sorted_df_all =  filtered_df_all.sort_values(by='FP_FN_count', ascending=False)

# Save the filtered dataframe if needed
df.to_csv(f'../data/analysis/{config_str}/all.csv', index=False)

In [5]:
import pandas as pd
from collections import Counter

df_all = pd.read_csv(f'../data/analysis/{config_str}/all.csv')

failure_count = df_all["FP_FN_count"].to_list()
success_count = df_all["TP_TN_count"].to_list()

failure_counter = Counter(failure_count)
success_counter = Counter(success_count)

print(len(df_all))

print("Failure counter: ", failure_counter)
print("Success counter: ", success_counter)

500
Failure counter:  Counter({1: 132, 0: 113, 2: 110, 3: 81, 4: 51, 5: 13})
Success counter:  Counter({4: 130, 3: 118, 2: 100, 5: 82, 1: 57, 0: 13})


**Check if failure are identical across RAG variants**

In [6]:
import pandas as pd

df = pd.read_csv(f"../data/analysis/{config_str}/all.csv")

# List of model columns to check
model_columns = ['gpt-4o-2024-05-13_classification', 'gpt-3.5-turbo-0125_classification', 'llama3:70b_classification', 'llama3:8b_classification', 'llama3.1:8b_classification'] # 'llama3.1:70b_classification'

for column in model_columns:
    rating_counts = df[column].value_counts()

    print(column)
    print(rating_counts)
    print(sum(k for k in rating_counts.to_dict().values()))

gpt-4o-2024-05-13_classification
gpt-4o-2024-05-13_classification
TN    267
TP    117
FN     86
FP     30
Name: count, dtype: int64
500
gpt-3.5-turbo-0125_classification
gpt-3.5-turbo-0125_classification
TN    160
TP    160
FP    137
FN     43
Name: count, dtype: int64
500
llama3:70b_classification
llama3:70b_classification
TN    193
TP    159
FP    104
FN     44
Name: count, dtype: int64
500
llama3:8b_classification
llama3:8b_classification
FP    170
TP    160
TN     63
FN     15
Name: count, dtype: int64
408
llama3.1:8b_classification
llama3.1:8b_classification
FP    172
TP    140
TN    122
FN     63
Name: count, dtype: int64
497


In [7]:
import pandas as pd
from collections import Counter

baseline_file = "../data/results/all_dependencies.csv"

df = pd.read_csv(baseline_file)

categories = df["final_category"].to_list()

categories = [x.lower() for x in categories]

category_counter = Counter(categories)

category_counter

Counter({'multi-maven-modules': 146,
         'libraries': 115,
         'boolean': 57,
         'others': 48,
         'version numbers': 44,
         'generic names': 29,
         'independent components': 20,
         'datasource': 19,
         'port': 16,
         'integers': 6})

In [8]:
from typing import List
import pandas as pd

def get_category_stats(category_names: List, model_name: str):

    file_name = f"../data/analysis/{config_str}/{model_name}.csv"
    df = pd.read_csv(file_name)

    tp = []
    fp = []
    tn = []
    fn = []

    for category_name in category_names:

        df_category = df[df['final_category'].str.lower() == str(category_name)]

        #print(df_category["classification"].unique())

        rating_counts = df_category['classification'].value_counts().to_dict()

        #print(type(rating_counts))
        #print(df_category)
        #print(rating_counts)

        tp.append(rating_counts["TP"] if "TP" in rating_counts else "-")
        tn.append(rating_counts["TN"] if "TN" in rating_counts else "-" )
        fp.append(rating_counts["FP"] if "FP" in rating_counts else "-")
        fn.append(rating_counts["FN"] if "FN" in rating_counts else "-")

    
    data = {
        "category_name": category_names,
        "TP": tp,
        "TN": tn,
        "FP": fp,
        "FN": fn
    }

    df = pd.DataFrame(data)

    print(df)

    #return df


model_names = ['gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:70b', 'llama3:8b', 'llama3.1:8b'] # 'llama3.1:70b'
category_names = list(category_counter.keys())

for model_name in model_names:
    print("Model: ", model_name)
    get_category_stats(category_names=category_names, model_name=model_name)


Model:  gpt-4o-2024-05-13
            category_name  TP  TN FP  FN
0         version numbers   8  11  8  17
1                 boolean   -  56  1   -
2     multi-maven-modules  74  25  5  42
3                  others   8  36  3   1
4               libraries  22  67  5  21
5                    port   4   5  6   1
6           generic names   -  27  2   -
7              datasource   1  14  -   4
8  independent components   -  20  -   -
9                integers   -   6  -   -
Model:  gpt-3.5-turbo-0125
            category_name  TP  TN  FP  FN
0         version numbers  22   4  15   3
1                 boolean   -  54   3   -
2     multi-maven-modules  85   8  22  31
3                  others   7  23  16   2
4               libraries  39  34  38   4
5                    port   4   3   8   1
6           generic names   -  21   8   -
7              datasource   3   7   7   2
8  independent components   -   3  17   -
9                integers   -   3   3   -
Model:  llama3:70b
            cat

**Which technologies are involved in dependencies that are incorrectly validated?**

In [9]:
from collections import Counter
import pandas as pd

model_names = ['gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:70b', 'llama3:8b', 'llama3.1:8b'] # 'llama3.1:70b'

data = []

for model_name in model_names:
    print("Model: ", model_name)
    file_name = f"../data/analysis/{config_str}/{model_name}.csv"
    df_model = pd.read_csv(file_name)

    df_failures = df_model[df_model["classification"].isin(["FP", "FN"])]

    technologies_failures = df_failures["option_technology"].to_list() 

    counter = Counter(technologies_failures)

    print("Technologies Failures: ", counter)


    model_data = {"model_name": model_name}
    model_data.update({k: v for k, v in counter.items()})
    data.append(model_data)


df = pd.DataFrame(data)
df


Model:  gpt-4o-2024-05-13
Technologies Failures:  Counter({'maven': 99, 'docker-compose': 9, 'spring': 7, 'nodejs': 1})
Model:  gpt-3.5-turbo-0125
Technologies Failures:  Counter({'maven': 115, 'spring': 25, 'docker-compose': 19, 'docker': 12, 'nodejs': 8, 'tsconfig': 1})
Model:  llama3:70b
Technologies Failures:  Counter({'maven': 106, 'spring': 18, 'docker-compose': 13, 'nodejs': 8, 'docker': 3})
Model:  llama3:8b
Technologies Failures:  Counter({'maven': 114, 'spring': 32, 'docker-compose': 13, 'nodejs': 10, 'tsconfig': 8, 'docker': 8})
Model:  llama3.1:8b
Technologies Failures:  Counter({'maven': 156, 'spring': 34, 'docker-compose': 23, 'docker': 10, 'nodejs': 8, 'tsconfig': 4})


Unnamed: 0,model_name,maven,spring,docker-compose,nodejs,docker,tsconfig
0,gpt-4o-2024-05-13,99,7,9,1,,
1,gpt-3.5-turbo-0125,115,25,19,8,12.0,1.0
2,llama3:70b,106,18,13,8,3.0,
3,llama3:8b,114,32,13,10,8.0,8.0
4,llama3.1:8b,156,34,23,8,10.0,4.0


**Distribution of Intra- and Cross-Technogy Failures**

In [10]:
import pandas as pd

model_names = ['gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:70b', 'llama3:8b', 'llama3.1:8b'] # 'llama3.1:70b'


for model_name in model_names:
    intra = 0
    cross = 0
    print("Model: ", model_name)
    file_name = f"../data/analysis/{config_str}/{model_name}.csv"
    df_model = pd.read_csv(file_name)

    df_failures = df_model[df_model["classification"].isin(["FP", "FN"])]

    technologies_a = df_failures["option_technology"].to_list() 
    technologies_b = df_failures["dependent_option_technology"].to_list() 



    for x, y in zip(technologies_a, technologies_b):
        if x.lower() == y.lower():
            intra += 1
        else:
            cross += 1
    
    print("Num intra-technology failures: ", intra)
    print("Num cross-technology failures: ", cross)


Model:  gpt-4o-2024-05-13
Num intra-technology failures:  114
Num cross-technology failures:  2
Model:  gpt-3.5-turbo-0125
Num intra-technology failures:  174
Num cross-technology failures:  6
Model:  llama3:70b
Num intra-technology failures:  144
Num cross-technology failures:  4
Model:  llama3:8b
Num intra-technology failures:  174
Num cross-technology failures:  11
Model:  llama3.1:8b
Num intra-technology failures:  220
Num cross-technology failures:  15


**Context length of correctly and incorrectly classified dependencies**

In [11]:
import pandas as pd

model_names = ['gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:70b', 'llama3:8b', 'llama3.1:8b'] # 'llama3.1:70b'
config_str = "config1"



for model_name in model_names:
    print("Model: ", model_name)
    file_name = f"../data/analysis/{config_str}/{model_name}.csv"
    
    df_model = pd.read_csv(file_name)
    df_false = df_model[df_model["classification"].isin(["FP", "FN"])]
    df_true = df_model[df_model["classification"].isin(["TP", "TN"])]

    context_length_false = df_false["context_length"].to_list()
    context_length_true = df_true["context_length"].to_list()

    context_length_false_avg = round(int(sum([x for x in context_length_false]))/len(context_length_false),2)
    context_length_true_avg = round(int(sum([x for x in context_length_true]))/len(context_length_true),2)

    data.append({
        "model_name": model_name,
        "context_length_false": context_length_false_avg,
        "context_length_true": context_length_true_avg
    })
    

df = pd.DataFrame(data)
df

Model:  gpt-4o-2024-05-13
Model:  gpt-3.5-turbo-0125
Model:  llama3:70b
Model:  llama3:8b
Model:  llama3.1:8b


Unnamed: 0,model_name,maven,spring,docker-compose,nodejs,docker,tsconfig,context_length_false,context_length_true
0,gpt-4o-2024-05-13,99.0,7.0,9.0,1.0,,,,
1,gpt-3.5-turbo-0125,115.0,25.0,19.0,8.0,12.0,1.0,,
2,llama3:70b,106.0,18.0,13.0,8.0,3.0,,,
3,llama3:8b,114.0,32.0,13.0,10.0,8.0,8.0,,
4,llama3.1:8b,156.0,34.0,23.0,8.0,10.0,4.0,,
5,gpt-4o-2024-05-13,,,,,,,2301.68,2262.04
6,gpt-3.5-turbo-0125,,,,,,,2275.27,2268.09
7,llama3:70b,,,,,,,2303.52,2255.89
8,llama3:8b,,,,,,,2257.9,2276.29
9,llama3.1:8b,,,,,,,2278.44,2264.32


**Distribution of source types**

In [12]:
from collections import Counter
import pandas as pd
import ast

model_names = ['gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'llama3:70b', 'llama3:8b', 'llama3.1:8b'] # 'llama3.1:70b'
config_str = "config4"


for model_name in model_names:
    print("Config: ", config_str)
    print("Model: ", model_name)
    file_name = f"../data/analysis/{config_str}/{model_name}.csv"
    
    df_model = pd.read_csv(file_name)
    df_false = df_model[df_model["classification"].isin(["FP", "FN"])]
    df_true = df_model[df_model["classification"].isin(["TP", "TN"])]

    source_types_false = df_false["source_types"].to_list()
    source_types_true = df_true["source_types"].to_list()
   
    # Convert each string representation of the list into an actual list
    data_false = [ast.literal_eval(item) for item in source_types_false]
    data_true = [ast.literal_eval(item) for item in source_types_true]

    # Flatten the list of lists into a single list
    flattened_list_false = [context for sublist in data_false for context in sublist]
    flattened_list_true = [context for sublist in data_true for context in sublist]

    # Count the occurrences of each context source
    counter_false = Counter(flattened_list_false)
    counter_true = Counter(flattened_list_true)

    # Calculate the distribution
    total_false = sum(counter_false.values())
    distribution_false = {k: v / total_false for k, v in counter_false.items()}

    total_true = sum(counter_true.values())
    distribution_true = {k: v / total_true for k, v in counter_true.items()}


    # Display the counts and distribution
    print("Source Types Counts False:", counter_false)
    print("Source Types Distribution False:", distribution_false)

    print("Source Types Counts True:", counter_true)
    print("Source Types Distribution true:", distribution_true)

    break

#df = pd.DataFrame(data)
#df

Config:  config4
Model:  gpt-4o-2024-05-13
Source Types Counts False: Counter({'web-search': 280, 'tech-docs': 44, 'so-posts': 14, 'github': 10})
Source Types Distribution False: {'tech-docs': 0.12643678160919541, 'web-search': 0.8045977011494253, 'github': 0.028735632183908046, 'so-posts': 0.040229885057471264}
Source Types Counts True: Counter({'web-search': 896, 'tech-docs': 124, 'so-posts': 114, 'github': 18})
Source Types Distribution true: {'so-posts': 0.09895833333333333, 'web-search': 0.7777777777777778, 'tech-docs': 0.1076388888888889, 'github': 0.015625}
