**Extract co-changes across the commit history**

- co-changes of concepts
- co-changest of config file
- co-changes of config options

In [1]:
from analysis import analyze_repository
import json

repo_path = "/home/ssimon/projects/test-config-repo"
output_file = "../data/analyzed_projects/test-config-repo.json""../data/analyzed_projects/test-config-repo.json"
commit_data = analyze_repository(repo_path=repo_path, project_name="test-config-repo", get_diff=True)

# Store commit data into the output file
with open(output_file, "w", encoding="utf-8") as dest:
    json.dump(commit_data, dest, indent=4)

Number of commits: 6


Processing: 100%|██████████| 6/6 [00:00<00:00, 35.92it/s]

Len commit data: 6, 1.0
Elapsed time: 0.177337 seconds





**Compute Co-Changed Concepts**

In [36]:
import json
import pandas as pd
from itertools import combinations
from collections import Counter

# Load the JSON file
with open("../data/analyzed_projects/test-config-repo.json") as file:
    data = json.load(file)

# Counter to store co-changes
concept_counts = Counter()

# Process each commit
for commit in data.get('config_commit_data', []):
    concepts = commit['network_data'].get('concepts', [])

    # Generate pairs of technologies and pairs of files
    pairs = [tuple(sorted(pair)) for pair in combinations(concepts, 2)]
    
    # Count each pair
    concept_counts.update(pairs)


# Prepare DataFrame data
concept_rows = []
for (concept1, concept2), count in concept_counts.items():
    concept_rows.append({
        "Concept1": concept1,
        "Concept2": concept2,
        "Changed Internally": count,
        "Percentage Internally": round(count / len(data['config_commit_data']), 2)
    })

concept_df = pd.DataFrame(concept_rows)

concept_df.to_csv("../data/concept_cochanges/test-config-repo_concept_cochanges.csv", index=False)

concept_df

Unnamed: 0,Concept1,Concept2,Changed Internally,Percentage Internally
0,docker,maven,6,1.0
1,docker,docker-compose,5,0.83
2,docker-compose,maven,5,0.83
3,docker,yaml,3,0.5
4,maven,yaml,3,0.5
5,docker-compose,yaml,3,0.5
6,maven,spring,2,0.33
7,spring,yaml,2,0.33
8,docker,spring,2,0.33
9,docker-compose,spring,2,0.33


**Compute co-changed config files**

In [None]:
import json
import csv
from itertools import combinations
from collections import Counter

# Load the JSON file
with open("../data/analyzed_projects/test-config-repo.json") as file:
    data = json.load(file)

# Counter to store co-changes
file_pair_counts = Counter()

# Process each commit
for commit in data.get('config_commit_data', []):
    config_files = commit['network_data'].get('config_files', [])

    # Generate pairs of files
    file_pairs = [tuple(sorted(pair)) for pair in combinations(config_files, 2)]
    
    # Count each pair
    file_pair_counts.update(file_pairs)

# Prepare DataFrame data
file_rows = []
for (file1, file2), count in file_pair_counts.items():
    file_rows.append({
        "File1": file1,
        "File2": file2,
        "Changed Internally": count,
        "Percentage Internally": round(count / len(data['config_commit_data']), 2)
    })

file_df = pd.DataFrame(file_rows)

file_df.to_csv("../data/file_cochanges/test-config-repo_file_cochanges.csv", index=False)

file_df

Unnamed: 0,File1,File2,Changed Internally,Percentage Internally
0,src/Dockerfile,src/pom.xml,6,1.0
1,src/Dockerfile,src/docker-compose.yml,5,0.83
2,src/docker-compose.yml,src/pom.xml,5,0.83
3,src/config.yml,src/docker-compose.yml,3,0.5
4,src/Dockerfile,src/config.yml,3,0.5
5,src/config.yml,src/pom.xml,3,0.5
6,src/application.properties,src/config.yml,2,0.33
7,src/application.properties,src/pom.xml,2,0.33
8,src/application.properties,src/docker-compose.yml,2,0.33
9,src/Dockerfile,src/application.properties,2,0.33


**Compute co-changed config options**

In [30]:
import json
import csv
from itertools import combinations
from collections import Counter

# Load the JSON file
with open("../data/analyzed_projects/test-config-repo.json") as file:
    data = json.load(file)

# Counter to store co-changes
option_pair_counts = Counter()

# Process each commit
for commit in data.get('config_commit_data', []):    
    modified_options = {}
    
    # Collect options from all files in this commit
    for file_data in commit['network_data'].get('config_files_data', []):
        for pair in file_data.get('modified_pairs', []):
            modified_options[pair['option']] = pair

    # Generate unique pairs of modified options (by option name)
    option_names = list(modified_options.keys())
    option_pairs = [tuple(sorted(pair)) for pair in combinations(option_names, 2)]

    # Count occurrences of each pair and retain full details
    for option1, option2 in option_pairs:
        pair_details = (
            tuple(sorted(modified_options[option1].items())),
            tuple(sorted(modified_options[option2].items()))
        )
        option_pair_counts[pair_details] += 1

print("\n")
print("Co-Changed Modified Option Pair Counts (with Full Details):")
for (option1_details, option2_details), count in option_pair_counts.most_common():
    print(f"Pair: ({dict(option1_details)['option']} - {dict(option2_details)['option']})")
    print(f"  Details Option 1: {dict(option1_details)}")
    print(f"  Details Option 2: {dict(option2_details)}")
    print(f"  Count: {count}")



Co-Changed Modified Option Pair Counts (with Full Details):
Pair: (EXPOSE - server.port)
  Details Option 1: {'artifact': 'src/Dockerfile', 'curr_value': '8000', 'line': 21, 'option': 'EXPOSE', 'prev_value': '8080', 'type': 'PORT'}
  Details Option 2: {'artifact': 'src/application.properties', 'curr_value': '8000', 'line': '2', 'option': 'server.port', 'prev_value': '8080', 'type': 'PORT'}
  Count: 1


In [37]:
import pandas as pd

# Prepare DataFrame data
rows = []
for (option1_details, option2_details), count in option_pair_counts.items():
    # Extract details for Option1 and Option2
    opt1 = dict(option1_details)
    opt2 = dict(option2_details)
    rows.append({
        "Option1": opt1.get("option"),
        "Values1": (opt1.get("prev_value"), opt1.get("curr_value")),
        "Artifact1": opt1.get("artifact"),
        "Option2": opt2.get("option"),
        "Values2": (opt2.get("prev_value"), opt2.get("curr_value")),
        "Artifact2": opt2.get("artifact"),
        "Changed Internally": count,
        "Percentage Internally": count / len(data['config_commit_data'])
    })

# Create DataFrame
df = pd.DataFrame(rows)

df.to_csv("../data/option_cochanges/test-config-repo_option_cochanges.csv", index=False)

df

Unnamed: 0,Option1,Values1,Artifact1,Option2,Values2,Artifact2,Changed Internally,Percentage Internally
0,EXPOSE,"(8080, 8000)",src/Dockerfile,server.port,"(8080, 8000)",src/application.properties,1,0.166667
