In [None]:
#changed files using pydriller
import os
import csv
from pydriller import Repository
from git import Repo, GitCommandError
from collections import defaultdict
from datetime import datetime

def detect_fault_inducing_commits(repo_path, release_pairs, output_dir="buggy_files_reports",
                                  csv_prefix="sqlite", only_fix_commits=False):
    """
    Blame the lines changed between two release tags and report who last wrote them.

    For each ``(start_release, end_release)`` pair the repository is walked with
    PyDriller over the committer-date window spanned by the two tags. For every
    modified source file of every visited commit, the changed lines are blamed
    against the commit's first parent, and one CSV row per (file, earlier commit)
    is written to ``<output_dir>/<csv_prefix>-<start_release>.csv``.

    Args:
        repo_path: Path to a local git checkout that contains the release tags.
        release_pairs: Iterable of ``(start_tag, end_tag)`` tuples.
        output_dir: Directory for the CSV reports; created if missing.
        csv_prefix: File-name prefix of the reports. Defaults to the value that
            used to be hard-coded here.
        only_fix_commits: When true, only commits whose message satisfies
            ``is_fix_commit`` are analyzed; when false (default) every commit is.

    Returns:
        dict mapping each ``(start_tag, end_tag)`` to the set of file paths for
        which at least one earlier commit was found. A pair that fails (for
        example an unknown tag) is reported on stdout and left out.
    """
    source_extensions = ('.java', '.cpp', '.c', '.h')

    os.makedirs(output_dir, exist_ok=True)
    git_repo = Repo(repo_path)
    results = {}

    for start_release, end_release in release_pairs:
        try:
            print(f"\nAnalyzing {start_release} to {end_release}")

            # Get commit range
            start_commit = git_repo.tags[start_release].commit
            end_commit = git_repo.tags[end_release].commit

            csv_filename = os.path.join(output_dir, f"{csv_prefix}-{start_release}.csv")
            buggy_files = set()
            fix_count = 0

            # Explicit encoding: commit messages are not guaranteed to fit the
            # platform's default code page.
            with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=[
                    'buggy_file_path', 'fix_commit_hash', 'fix_date',
                    'buggy_commit_hash', 'buggy_date', 'fix_message'
                ])
                writer.writeheader()

                # Traverse commits with PyDriller. The file-type option only
                # selects commits touching at least one such file; the files
                # themselves are filtered again below.
                for commit in Repository(
                    repo_path,
                    since=start_commit.committed_datetime,
                    to=end_commit.committed_datetime,
                    only_modifications_with_file_types=list(source_extensions)
                ).traverse_commits():

                    if only_fix_commits and not is_fix_commit(commit.msg):
                        continue

                    fix_count += 1
                    print(f"  Fix commit: {commit.hash[:8]} - {commit.msg[:50]}...")

                    for modified_file in commit.modified_files:
                        new_path = modified_file.new_path
                        old_path = modified_file.old_path

                        # Deleted files have no new version; added files have
                        # no parent version, hence nothing to blame.
                        if not new_path or not old_path:
                            continue
                        if not old_path.endswith(source_extensions):
                            continue

                        try:
                            previous_contents = git_repo.git.show(f"{commit.hash}^:{old_path}")
                            current_contents = git_repo.git.show(f"{commit.hash}:{new_path}")

                            # Get changed lines
                            diff = get_changed_lines(previous_contents, current_contents)
                            if not diff:
                                continue

                            # Blame the parent revision. The file still lives
                            # at its old path there, which matters for renames.
                            blame_output = git_repo.git.blame(
                                '-w', '-l', '-p',  # -w ignores whitespace, -p is porcelain output
                                f"{commit.hash}^",
                                '--', old_path
                            )

                            buggy_commits = parse_blame_for_lines(blame_output, diff)

                            for buggy_hash in buggy_commits:
                                try:
                                    buggy_commit = git_repo.commit(buggy_hash)

                                    buggy_files.add(old_path)
                                    writer.writerow({
                                        'buggy_file_path': old_path,
                                        'fix_commit_hash': commit.hash,
                                        'fix_date': commit.committer_date,
                                        'buggy_commit_hash': buggy_hash,
                                        'buggy_date': buggy_commit.committed_datetime,
                                        'fix_message': commit.msg[:200].replace('\n', ' ')
                                    })

                                    print(f"    Found buggy commit: {buggy_hash[:8]} for {modified_file.new_path}")

                                except Exception as e:
                                    # Best effort: one unreadable commit must not abort the release.
                                    print(f"    Error processing buggy commit: {str(e)}")
                                    continue

                        except GitCommandError as e:
                            print(f"    Error processing {modified_file.new_path}: {str(e)}")
                            continue

            print(f"\nSummary for {start_release} to {end_release}:")
            print(f"  Fix commits analyzed: {fix_count}")
            print(f"  Buggy files found: {len(buggy_files)}")
            results[(start_release, end_release)] = buggy_files

        except Exception as e:
            # Per-release boundary: report and move on to the next pair.
            print(f"Error processing {start_release}-{end_release}: {str(e)}")
            continue

    return results

def get_changed_lines(old_content, new_content):
    """Return the 1-based line numbers of ``old_content`` that were modified or removed.

    The two versions are aligned with a real diff, so an insertion or deletion
    does not mark every following line as changed. Numbers always refer to
    the old version, because that is the revision that gets blamed. Lines that
    were purely added have no counterpart there and are not reported.

    Args:
        old_content: Full text of the file before the change.
        new_content: Full text of the file after the change.

    Returns:
        Ascending list of 1-based line numbers into ``old_content``.
    """
    import difflib

    old_lines = old_content.splitlines()
    new_lines = new_content.splitlines()

    # Strip the common head and tail first; most commits touch a small region
    # and the matcher is far more expensive than a linear scan.
    shortest = min(len(old_lines), len(new_lines))
    prefix = 0
    while prefix < shortest and old_lines[prefix] == new_lines[prefix]:
        prefix += 1
    suffix = 0
    while (suffix < shortest - prefix
           and old_lines[-1 - suffix] == new_lines[-1 - suffix]):
        suffix += 1

    old_middle = old_lines[prefix:len(old_lines) - suffix]
    new_middle = new_lines[prefix:len(new_lines) - suffix]

    changed = []
    matcher = difflib.SequenceMatcher(None, old_middle, new_middle)
    for tag, old_start, old_end, _new_start, _new_end in matcher.get_opcodes():
        if tag in ('replace', 'delete'):
            changed.extend(range(prefix + old_start + 1, prefix + old_end + 1))
    return changed

def parse_blame_for_lines(blame_output, target_lines):
    """Collect the commits that ``git blame -p`` attributes to the given lines.

    In porcelain output every blamed line starts with a header of the form
    ``<sha> <orig-line> <final-line> [<group-size>]``. The final line number
    is read from that header instead of being counted by hand, because the
    extra commit-info lines (``author``, ``filename``, ...) only appear the
    first time a commit is mentioned.

    Args:
        blame_output: Text produced by ``git blame -p``.
        target_lines: Iterable of 1-based line numbers in the blamed revision.

    Returns:
        Set of full commit hashes owning at least one of ``target_lines``.
    """
    import re

    header = re.compile(r'^([0-9a-f]{40}) \d+ (\d+)')
    wanted = set(target_lines)
    commits = set()

    for line in blame_output.split('\n'):
        found = header.match(line)
        if found and int(found.group(2)) in wanted:
            commits.add(found.group(1))

    return commits


def is_fix_commit(commit_message):
    """Heuristically decide whether a commit message describes a fix.

    The message is lower-cased and accepted when it contains a ``#``, the
    word ``issue``, a whitespace-separated token made only of digits, or any
    of the keywords below as a substring.
    """
    text = commit_message.lower()

    # Issue-style reference.
    if '#' in text or 'issue' in text:
        return True

    # Bare number, e.g. a bug id.
    for token in text.split():
        if token.isdigit():
            return True

    markers = (
        "fix", "crash", "resolves", "regression", "fall back", "assertion", "coverity",
        "reproducible", "stack-wanted", "steps-wanted", "testcase", "fail", "npe", "except",
        "broken", "bug", "differential testing", "error", "address sanitizer", "hang",
        "perma orange", "random orange", "intermittent", "steps to reproduce", "leak",
        "stack trace", "heap overflow", "freeze", "problem", "overflow", "avoid", "issue",
        "workaround", "break", "stop",
    )
    for marker in markers:
        if marker in text:
            return True
    return False
# Define the release pairs you want to analyze.
# Each project lists its release tags in chronological order; the pairs handed
# to the analysis are the consecutive (start, end) tags of the chosen project.
RELEASE_TAGS_BY_PROJECT = {
    "javacpp": ["0.5", "0.9", "1.1", "1.2", "1.2.1", "1.2.7", "1.3", "1.3.2",
                "1.4", "1.4.2", "1.4.4", "1.5", "1.5.1-1", "1.5.2"],
    "rocksdb": ["v5.0.2", "v5.4.6", "v5.6.2", "v5.9.2", "v5.11.2", "v5.14.3",
                "v5.17.2", "v5.18.3", "v6.1.1", "v6.2.2", "v6.2.4"],
    "jpype": ["v0.5.4.5", "v0.5.5.1", "v0.5.5.4", "v0.5.6", "v0.5.7", "v0.6.0",
              "v0.6.1", "v0.6.2", "v0.6.3", "v0.7", "v0.7.1", "v0.7.2"],
    "realm-java": ["v0.90.0", "v1.2.0", "v2.3.2", "v3.7.2", "v4.4.0", "v5.4.0",
                   "v5.7.1", "v5.9.0", "v5.11.0", "v5.15.0", "v6.0.0"],
    "zstd-jni": ["v0.4.4", "v1.3.0-1", "v1.3.2-2", "v1.3.3-1", "v1.3.4-1",
                 "v1.3.4-8", "v1.3.5-3", "v1.3.7-1", "v1.3.8-1", "v1.4.0-1",
                 "v1.4.2-1", "v1.4.4-3"],
    "conscrypt": ["1.0.0.RC2", "1.0.0.RC8", "1.0.0.RC11", "1.0.0.RC14", "1.0.1",
                  "1.0.2", "1.1.1", "1.2.0", "1.4.2", "2.1.0", "2.2.1"],
    "java-smt": ["0.1", "0.3", "0.5", "0.60", "1.0.1", "2.0.0", "2.0.0-alpha",
                 "2.2.0", "3.0.0", "3.1.0", "3.3.0"],
    "vlc": ["2.5.4", "3.0.0", "3.0.11", "3.0.13", "3.0.92", "3.0.96", "3.1.0",
            "3.1.2", "3.1.6", "3.1.7", "3.2.2"],
    "pljava": ["V1_2_0", "V1_3_0", "V1_4_0", "V1_4_2", "V1_4_3",
               "REL1_5_STABLE-BASE", "V1_5_0b3", "V1_5_0", "V1_5_1b1",
               "V1_5_1b2", "V1_5_2", "V1_5_3", "V1_5_5"],
    "sqlite": ["3.42.0.1", "3.44.0.0", "3.45.0.0", "3.45.2.0", "3.46.0.0",
               "3.46.1.1", "3.47.0.0", "3.47.2.0", "3.49.0.0", "3.50.1.0",
               "3.50.3.0"],
    "jni-bind": ["Release-0.8.0-alpha", "Release-0.9.1-alpha", "Release-0.9.3-alpha",
                 "Release-0.9.6-alpha", "Release-0.9.7-alpha", "Release-0.9.8-alpha",
                 "Release-0.9.9-alpha", "Release-1.0.0-beta", "Release-1.1.0-beta",
                 "Release-1.1.2-beta", "Release-1.2.3"],
    "Monero-java": ["v0.8.9", "v0.8.10", "v0.8.13", "v0.8.17", "v0.8.24",
                    "v0.8.31", "v0.8.35", "v0.8.36", "v0.8.37", "v0.8.38",
                    "v0.8.39"],
    "webrtc": ["v0.2.0", "v0.3.0", "v0.4.0", "v0.6.0", "v0.7.0", "v0.8.0",
               "v0.10.0", "v0.11.0", "v0.12.0", "v0.13.0", "v0.14.0"],
    "wolfcrypt": ["v1.0.0-stable", "v1.1.0-stable", "v1.2.0-stable",
                  "v1.3.0-stable", "v1.5.0-stable", "v1.6.0-stable",
                  "v1.7.0-stable", "v1.8.0-stable"],
    "wolfssl": ["v1.4.0-stable", "v1.5.0-stable", "v1.6.0-stable",
                "v1.8.0-stable", "v1.9.0-stable", "v1.11.0-stable",
                "v1.12.0-stable", "v1.12.2", "v1.13.0-stable",
                "v1.14.0-stable", "v1.15.0-stable"],
}

# Project analyzed by this cell; switch the key instead of (un)commenting tuples.
PROJECT_TO_ANALYZE = "sqlite"

_selected_tags = RELEASE_TAGS_BY_PROJECT[PROJECT_TO_ANALYZE]
release_pairs_to_analyze = list(zip(_selected_tags, _selected_tags[1:]))

# Kick off the blame analysis for the selected project checkout.
results = detect_fault_inducing_commits(
    "revision projects/sqlite/3.50.3.0",
    release_pairs_to_analyze,
    output_dir="buggy_smelly/revision/general_changed",
)

# One summary line per analyzed release pair.
for release_pair, buggy_files in results.items():
    start_tag, end_tag = release_pair
    print(f"Between {start_tag} and {end_tag}, found {len(buggy_files)} buggy files")

In [None]:
#faulty files using pydriller and fault inducing commits
import os
import csv
from pydriller import Repository
from git import Repo, GitCommandError
from collections import defaultdict
from datetime import datetime

def detect_fault_inducing_commits(repo_path, release_pairs, output_dir="buggy_files_reports",
                                  csv_prefix="wolfssl", only_fix_commits=True):
    """
    Find, for each fix commit between two release tags, the commits it corrects.

    For each ``(start_release, end_release)`` pair the repository is walked with
    PyDriller over the committer-date window spanned by the two tags. For every
    modified source file of every analyzed commit, the changed lines are blamed
    against the commit's first parent, and one CSV row per (file, earlier commit)
    is written to ``<output_dir>/<csv_prefix>-<start_release>.csv``.

    Args:
        repo_path: Path to a local git checkout that contains the release tags.
        release_pairs: Iterable of ``(start_tag, end_tag)`` tuples.
        output_dir: Directory for the CSV reports; created if missing.
        csv_prefix: File-name prefix of the reports. Defaults to the value that
            used to be hard-coded here.
        only_fix_commits: When true (default), only commits whose message
            satisfies ``is_fix_commit`` are analyzed; when false every commit is.

    Returns:
        dict mapping each ``(start_tag, end_tag)`` to the set of file paths for
        which at least one earlier commit was found. A pair that fails (for
        example an unknown tag) is reported on stdout and left out.
    """
    source_extensions = ('.java', '.cpp', '.c', '.h')

    os.makedirs(output_dir, exist_ok=True)
    git_repo = Repo(repo_path)
    results = {}

    for start_release, end_release in release_pairs:
        try:
            print(f"\nAnalyzing {start_release} to {end_release}")

            # Get commit range
            start_commit = git_repo.tags[start_release].commit
            end_commit = git_repo.tags[end_release].commit

            csv_filename = os.path.join(output_dir, f"{csv_prefix}-{start_release}.csv")
            buggy_files = set()
            fix_count = 0

            # Explicit encoding: commit messages are not guaranteed to fit the
            # platform's default code page.
            with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=[
                    'buggy_file_path', 'fix_commit_hash', 'fix_date',
                    'buggy_commit_hash', 'buggy_date', 'fix_message'
                ])
                writer.writeheader()

                # Traverse commits with PyDriller. The file-type option only
                # selects commits touching at least one such file; the files
                # themselves are filtered again below.
                for commit in Repository(
                    repo_path,
                    since=start_commit.committed_datetime,
                    to=end_commit.committed_datetime,
                    only_modifications_with_file_types=list(source_extensions)
                ).traverse_commits():

                    if only_fix_commits and not is_fix_commit(commit.msg):
                        continue

                    fix_count += 1
                    print(f"  Fix commit: {commit.hash[:8]} - {commit.msg[:50]}...")

                    for modified_file in commit.modified_files:
                        new_path = modified_file.new_path
                        old_path = modified_file.old_path

                        # Deleted files have no new version; added files have
                        # no parent version, hence nothing to blame.
                        if not new_path or not old_path:
                            continue
                        if not old_path.endswith(source_extensions):
                            continue

                        try:
                            previous_contents = git_repo.git.show(f"{commit.hash}^:{old_path}")
                            current_contents = git_repo.git.show(f"{commit.hash}:{new_path}")

                            # Get changed lines
                            diff = get_changed_lines(previous_contents, current_contents)
                            if not diff:
                                continue

                            # Blame the parent revision. The file still lives
                            # at its old path there, which matters for renames.
                            blame_output = git_repo.git.blame(
                                '-w', '-l', '-p',  # -w ignores whitespace, -p is porcelain output
                                f"{commit.hash}^",
                                '--', old_path
                            )

                            buggy_commits = parse_blame_for_lines(blame_output, diff)

                            for buggy_hash in buggy_commits:
                                try:
                                    buggy_commit = git_repo.commit(buggy_hash)

                                    buggy_files.add(old_path)
                                    writer.writerow({
                                        'buggy_file_path': old_path,
                                        'fix_commit_hash': commit.hash,
                                        'fix_date': commit.committer_date,
                                        'buggy_commit_hash': buggy_hash,
                                        'buggy_date': buggy_commit.committed_datetime,
                                        'fix_message': commit.msg[:200].replace('\n', ' ')
                                    })

                                    print(f"    Found buggy commit: {buggy_hash[:8]} for {modified_file.new_path}")

                                except Exception as e:
                                    # Best effort: one unreadable commit must not abort the release.
                                    print(f"    Error processing buggy commit: {str(e)}")
                                    continue

                        except GitCommandError as e:
                            print(f"    Error processing {modified_file.new_path}: {str(e)}")
                            continue

            print(f"\nSummary for {start_release} to {end_release}:")
            print(f"  Fix commits analyzed: {fix_count}")
            print(f"  Buggy files found: {len(buggy_files)}")
            results[(start_release, end_release)] = buggy_files

        except Exception as e:
            # Per-release boundary: report and move on to the next pair.
            print(f"Error processing {start_release}-{end_release}: {str(e)}")
            continue

    return results

def get_changed_lines(old_content, new_content):
    """Return the 1-based line numbers of ``old_content`` that were modified or removed.

    The two versions are aligned with a real diff, so an insertion or deletion
    does not mark every following line as changed. Numbers always refer to
    the old version, because that is the revision that gets blamed. Lines that
    were purely added have no counterpart there and are not reported.

    Args:
        old_content: Full text of the file before the change.
        new_content: Full text of the file after the change.

    Returns:
        Ascending list of 1-based line numbers into ``old_content``.
    """
    import difflib

    old_lines = old_content.splitlines()
    new_lines = new_content.splitlines()

    # Strip the common head and tail first; most commits touch a small region
    # and the matcher is far more expensive than a linear scan.
    shortest = min(len(old_lines), len(new_lines))
    prefix = 0
    while prefix < shortest and old_lines[prefix] == new_lines[prefix]:
        prefix += 1
    suffix = 0
    while (suffix < shortest - prefix
           and old_lines[-1 - suffix] == new_lines[-1 - suffix]):
        suffix += 1

    old_middle = old_lines[prefix:len(old_lines) - suffix]
    new_middle = new_lines[prefix:len(new_lines) - suffix]

    changed = []
    matcher = difflib.SequenceMatcher(None, old_middle, new_middle)
    for tag, old_start, old_end, _new_start, _new_end in matcher.get_opcodes():
        if tag in ('replace', 'delete'):
            changed.extend(range(prefix + old_start + 1, prefix + old_end + 1))
    return changed

def parse_blame_for_lines(blame_output, target_lines):
    """Collect the commits that ``git blame -p`` attributes to the given lines.

    In porcelain output every blamed line starts with a header of the form
    ``<sha> <orig-line> <final-line> [<group-size>]``. The final line number
    is read from that header instead of being counted by hand, because the
    extra commit-info lines (``author``, ``filename``, ...) only appear the
    first time a commit is mentioned.

    Args:
        blame_output: Text produced by ``git blame -p``.
        target_lines: Iterable of 1-based line numbers in the blamed revision.

    Returns:
        Set of full commit hashes owning at least one of ``target_lines``.
    """
    import re

    header = re.compile(r'^([0-9a-f]{40}) \d+ (\d+)')
    wanted = set(target_lines)
    commits = set()

    for line in blame_output.split('\n'):
        found = header.match(line)
        if found and int(found.group(2)) in wanted:
            commits.add(found.group(1))

    return commits


def is_fix_commit(commit_message):
    """Heuristically decide whether a commit message describes a fix.

    The message is lower-cased and accepted when it contains a ``#``, the
    word ``issue``, a whitespace-separated token made only of digits, or any
    of the keywords below as a substring.
    """
    text = commit_message.lower()

    # Issue-style reference.
    if '#' in text or 'issue' in text:
        return True

    # Bare number, e.g. a bug id.
    for token in text.split():
        if token.isdigit():
            return True

    markers = (
        "fix", "crash", "resolves", "regression", "fall back", "assertion", "coverity",
        "reproducible", "stack-wanted", "steps-wanted", "testcase", "fail", "npe", "except",
        "broken", "bug", "differential testing", "error", "address sanitizer", "hang",
        "perma orange", "random orange", "intermittent", "steps to reproduce", "assertion", "leak",
        "stack trace", "heap overflow", "freez", "str", "problem", "overflow", "avoid", "issue",
        "workaround", "break", "stop",
    )
    for marker in markers:
        if marker in text:
            return True
    return False
# Define the release pairs you want to analyze.
# Each project lists its release tags in chronological order; the pairs handed
# to the analysis are the consecutive (start, end) tags of the chosen project.
RELEASE_TAGS_BY_PROJECT = {
    "javacpp": ["0.5", "0.9", "1.1", "1.2", "1.2.1", "1.2.7", "1.3", "1.3.2",
                "1.4", "1.4.2", "1.4.4", "1.5", "1.5.1-1", "1.5.2"],
    "rocksdb": ["v5.0.2", "v5.4.6", "v5.6.2", "v5.9.2", "v5.11.2", "v5.14.3",
                "v5.17.2", "v5.18.3", "v6.1.1", "v6.2.2", "v6.2.4"],
    "jpype": ["v0.5.4.5", "v0.5.5.1", "v0.5.5.4", "v0.5.6", "v0.5.7", "v0.6.0",
              "v0.6.1", "v0.6.2", "v0.6.3", "v0.7", "v0.7.1", "v0.7.2"],
    "realm-java": ["v0.90.0", "v1.2.0", "v2.3.2", "v3.7.2", "v4.4.0", "v5.4.0",
                   "v5.7.1", "v5.9.0", "v5.11.0", "v5.15.0", "v6.0.0"],
    "zstd-jni": ["v0.4.4", "v1.3.0-1", "v1.3.2-2", "v1.3.3-1", "v1.3.4-1",
                 "v1.3.4-8", "v1.3.5-3", "v1.3.7-1", "v1.3.8-1", "v1.4.0-1",
                 "v1.4.2-1", "v1.4.4-3"],
    "conscrypt": ["1.0.0.RC2", "1.0.0.RC8", "1.0.0.RC11", "1.0.0.RC14", "1.0.1",
                  "1.0.2", "1.1.1", "1.2.0", "1.4.2", "2.1.0", "2.2.1"],
    "java-smt": ["0.1", "0.3", "0.5", "0.60", "1.0.1", "2.0.0", "2.0.0-alpha",
                 "2.2.0", "3.0.0", "3.1.0", "3.3.0"],
    "vlc": ["2.5.4", "3.0.0", "3.0.11", "3.0.13", "3.0.92", "3.0.96", "3.1.0",
            "3.1.2", "3.1.6", "3.1.7", "3.2.2"],
    "pljava": ["V1_2_0", "V1_3_0", "V1_4_0", "V1_4_2", "V1_4_3",
               "REL1_5_STABLE-BASE", "V1_5_0b3", "V1_5_0", "V1_5_1b1",
               "V1_5_1b2", "V1_5_2", "V1_5_3", "V1_5_5"],
    "sqlite": ["3.42.0.1", "3.44.0.0", "3.45.0.0", "3.45.2.0", "3.46.0.0",
               "3.46.1.1", "3.47.0.0", "3.47.2.0", "3.49.0.0", "3.50.1.0",
               "3.50.3.0"],
    "jni-bind": ["Release-0.8.0-alpha", "Release-0.9.1-alpha", "Release-0.9.3-alpha",
                 "Release-0.9.6-alpha", "Release-0.9.7-alpha", "Release-0.9.8-alpha",
                 "Release-0.9.9-alpha", "Release-1.0.0-beta", "Release-1.1.0-beta",
                 "Release-1.1.2-beta", "Release-1.2.3"],
    "Monero-java": ["v0.8.9", "v0.8.10", "v0.8.13", "v0.8.17", "v0.8.24",
                    "v0.8.31", "v0.8.35", "v0.8.36", "v0.8.37", "v0.8.38",
                    "v0.8.39"],
    "webrtc": ["v0.2.0", "v0.3.0", "v0.4.0", "v0.6.0", "v0.7.0", "v0.8.0",
               "v0.10.0", "v0.11.0", "v0.12.0", "v0.13.0", "v0.14.0"],
    "wolfcrypt": ["v1.0.0-stable", "v1.1.0-stable", "v1.2.0-stable",
                  "v1.3.0-stable", "v1.5.0-stable", "v1.6.0-stable",
                  "v1.7.0-stable", "v1.8.0-stable"],
    "wolfssl": ["v1.4.0-stable", "v1.5.0-stable", "v1.6.0-stable",
                "v1.8.0-stable", "v1.9.0-stable", "v1.11.0-stable",
                "v1.12.0-stable", "v1.12.2", "v1.13.0-stable",
                "v1.14.0-stable", "v1.15.0-stable"],
}

# Project analyzed by this cell; switch the key instead of (un)commenting tuples.
PROJECT_TO_ANALYZE = "wolfssl"

_selected_tags = RELEASE_TAGS_BY_PROJECT[PROJECT_TO_ANALYZE]
release_pairs_to_analyze = list(zip(_selected_tags, _selected_tags[1:]))

# Kick off the blame analysis for the selected project checkout.
results = detect_fault_inducing_commits(
    "revision projects/wolfssl/v1.15.0-stable",
    release_pairs_to_analyze,
    output_dir="buggy_smelly/revision/general_faulty",
)

# One summary line per analyzed release pair.
for release_pair, buggy_files in results.items():
    start_tag, end_tag = release_pair
    print(f"Between {start_tag} and {end_tag}, found {len(buggy_files)} buggy files")

In [None]:
# Dataset for LR containing the frequency of all 4 smells, release-wise.
# File paths are matched across the CSVs by containment.

import os
import pandas as pd

# ----------------------------
# CONFIGURATION
# ----------------------------
# Where the merged dataset is written; created up front so later writes succeed.
OUTPUT_DIR = "new smells/On extended dataset/release wise fault dataset"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Per-project CSVs holding code metrics and the known faulty files.
METRICS_DIR = "new smells/On extended dataset/codeChurn_LOC"
FAULTY_DIR = "new smells/On extended dataset/general_faulty"

# Folders whose per-project CSVs together list every file of a project.
UNIVERSAL_FOLDERS = list((
    "new smells/On extended dataset/Java files",
    "new smells/On extended dataset/Native Java files",
    "new smells/On extended dataset/Native C files",
))

# Smell name -> folder holding that smell's per-project CSV.
SMELL_DIRS = dict(
    LanguageEnvy="new smells/On extended dataset/LanguageEnvy",
    CrossRespDecl="new smells/On extended dataset/crossLangDeclaration",
    ShotgunSurgery="new smells/On extended dataset/ShotgunSurgery",
    EncapsulationBypass="new smells/On extended dataset/EncapsulationBypass",
)


# ----------------------------
# HELPER FUNCTIONS
# ----------------------------
def load_smell_counts(csv_path):
    """Count the occurrences of each file in a smell CSV.

    The first column whose name contains "file" (case-insensitive) is used.
    Returns a ``{file: count}`` dict, or an empty dict when the CSV is missing
    or has no such column.
    """
    if os.path.exists(csv_path):
        table = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        for column in table.columns:
            if "file" in column.lower():
                occurrences = table[column].value_counts()
                return occurrences.to_dict()
    return {}


def load_file_list(csv_path):
    """Read the distinct file paths listed in a CSV.

    The first column whose name contains "file" (case-insensitive) is used.
    Returns an empty set when the CSV is missing or has no such column.
    """
    if os.path.exists(csv_path):
        table = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        path_column = next(
            (name for name in table.columns if "file" in name.lower()), None
        )
        if path_column is not None:
            return set(table[path_column].dropna())
    return set()


def load_metrics(csv_path):
    """Read per-file metrics from a CSV.

    Returns ``{file: (loc, code_churn, previous_fixes)}`` keyed by the stripped
    value of the first column whose name contains "file". Values that cannot
    be parsed as numbers, and absent metric columns, become 0. A missing CSV or
    a CSV without a file column yields an empty dict and a console notice.
    """
    def to_int(raw):
        # Accepts "12" as well as "12.0"; anything unparsable counts as 0.
        try:
            return int(float(raw))
        except Exception:
            return 0

    if not os.path.exists(csv_path):
        print("************************csv file doesnot exist")
        return {}

    table = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    candidates = [name for name in table.columns if "file" in name.lower()]
    if not candidates:
        print("*********************filecol doesnot exist")
        return {}

    key_column = candidates[0]
    collected = {}
    for _, record in table.iterrows():
        key = str(record[key_column]).strip()
        collected[key] = (
            to_int(record.get("loc", 0)),
            to_int(record.get("code_churn", 0)),
            to_int(record.get("previous_fixes", 0)),
        )
    return collected


def gather_universal_file_set(project_name, folders=None):
    """Combine the file lists of a project from several source folders.

    Each folder is expected to hold a ``<project_name>.csv``; the first column
    whose name contains "file" (case-insensitive) supplies the paths. Folders
    without that CSV, or CSVs without such a column, are skipped.

    Args:
        project_name: Base name of the per-project CSV files.
        folders: Folders to scan. Defaults to ``UNIVERSAL_FOLDERS``.

    Returns:
        Sorted list of distinct, non-blank file paths. Sorting keeps the
        result reproducible between runs.
    """
    if folders is None:
        folders = UNIVERSAL_FOLDERS

    all_files = set()
    for folder in folders:
        csv_path = os.path.join(folder, f"{project_name}.csv")
        if not os.path.exists(csv_path):
            continue
        df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        file_col = next((c for c in df.columns if "file" in c.lower()), None)
        if file_col is None:
            continue
        # keep_default_na=False turns empty cells into "", so blanks have to
        # be filtered explicitly.
        all_files.update(
            name for name in df[file_col]
            if isinstance(name, str) and name.strip()
        )
    return sorted(all_files)


# ----------------------------
# Matching helpers (containment-based)
# ----------------------------
def basename(path):
    """Return the last component of ``path``; falsy values are returned unchanged."""
    if not path:
        return path
    _, tail = os.path.split(path)
    return tail


def smell_count_by_containment(smell_dict, universal_fname):
    """
    Sum the counts of all smell entries whose path contains the given file.

    An entry matches when its stripped key contains the stripped
    ``universal_fname`` or that name's basename.

    Args:
        smell_dict: ``{smell_path: count}``; non-string keys are ignored and
            counts that cannot be parsed as numbers are skipped.
        universal_fname: File name or path from the universal file list.

    Returns:
        Total count over the matching entries; 0 when either argument is empty
        or the name is blank.
    """
    if not smell_dict or not universal_fname:
        return 0
    uni = universal_fname.strip()
    if not uni:
        # "" is a substring of every key, so a blank name would match them all.
        return 0
    uni_base = basename(uni)
    total = 0
    for k, v in smell_dict.items():
        if not isinstance(k, str):
            continue
        k_str = k.strip()
        # match if smell key contains the universal path OR the basename
        if uni in k_str or (uni_base and uni_base in k_str):
            try:
                total += int(v)
            except (TypeError, ValueError):
                # Counts such as "3.0" only parse via float.
                try:
                    total += int(float(v))
                except (TypeError, ValueError, OverflowError):
                    continue
    return total


def is_faulty_by_containment(faulty_set, universal_fname):
    """
    Tell whether any faulty path contains the given file name or its basename.

    Args:
        faulty_set: Iterable of faulty file paths; non-string items are ignored.
        universal_fname: File name or path from the universal file list.

    Returns:
        True on the first match; False when nothing matches, when either
        argument is empty, or when the name is blank.
    """
    if not faulty_set or not universal_fname:
        return False
    uni = universal_fname.strip()
    if not uni:
        # "" is a substring of every path, so a blank name would always match.
        return False
    uni_base = basename(uni)
    for k in faulty_set:
        if not isinstance(k, str):
            continue
        k_str = k.strip()
        if uni in k_str or (uni_base and uni_base in k_str):
            return True
    return False


def find_metrics_by_containment(metrics_dict, universal_fname):
    """
    Return the metrics value of the best-matching key for a universal filename.

    Matching order:
    1) exact match on the stripped name,
    2) exact match on its basename,
    3) the first key (in dict order) that contains the name, is contained in
       the name, or contains the basename.

    Args:
        metrics_dict: mapping ``{path: metrics}``; non-string and blank keys
            are ignored during containment matching.
        universal_fname: file path (string) from the universal file list.

    Returns:
        The stored metrics value of the matched key, or ``(0, 0, 0)`` when
        either argument is empty, the name is blank, or nothing matches.
    """
    if not metrics_dict or not universal_fname:
        return (0, 0, 0)
    uni = universal_fname.strip()
    if not uni:
        # A blank name is a substring of every key and would pick up the
        # metrics of an arbitrary (first) file.
        return (0, 0, 0)
    uni_base = os.path.basename(uni)

    # 1) exact path
    if uni in metrics_dict:
        return metrics_dict[uni]
    # 2) exact basename (skipped when the name ends in a separator)
    if uni_base and uni_base in metrics_dict:
        return metrics_dict[uni_base]

    # 3) containment in either direction, or basename containment
    for key, metrics in metrics_dict.items():
        if not isinstance(key, str):
            continue
        key_str = key.strip()
        if not key_str:
            # "" in uni is always True, so an empty key would match every file.
            continue
        # A key whose basename equals uni_base necessarily contains uni_base,
        # so no separate basename-equality test is needed.
        if uni in key_str or key_str in uni or (uni_base and uni_base in key_str):
            return metrics

    # fallback
    return (0, 0, 0)


# ----------------------------
# MAIN DATASET CREATION FUNCTION
# ----------------------------
def create_unified_dataset(project_name):
    """
    Build and save the per-file dataset for one project.

    For every file in the project's universal file list, the per-smell counts,
    the metrics triple (LOC, code churn, previous fixes) and the fault label
    are looked up with containment-based matching. Files whose total smell
    count is 0 are left out. The result is written to
    ``OUTPUT_DIR/<project_name>.csv``.

    Args:
        project_name: base name (without ``.csv``) shared by the project's
            input CSVs in the smell, faulty and metrics folders.

    Returns:
        None. Returns early (after printing a warning) when no universal
        files or no smelly files are found.
    """
    print(f"\nüîç Processing project: {project_name}")

    # Step 1: gather the universal file list (from Java + native sources)
    all_files = gather_universal_file_set(project_name)
    if not all_files:
        print(f"‚ö†Ô∏è No files found for {project_name}")
        return

    # Step 2: load smell frequencies, one lookup table per smell type;
    # a missing smell CSV is treated as "no smells of that type"
    smell_data = {}
    for smell_name, folder in SMELL_DIRS.items():
        smell_path = os.path.join(folder, f"{project_name}.csv")
        if os.path.exists(smell_path):
            smell_data[smell_name] = load_smell_counts(smell_path)
        else:
            print(f"‚ö†Ô∏è Missing smell file: {smell_path}")
            smell_data[smell_name] = {}

    # Step 3: load the faulty-file collection
    faulty_path = os.path.join(FAULTY_DIR, f"{project_name}.csv")
    faulty_files = load_file_list(faulty_path)

    # Step 4: load the metrics lookup table
    metrics_path = os.path.join(METRICS_DIR, f"{project_name}.csv")
    metrics_dict = load_metrics(metrics_path)

    # Step 5: combine everything using containment-based matching.
    # The universal list is built from a set, whose iteration order for
    # strings changes between interpreter runs; sorting keeps the row order
    # of the output CSV reproducible.
    dataset = []
    for f in sorted(all_files):
        smell_freqs = {
            smell: smell_count_by_containment(smell_data.get(smell, {}), f)
            for smell in SMELL_DIRS
        }
        if sum(smell_freqs.values()) == 0:
            continue  # keep only smelly files

        loc, code_churn, previous_fixes = find_metrics_by_containment(metrics_dict, f)
        fault_prone = 1 if is_faulty_by_containment(faulty_files, f) else 0

        dataset.append({
            "file": f,
            **smell_freqs,
            "LOC": loc,
            "Code Churn": code_churn,
            "Previous Bugs": previous_fixes,
            "fault_prone": fault_prone,
        })

    # Step 6: save the dataset
    if not dataset:
        print(f"‚ö†Ô∏è No smelly files found for {project_name}")
        return

    df_final = pd.DataFrame(dataset)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR, f"{project_name}.csv")
    df_final.to_csv(output_path, index=False)
    print(f"‚úÖ Saved: {output_path} ({len(df_final)} smelly files)")


# ----------------------------
# MAIN SCRIPT ENTRY POINT
# ----------------------------
def main():
    """Build the unified dataset for every project that has a CSV in any universal folder."""
    # A set removes duplicates when the same project appears in several folders.
    project_names = set()
    for folder in UNIVERSAL_FOLDERS:
        if not os.path.exists(folder):
            continue
        for entry in os.listdir(folder):
            if entry.endswith(".csv"):
                project_names.add(os.path.splitext(entry)[0])

    # Process projects in alphabetical order.
    for project_name in sorted(project_names):
        create_unified_dataset(project_name)


# Build the datasets only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
# Combine the release-wise datasets into one dataset per project
import os
import pandas as pd
import re

# ----------------------------
# CONFIGURATION
# ----------------------------
# Source and destination folders used by merge_release_datasets().
# NOTE(review): OUTPUT_DIR is also read by create_unified_dataset(); if all
# cells share one namespace, this assignment changes where that function
# writes on a later re-run - confirm this is intended.
INPUT_DIR = "new smells/On extended dataset/release wise fault dataset"        # Folder with release-wise CSVs
OUTPUT_DIR = "new smells/On extended dataset/project level fault datasets"       # Folder for merged project-wise CSVs
# Create the output folder up front; exist_ok=True makes re-runs harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ----------------------------
# HELPER FUNCTION
# ----------------------------
def extract_project_name(filename):
    """
    Extract the base project name from a release-wise CSV file name.

    The extension and an optional ``_unified_regression`` suffix are removed,
    then the leading run of letters, underscores and hyphens is taken.
    Separators left dangling at the end of that run (the ``_``/``-`` that
    preceded the release number) are stripped so that all releases of one
    project map to the same name.

    e.g., 'conscrypt1_unified_regression.csv' -> 'conscrypt'
          'rocksdb_2_unified_regression.csv' -> 'rocksdb'
          'vlc-android-3.5.csv'              -> 'vlc-android'

    Args:
        filename: file name (not a full path is required) of a release CSV.

    Returns:
        str: lower-cased project name; the lower-cased base name itself when
        it has no usable alphabetic prefix.
    """
    base = os.path.splitext(filename)[0]
    # remove suffix like "_unified_regression" if present
    base = re.sub(r"_unified_regression$", "", base)
    # take the alphabetic prefix (letters plus "_" / "-") as project name
    match = re.match(r"[A-Za-z_-]+", base)
    if match:
        # "rocksdb_2" matches "rocksdb_": drop the trailing separator
        name = match.group(0).rstrip("_-")
        if name:
            return name.lower()
    return base.lower()


# ----------------------------
# MAIN MERGE FUNCTION
# ----------------------------
def merge_release_datasets():
    """
    Merge the release-wise CSVs in INPUT_DIR into one CSV per project.

    Files are grouped by ``extract_project_name``; each group is read in
    sorted file-name order, tagged with a ``Release_File`` column naming the
    source file, concatenated, and written to ``OUTPUT_DIR/<project>_all.csv``.
    Files that cannot be read are reported and skipped.

    Returns:
        None. Progress and problems are reported via ``print``.
    """
    csv_names = [name for name in os.listdir(INPUT_DIR) if name.endswith(".csv")]

    # Group files by project name
    project_groups = {}
    for csv_name in csv_names:
        project_groups.setdefault(extract_project_name(csv_name), []).append(csv_name)

    print(f"üîç Found {len(project_groups)} projects to merge.\n")

    # Merge each project's releases (projects in alphabetical order)
    for project, files in sorted(project_groups.items()):
        print(f"üìÇ Merging {len(files)} releases for project: {project}")

        # Collect the frames and concatenate once: concatenating inside the
        # loop re-copies all previous rows on every iteration and starts from
        # an empty DataFrame, which pandas deprecates for dtype inference.
        frames = []
        for csv_name in sorted(files):
            file_path = os.path.join(INPUT_DIR, csv_name)
            try:
                df = pd.read_csv(file_path)
            except Exception as e:  # best effort: report and skip unreadable files
                print(f"‚ö†Ô∏è Error reading {csv_name}: {e}")
                continue
            df["Release_File"] = csv_name  # optional, track origin release
            frames.append(df)

        # pd.concat raises on an empty list, so fall back to an empty frame
        combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        if not combined_df.empty:
            output_path = os.path.join(OUTPUT_DIR, f"{project}_all.csv")
            combined_df.to_csv(output_path, index=False)
            print(f"‚úÖ Saved merged dataset: {output_path} ({len(combined_df)} rows)")
        else:
            print(f"‚ö†Ô∏è No data merged for {project} (empty CSVs).")

    print("\nüéâ All project-level datasets created successfully!")


# ----------------------------
# RUN SCRIPT
# ----------------------------
# Run the merge only when this code is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    merge_release_datasets()


In [None]:
#logistic regression

In [None]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from statsmodels.tools.sm_exceptions import PerfectSeparationError

# ----------------------------
# CONFIGURATION
# ----------------------------
# Folder scanned by main() for per-project CSVs, and the CSV that receives
# one summary row per successfully fitted project.
INPUT_DIR = "new smells/On extended dataset/project level fault datasets"   # Folder with merged project datasets
OUTPUT_CSV = "new smells/On extended dataset/projectwise_logistic_results_fault.csv"

# Columns to use in regression
SMELL_COLS = ["LanguageEnvy", "CrossRespDecl", "ShotgunSurgery", "EncapsulationBypass"]
# Alternative configuration that adds the process/size metrics as predictors:
#METRIC_COLS = ["LOC", "Code Churn", "Previous Bugs"]
# Currently disabled: only the smell columns are used as predictors.
METRIC_COLS = []

# Dependent variable of the regression
TARGET_COL = "fault_prone"

# ----------------------------
# HELPER FUNCTION
# ----------------------------
def drop_zero_variance_columns(df, cols):
    """
    Remove those of ``cols`` that hold at most one distinct value.

    Returns ``df`` itself when nothing has to be removed, otherwise a new
    frame without the constant columns (their names are printed first).
    """
    constant = []
    for name in cols:
        # nunique() <= 1 means the column cannot carry any signal
        if df[name].nunique() <= 1:
            constant.append(name)

    if not constant:
        return df

    print(f"‚ö†Ô∏è Dropping constant columns: {constant}")
    return df.drop(columns=constant)


# ----------------------------
# MAIN REGRESSION FUNCTION
# ----------------------------
def run_logistic_regression(project_name, df):
    """
    Fit a logistic regression of TARGET_COL on the smell/metric columns of
    one project and summarise fit quality and coefficients.

    Rows with missing values in any used column are dropped, constant
    predictors are removed, and an intercept is added. Performance figures
    are computed on the same rows the model was fitted on (in-sample), with
    a 0.5 probability threshold.

    Args:
        project_name: label used in messages and in the result row.
        df: project DataFrame containing SMELL_COLS, METRIC_COLS and TARGET_COL.

    Returns:
        dict | None: one result row (metrics, pseudo-R2, significant
        predictors, ``Coef_*`` and ``Pval_*`` per parameter), or None when the
        target has fewer than two distinct values, no predictor varies, or
        the fit fails.
    """
    candidate_cols = SMELL_COLS + METRIC_COLS

    # Drop rows with missing data
    df = df.dropna(subset=candidate_cols + [TARGET_COL])

    # Ensure the target takes at least two distinct values
    if df[TARGET_COL].nunique() < 2:
        print(f"‚ö†Ô∏è Skipping {project_name}: target variable not binary.")
        return None

    # Drop constant (zero variance) predictors
    df = drop_zero_variance_columns(df, candidate_cols)

    predictors = [c for c in df.columns if c in candidate_cols]
    if not predictors:
        # Without any varying predictor only an intercept would be fitted,
        # which says nothing about the smells; skip instead of reporting it.
        print(f"‚ö†Ô∏è Skipping {project_name}: no non-constant predictors.")
        return None

    # Independent variables (with intercept) and dependent variable
    X = sm.add_constant(df[predictors])
    y = df[TARGET_COL]

    # Try logistic regression (statsmodels)
    try:
        result = sm.Logit(y, X).fit(disp=False)
    except (PerfectSeparationError, np.linalg.LinAlgError, ValueError) as e:
        print(f"‚ö†Ô∏è Logistic regression failed for {project_name}: {e}")
        return None

    # In-sample predictions; kept in local variables so the filtered frame
    # is not modified (avoids chained-assignment warnings).
    pred_label = (result.predict(X) >= 0.5).astype(int)
    acc = accuracy_score(y, pred_label)
    prec = precision_score(y, pred_label, zero_division=0)
    rec = recall_score(y, pred_label, zero_division=0)
    f1 = f1_score(y, pred_label, zero_division=0)

    # 1 - (model log-likelihood / null-model log-likelihood)
    pseudo_r2 = 1 - (result.llf / result.llnull)

    # Statistically significant variables (p < 0.05), intercept excluded
    sig_vars = [
        name for name, pval in result.pvalues.items()
        if pval < 0.05 and name != "const"
    ]

    print(f"\n‚úÖ {project_name} Regression Summary")
    print(f"  Accuracy: {acc:.3f} | F1: {f1:.3f} | Pseudo-R¬≤: {pseudo_r2:.3f}")
    print(f"  Significant predictors (p < 0.05): {sig_vars if sig_vars else 'None'}")
    print(result.summary())

    # Store results
    res = {
        "Project": project_name,
        "NumFiles": len(df),
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "Pseudo_R2": pseudo_r2,
        "Significant_Predictors": ", ".join(sig_vars) if sig_vars else "None"
    }

    # Add coefficients and p-values, one column per model parameter
    for param, coef in result.params.items():
        res[f"Coef_{param}"] = coef
    for param, pval in result.pvalues.items():
        res[f"Pval_{param}"] = pval

    return res


# ----------------------------
# MAIN EXECUTION
# ----------------------------
def main():
    """Run the per-project regression for every CSV in INPUT_DIR and save the summary rows."""
    csv_names = [name for name in sorted(os.listdir(INPUT_DIR)) if name.endswith(".csv")]

    summaries = []
    for csv_name in csv_names:
        project_name = csv_name.replace("_all.csv", "")
        print(f"\nüîç Running regression for: {project_name}")

        frame = pd.read_csv(os.path.join(INPUT_DIR, csv_name))
        if frame.empty:
            print(f"‚ö†Ô∏è Skipping {project_name}: empty dataset.")
            continue

        # A falsy return value means the regression was skipped or failed.
        outcome = run_logistic_regression(project_name, frame)
        if outcome:
            summaries.append(outcome)

    if not summaries:
        print("\n‚ùå No successful regressions.")
        return

    pd.DataFrame(summaries).to_csv(OUTPUT_CSV, index=False)
    print(f"\nüìä Saved logistic regression summary: {OUTPUT_CSV}")


# Run the regressions only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
# Logistic regression on the combined data of all projects
import numpy as np
import os
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Path to the project-level fault datasets (one merged CSV per project)
base_dir = "new smells/On extended dataset/project level fault datasets"

# Outcome column of the fault datasets. The previous version filtered on
# "fa_prone" and then accessed "change_prone", which is not kept by the
# column filter, so the cell failed with a KeyError.
target_col = "fault_prone"
smell_vars = ["LanguageEnvy", "CrossRespDecl", "ShotgunSurgery", "EncapsulationBypass"]

# Collect all CSV files (sorted so runs are reproducible)
project_dfs = []
for file in sorted(os.listdir(base_dir)):
    if not file.endswith(".csv"):
        continue
    project_name = file.replace(".csv", "")
    df = pd.read_csv(os.path.join(base_dir, file))

    if target_col not in df.columns:
        print(f"‚ö†Ô∏è Skipping {file}: no '{target_col}' column.")
        continue

    # Tag every row with its project so it can act as a fixed effect
    df["project"] = project_name

    # Keep only relevant columns (smells that are present, target, project)
    expected_cols = smell_vars + [target_col, "project"]
    df = df[[col for col in expected_cols if col in df.columns]]

    # Drop rows with a missing target, then keep only binary 0/1 labels.
    # Re-assigning (instead of inplace=True on a column slice) avoids
    # chained-assignment warnings.
    df = df.dropna(subset=[target_col])
    df = df[df[target_col].isin([0, 1])]

    project_dfs.append(df)

if not project_dfs:
    # pd.concat would fail with a less helpful "No objects to concatenate"
    raise ValueError(f"No usable project datasets found in: {base_dir}")

# Combine all project data into one DataFrame
all_data = pd.concat(project_dfs, ignore_index=True)

print(f"‚úÖ Combined dataset shape: {all_data.shape}")
print(f"üìä Projects included: {all_data['project'].unique().tolist()}")

# Convert categorical project variable
all_data["project"] = all_data["project"].astype("category")


def _zscore_within_project(values):
    """Standardise one project's values; a constant column becomes all zeros."""
    spread = values.std(ddof=0)
    if spread == 0:
        return pd.Series(0.0, index=values.index)
    return (values - values.mean()) / spread


# Optional normalization for predictors (per project)
for col in smell_vars:
    if col in all_data.columns:
        all_data[col] = all_data.groupby("project", observed=True)[col].transform(
            _zscore_within_project
        )

# -------------------------------------------------------------
# Global Logistic Regression (with project as fixed effect)
# -------------------------------------------------------------
formula = f"{target_col} ~ {' + '.join(smell_vars)} + C(project)"
model = smf.logit(formula=formula, data=all_data).fit()

print("\n\nüîç GLOBAL LOGISTIC REGRESSION RESULTS")
print(model.summary())

# Save coefficients, p-values and odds ratios (exp of the coefficient)
summary_df = pd.DataFrame({
    "Variable": model.params.index,
    "Coefficient": model.params.values,
    "p-value": model.pvalues.values,
    "Odds Ratio": np.round(np.exp(model.params.values), 3),
})

# Filter only smell variables for interpretability
significant_vars = summary_df[summary_df["Variable"].isin(smell_vars) & (summary_df["p-value"] < 0.05)]

print("\n\nüìà SIGNIFICANT PREDICTORS (p < 0.05):")
print(significant_vars if not significant_vars.empty else "None")

# Save the results to a CSV file. The name says "fault" because this cell
# analyses the fault datasets; the former "..._change_results.csv" name
# would overwrite the output of a change-proneness analysis.
output_path = "new smells/On extended dataset/global_logistic_fault_results.csv"
summary_df.to_csv(output_path, index=False)
print(f"\nüíæ Results saved to: {output_path}")
