In [None]:
#changed files using pydriller
import os
import csv
from pydriller import Repository
from git import Repo, GitCommandError
from collections import defaultdict
from datetime import datetime

def detect_fault_inducing_commits(repo_path, release_pairs, output_dir="buggy_files_reports",
                                  csv_prefix="sqlite"):
    """
    Blame the lines touched by commits between release tags and report the files involved.

    For each (start_release, end_release) pair, commits whose committer date lies
    between the two tagged commits are traversed with PyDriller. In this variant
    every such commit is analysed (the is_fix_commit filter below is commented out).
    For each modified file, the lines of the parent version that the commit changed
    are blamed, and one CSV row is written per (file, analysed commit, blamed commit).

    Args:
        repo_path: Path to a local git clone; passed to both GitPython and PyDriller.
        release_pairs: Iterable of (start_tag_name, end_tag_name) tuples.
        output_dir: Directory for the per-release CSV reports; created if missing.
        csv_prefix: Prefix of the report file name, i.e.
            ``<csv_prefix>-<start_release>.csv``. Defaults to the previously
            hard-coded value so existing callers get the same file names.

    Returns:
        dict mapping (start_release, end_release) to the set of file paths for
        which at least one blamed commit was found. A pair that raises is
        reported on stdout and left out of the result.
    """
    os.makedirs(output_dir, exist_ok=True)
    git_repo = Repo(repo_path)
    results = {}

    for start_release, end_release in release_pairs:
        try:
            print(f"\nAnalyzing {start_release} to {end_release}")

            # Resolve both tags first so an unknown tag aborts this pair before
            # any report file is created.
            start_commit = git_repo.tags[start_release].commit
            end_commit = git_repo.tags[end_release].commit

            csv_filename = os.path.join(output_dir, f"{csv_prefix}-{start_release}.csv")
            buggy_files = set()
            fix_count = 0

            # Explicit UTF-8: commit messages may contain non-ASCII characters.
            with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=[
                    'buggy_file_path', 'fix_commit_hash', 'fix_date',
                    'buggy_commit_hash', 'buggy_date', 'fix_message'
                ])
                writer.writeheader()

                # Traverse commits with PyDriller. The window is bounded by the
                # committer dates of the two tagged commits.
                for commit in Repository(
                    repo_path,
                    since=start_commit.committed_datetime,
                    to=end_commit.committed_datetime,
                    only_modifications_with_file_types=['.java', '.cpp', '.c', '.h']  # Filter by file type
                ).traverse_commits():

                    # Fix-commit filter disabled in this cell: all commits are analysed.
#                     if not is_fix_commit(commit.msg):
#                         continue

                    fix_count += 1
                    print(f"  Fix commit: {commit.hash[:8]} - {commit.msg[:50]}...")

                    for modified_file in commit.modified_files:
                        # new_path is falsy for deleted files; nothing to compare.
                        if not modified_file.new_path:
                            continue

                        try:
                            # Path of the file as it existed in the parent commit
                            # (differs from new_path when the commit renames it).
                            old_path = modified_file.old_path or modified_file.new_path
                            previous_contents = git_repo.git.show(f"{commit.hash}^:{old_path}")
                            current_contents = git_repo.git.show(f"{commit.hash}:{modified_file.new_path}")

                            # Get changed lines
                            diff = get_changed_lines(previous_contents, current_contents)
                            if not diff:
                                continue

                            # Blame the parent version. The parent only knows the
                            # file under old_path, so blaming new_path would fail
                            # for renamed files.
                            blame_output = git_repo.git.blame(
                                '-w', '-l', '-p',  # -w ignores whitespace, -p porcelain output
                                f"{commit.hash}^",  # Look at parent commit
                                '--', old_path
                            )

                            buggy_commits = parse_blame_for_lines(blame_output, diff)

                            for buggy_hash in buggy_commits:
                                try:
                                    buggy_commit = git_repo.commit(buggy_hash)
                                    buggy_file_path = old_path

                                    buggy_files.add(buggy_file_path)
                                    writer.writerow({
                                        'buggy_file_path': buggy_file_path,
                                        'fix_commit_hash': commit.hash,
                                        'fix_date': commit.committer_date,
                                        'buggy_commit_hash': buggy_hash,
                                        'buggy_date': buggy_commit.committed_datetime,
                                        'fix_message': commit.msg[:200].replace('\n', ' ')
                                    })

                                    print(f"    Found buggy commit: {buggy_hash[:8]} for {modified_file.new_path}")

                                except Exception as e:
                                    # Best effort: one unreadable blamed commit must
                                    # not stop the remaining rows.
                                    print(f"    Error processing buggy commit: {str(e)}")
                                    continue

                        except GitCommandError as e:
                            # Raised e.g. when the file does not exist in the parent
                            # (newly added file); skip that file.
                            print(f"    Error processing {modified_file.new_path}: {str(e)}")
                            continue

            print(f"\nSummary for {start_release} to {end_release}:")
            print(f"  Fix commits analyzed: {fix_count}")
            print(f"  Buggy files found: {len(buggy_files)}")
            results[(start_release, end_release)] = buggy_files

        except Exception as e:
            print(f"Error processing {start_release}-{end_release}: {str(e)}")
            continue

    return results

def get_changed_lines(old_content, new_content):
    """Return the 1-based line numbers of old_content that new_content modifies or deletes.

    The two versions are aligned with a real diff, so inserting or removing a
    line does not mark every following line as changed. Line numbers refer to
    old_content because the caller matches them against a blame of the parent
    revision; pure insertions therefore contribute no line numbers.

    Args:
        old_content: Text of the file before the change.
        new_content: Text of the file after the change.

    Returns:
        Ascending list of 1-based line numbers in old_content (empty when
        nothing was modified or deleted).
    """
    import difflib  # local import: the cell's import list does not provide it

    old_lines = old_content.splitlines()
    new_lines = new_content.splitlines()
    matcher = difflib.SequenceMatcher(None, old_lines, new_lines)

    changed = []
    for tag, old_start, old_end, _new_start, _new_end in matcher.get_opcodes():
        # 'replace' and 'delete' are the opcodes that consume old lines;
        # 'insert' touches no existing line and 'equal' is unchanged.
        if tag in ('replace', 'delete'):
            changed.extend(range(old_start + 1, old_end + 1))
    return changed

def parse_blame_for_lines(blame_output, target_lines):
    """Return the commits that `git blame -p` attributes to the given line numbers.

    In porcelain output every blamed line is introduced by a header of the form
    ``<sha> <orig_line> <final_line> [<group_size>]`` and followed (after
    optional metadata lines such as ``author`` or ``filename``) by the line
    content prefixed with a tab. The line number is read from the header
    instead of being counted, so metadata lines cannot shift the numbering.

    Args:
        blame_output: Text produced by ``git blame -p``.
        target_lines: Iterable of 1-based line numbers in the blamed file.

    Returns:
        Set of commit hashes responsible for the target lines.
    """
    import re  # local import: the cell's import list does not provide it

    header_pattern = re.compile(r'^([0-9a-f]{40,}) (\d+) (\d+)')
    wanted = set(target_lines)
    commits = set()
    commit_hash = None
    final_line = None

    for line in blame_output.split('\n'):
        if line.startswith('\t'):
            # Content line: belongs to the most recent header.
            if commit_hash and final_line in wanted:
                commits.add(commit_hash)
            continue

        header = header_pattern.match(line)
        if header:
            commit_hash = header.group(1)
            final_line = int(header.group(3))

    return commits


def is_fix_commit(commit_message):
    """Heuristically decide whether a commit message describes a fix.

    The lower-cased message is accepted when it contains a '#' or the word
    'issue', when any whitespace-separated token consists only of digits, or
    when any of the keywords below occurs as a substring.

    Args:
        commit_message: Full commit message text.

    Returns:
        True if any of the heuristics matches, otherwise False.
    """
    text = commit_message.lower()

    # Issue-tracker style references.
    if '#' in text or 'issue' in text:
        return True

    # A bare number is treated as a bug id.
    if any(token.isdigit() for token in text.split()):
        return True

    # Substring match, so e.g. "fix" also matches "fixed" and "prefix".
    error_keywords = (
        "fix", "crash", "resolves", "regression", "fall back", "assertion", "coverity",
        "reproducible", "stack-wanted", "steps-wanted", "testcase", "fail", "npe", "except",
        "broken", "bug", "differential testing", "error", "address sanitizer", "hang",
        "perma orange", "random orange", "intermittent", "steps to reproduce", "leak",
        "stack trace", "heap overflow", "freeze", "problem", "overflow", "avoid", "issue",
        "workaround", "break", "stop",
    )
    for keyword in error_keywords:
        if keyword in text:
            return True
    return False
# Define the release pairs you want to analyze
# Each entry is a (start_tag, end_tag) tuple of git tag names; the analysis
# function unpacks every entry as (start_release, end_release).
# Only the block for the project currently under study is left uncommented
# (here: sqlite); the other projects' pairs are kept commented out so they can
# be switched in by hand.
release_pairs_to_analyze = [
#   javacpp  ("0.5", "0.9"),
#         ("0.9", "1.1"),
#         ("1.1", "1.2"),
#         ("1.2", "1.2.1"),
#         ("1.2.1", "1.2.7"),
#         ("1.2.7", "1.3"),
#         ("1.3", "1.3.2"),
#         ("1.3.2", "1.4"),
#         ("1.4", "1.4.2"),
#         ("1.4.2", "1.4.4"),
#         ("1.4.4", "1.5"),
#         ("1.5", "1.5.1-1"),
#         ("1.5.1-1", "1.5.2"),
#       rocksdb ("v5.0.2", "v5.4.6"),
#         ("v5.4.6", "v5.6.2"),
#         ("v5.6.2", "v5.9.2"),
#         ("v5.9.2", "v5.11.2"),
#         ("v5.11.2", "v5.14.3"),
#         ("v5.14.3", "v5.17.2"),
#         ("v5.17.2", "v5.18.3"),
#         ("v5.18.3", "v6.1.1"),
#         ("v6.1.1", "v6.2.2"),
#         ("v6.2.2", "v6.2.4")
#    jpype   ("v0.5.4.5", "v0.5.5.1"),
#         ("v0.5.5.1", "v0.5.5.4"),
#         ("v0.5.5.4", "v0.5.6"),
#         ("v0.5.6", "v0.5.7"),
#         ("v0.5.7", "v0.6.0"),
#         ("v0.6.0", "v0.6.1"),
#         ("v0.6.1", "v0.6.2"),
#         ("v0.6.2", "v0.6.3"),
#         ("v0.6.3", "v0.7"),
#         ("v0.7", "v0.7.1"),
#         ("v0.7.1", "v0.7.2")
#      realm-java  ("v0.90.0", "v1.2.0"),
#        ("v1.2.0", "v2.3.2"),
#         ("v2.3.2", "v3.7.2"),
#         ("v3.7.2", "v4.4.0"),
#         ("v4.4.0", "v5.4.0"),
#         ("v5.4.0", "v5.7.1"),
#         ("v5.7.1", "v5.9.0"),
#         ("v5.9.0", "v5.11.0"),
#         ("v5.11.0", "v5.15.0"),
#         ("v5.15.0", "v6.0.0"),
#      zstd-jni  ("v0.4.4", "v1.3.0-1"),
#        ("v1.3.0-1", "v1.3.2-2"),
#         ("v1.3.2-2", "v1.3.3-1"),
#         ("v1.3.3-1", "v1.3.4-1"),
#         ("v1.3.4-1", "v1.3.4-8"),
#         ("v1.3.4-8", "v1.3.5-3"),
#         ("v1.3.5-3", "v1.3.7-1"),
#         ("v1.3.7-1", "v1.3.8-1"),
#         ("v1.3.8-1", "v1.4.0-1"),
#         ("v1.4.0-1", "v1.4.2-1"),
#         ("v1.4.2-1", "v1.4.4-3")
#     conscrypt  ("1.0.0.RC2", "1.0.0.RC8"),
#         ("1.0.0.RC8", "1.0.0.RC11"),
#         ("1.0.0.RC11", "1.0.0.RC14"),
#         ("1.0.0.RC14", "1.0.1"),
#         ("1.0.1", "1.0.2"),
#         ("1.0.2", "1.1.1"),
#         ("1.1.1", "1.2.0"),
#         ("1.2.0", "1.4.2"),
#         ("1.4.2", "2.1.0"),
#         ("2.1.0", "2.2.1")
#  java-smt   ("0.1", "0.3"),
#     ("0.3", "0.5"),
  #   ("0.5", "0.60"),
#     ("0.60", "1.0.1"),
#         ("1.0.1", "2.0.0"),
#         ("2.0.0", "2.0.0-alpha"),
#     ("2.0.0-alpha", "2.2.0"),
     #    ("2.2.0", "3.0.0"),
#     ("3.0.0", "3.1.0"),
#         ("3.1.0", "3.3.0")
# vlc  ("2.5.4", "3.0.0"),
#     ("3.0.0", "3.0.11"),
#         ("3.0.11", "3.0.13"),
#         ("3.0.13", "3.0.92"),
#         ("3.0.92", "3.0.96"),
#         ("3.0.96", "3.1.0"),
#         ("3.1.0", "3.1.2"),
#         ("3.1.2", "3.1.6"),
#         ("3.1.6", "3.1.7"),
#         ("3.1.7", "3.2.2")
# pljava    ("V1_2_0", "V1_3_0"),
#     ("V1_3_0", "V1_4_0"),
#     ("V1_4_0", "V1_4_2"),
#     ("V1_4_2", "V1_4_3"),
#   ("V1_4_3", "REL1_5_STABLE-BASE"),
#         ("REL1_5_STABLE-BASE", "V1_5_0b3"),
#         ("V1_5_0b3", "V1_5_0"),
#         ("V1_5_0", "V1_5_1b1"),
#         ("V1_5_1b1", "V1_5_1b2"),
#         ("V1_5_1b2", "V1_5_2"),
#         ("V1_5_2", "V1_5_3"),
#         ("V1_5_3", "V1_5_5")
   #sqlite (active project)
    ("3.42.0.1", "3.44.0.0"),
    ("3.44.0.0", "3.45.0.0"),
    ("3.45.0.0", "3.45.2.0"),
    ("3.45.2.0", "3.46.0.0"),
    ("3.46.0.0", "3.46.1.1"),
    ("3.46.1.1", "3.47.0.0"),
    ("3.47.0.0", "3.47.2.0"),
    ("3.47.2.0", "3.49.0.0"),
    ("3.49.0.0", "3.50.1.0"),
    ("3.50.1.0", "3.50.3.0")
    #jni-bind
#     ("Release-0.8.0-alpha","Release-0.9.1-alpha"),
#     ("Release-0.9.1-alpha","Release-0.9.3-alpha"),
#     ("Release-0.9.3-alpha","Release-0.9.6-alpha"),
#     ("Release-0.9.6-alpha","Release-0.9.7-alpha"),
#     ("Release-0.9.7-alpha","Release-0.9.8-alpha"),
#     ("Release-0.9.8-alpha","Release-0.9.9-alpha"),
#     ("Release-0.9.9-alpha","Release-1.0.0-beta"),
#     ("Release-1.0.0-beta","Release-1.1.0-beta"),
#     ("Release-1.1.0-beta","Release-1.1.2-beta"),
#     ("Release-1.1.2-beta","Release-1.2.3")
    #Monero-java
    
#     ("v0.8.9","v0.8.10"),
#     ("v0.8.10","v0.8.13"),
#     ("v0.8.13","v0.8.17"),
#     ("v0.8.17","v0.8.24"),
#     ("v0.8.24","v0.8.31"),
#     ("v0.8.31","v0.8.35"),
#     ("v0.8.35","v0.8.36"),
#     ("v0.8.36","v0.8.37"),
#     ("v0.8.37","v0.8.38"),
#     ("v0.8.38","v0.8.39")
    #webrtc
#     ("v0.2.0", "v0.3.0"),
#     ("v0.3.0", "v0.4.0"),
#     ("v0.4.0", "v0.6.0"),
#     ("v0.6.0", "v0.7.0"),
#     ("v0.7.0", "v0.8.0"),
#     ("v0.8.0", "v0.10.0"),
#     ("v0.10.0", "v0.11.0"),
#     ("v0.11.0", "v0.12.0"),
#     ("v0.12.0", "v0.13.0"),
#     ("v0.13.0", "v0.14.0")
    #wolfcrypt
#     ("v1.0.0-stable", "v1.1.0-stable"),
#     ("v1.1.0-stable", "v1.2.0-stable"),
#     ("v1.2.0-stable", "v1.3.0-stable"),
#     ("v1.3.0-stable", "v1.5.0-stable"),
#     ("v1.5.0-stable", "v1.6.0-stable"),
#     ("v1.6.0-stable", "v1.7.0-stable"),
#     ("v1.7.0-stable", "v1.8.0-stable")
    #wolfssl
#     ("v1.4.0-stable", "v1.5.0-stable"),
#     ("v1.5.0-stable", "v1.6.0-stable"),
#     ("v1.6.0-stable", "v1.8.0-stable"),
#     ("v1.8.0-stable", "v1.9.0-stable"),
#     ("v1.9.0-stable", "v1.11.0-stable"),
#     ("v1.11.0-stable", "v1.12.0-stable"),
#     ("v1.12.0-stable", "v1.12.2"),
#     ("v1.12.2", "v1.13.0-stable"),
#     ("v1.13.0-stable", "v1.14.0-stable"),
#     ("v1.14.0-stable", "v1.15.0-stable")
    
]

# Run the analysis
# NOTE(review): repo_path is presumably a local clone checked out at the last
# release; the tag names above must exist in that clone — confirm.
# One CSV per release pair is written into output_dir.
results = detect_fault_inducing_commits(
    repo_path="revision projects/sqlite/3.50.3.0",
    release_pairs=release_pairs_to_analyze,
    output_dir="buggy_smelly/revision/general_changed"
)

# Access results programmatically if needed
# results maps (start_release, end_release) -> set of file paths.
for release_pair, buggy_files in results.items():
    print(f"Between {release_pair[0]} and {release_pair[1]}, found {len(buggy_files)} buggy files")

In [None]:
#faulty files using pydriller and fault inducing commits
import os
import csv
from pydriller import Repository
from git import Repo, GitCommandError
from collections import defaultdict
from datetime import datetime

def detect_fault_inducing_commits(repo_path, release_pairs, output_dir="buggy_files_reports",
                                  csv_prefix="wolfssl"):
    """
    Trace fix commits between release tags back to the commits that last touched the fixed lines.

    For each (start_release, end_release) pair, commits whose committer date lies
    between the two tagged commits are traversed with PyDriller and kept only if
    is_fix_commit accepts their message. For each file a fix modifies, the lines
    of the parent version that the fix changed are blamed, and one CSV row is
    written per (file, fix commit, blamed commit).

    Args:
        repo_path: Path to a local git clone; passed to both GitPython and PyDriller.
        release_pairs: Iterable of (start_tag_name, end_tag_name) tuples.
        output_dir: Directory for the per-release CSV reports; created if missing.
        csv_prefix: Prefix of the report file name, i.e.
            ``<csv_prefix>-<start_release>.csv``. Defaults to the previously
            hard-coded value so existing callers get the same file names.

    Returns:
        dict mapping (start_release, end_release) to the set of file paths for
        which at least one blamed commit was found. A pair that raises is
        reported on stdout and left out of the result.
    """
    os.makedirs(output_dir, exist_ok=True)
    git_repo = Repo(repo_path)
    results = {}

    for start_release, end_release in release_pairs:
        try:
            print(f"\nAnalyzing {start_release} to {end_release}")

            # Resolve both tags first so an unknown tag aborts this pair before
            # any report file is created.
            start_commit = git_repo.tags[start_release].commit
            end_commit = git_repo.tags[end_release].commit

            csv_filename = os.path.join(output_dir, f"{csv_prefix}-{start_release}.csv")
            buggy_files = set()
            fix_count = 0

            # Explicit UTF-8: commit messages may contain non-ASCII characters.
            with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=[
                    'buggy_file_path', 'fix_commit_hash', 'fix_date',
                    'buggy_commit_hash', 'buggy_date', 'fix_message'
                ])
                writer.writeheader()

                # Traverse commits with PyDriller. The window is bounded by the
                # committer dates of the two tagged commits.
                for commit in Repository(
                    repo_path,
                    since=start_commit.committed_datetime,
                    to=end_commit.committed_datetime,
                    only_modifications_with_file_types=['.java', '.cpp', '.c', '.h']  # Filter by file type
                ).traverse_commits():

                    if not is_fix_commit(commit.msg):
                        continue

                    fix_count += 1
                    print(f"  Fix commit: {commit.hash[:8]} - {commit.msg[:50]}...")

                    for modified_file in commit.modified_files:
                        # new_path is falsy for deleted files; nothing to compare.
                        if not modified_file.new_path:
                            continue

                        try:
                            # Path of the file as it existed in the parent commit
                            # (differs from new_path when the commit renames it).
                            old_path = modified_file.old_path or modified_file.new_path
                            previous_contents = git_repo.git.show(f"{commit.hash}^:{old_path}")
                            current_contents = git_repo.git.show(f"{commit.hash}:{modified_file.new_path}")

                            # Get changed lines
                            diff = get_changed_lines(previous_contents, current_contents)
                            if not diff:
                                continue

                            # Blame the parent version. The parent only knows the
                            # file under old_path, so blaming new_path would fail
                            # for renamed files.
                            blame_output = git_repo.git.blame(
                                '-w', '-l', '-p',  # -w ignores whitespace, -p porcelain output
                                f"{commit.hash}^",  # Look at parent commit
                                '--', old_path
                            )

                            buggy_commits = parse_blame_for_lines(blame_output, diff)

                            for buggy_hash in buggy_commits:
                                try:
                                    buggy_commit = git_repo.commit(buggy_hash)
                                    buggy_file_path = old_path

                                    buggy_files.add(buggy_file_path)
                                    writer.writerow({
                                        'buggy_file_path': buggy_file_path,
                                        'fix_commit_hash': commit.hash,
                                        'fix_date': commit.committer_date,
                                        'buggy_commit_hash': buggy_hash,
                                        'buggy_date': buggy_commit.committed_datetime,
                                        'fix_message': commit.msg[:200].replace('\n', ' ')
                                    })

                                    print(f"    Found buggy commit: {buggy_hash[:8]} for {modified_file.new_path}")

                                except Exception as e:
                                    # Best effort: one unreadable blamed commit must
                                    # not stop the remaining rows.
                                    print(f"    Error processing buggy commit: {str(e)}")
                                    continue

                        except GitCommandError as e:
                            # Raised e.g. when the file does not exist in the parent
                            # (newly added file); skip that file.
                            print(f"    Error processing {modified_file.new_path}: {str(e)}")
                            continue

            print(f"\nSummary for {start_release} to {end_release}:")
            print(f"  Fix commits analyzed: {fix_count}")
            print(f"  Buggy files found: {len(buggy_files)}")
            results[(start_release, end_release)] = buggy_files

        except Exception as e:
            print(f"Error processing {start_release}-{end_release}: {str(e)}")
            continue

    return results

def get_changed_lines(old_content, new_content):
    """Return the 1-based line numbers of old_content that new_content modifies or deletes.

    The two versions are aligned with a real diff, so inserting or removing a
    line does not mark every following line as changed. Line numbers refer to
    old_content because the caller matches them against a blame of the parent
    revision; pure insertions therefore contribute no line numbers.

    Args:
        old_content: Text of the file before the change.
        new_content: Text of the file after the change.

    Returns:
        Ascending list of 1-based line numbers in old_content (empty when
        nothing was modified or deleted).
    """
    import difflib  # local import: the cell's import list does not provide it

    old_lines = old_content.splitlines()
    new_lines = new_content.splitlines()
    matcher = difflib.SequenceMatcher(None, old_lines, new_lines)

    changed = []
    for tag, old_start, old_end, _new_start, _new_end in matcher.get_opcodes():
        # 'replace' and 'delete' are the opcodes that consume old lines;
        # 'insert' touches no existing line and 'equal' is unchanged.
        if tag in ('replace', 'delete'):
            changed.extend(range(old_start + 1, old_end + 1))
    return changed

def parse_blame_for_lines(blame_output, target_lines):
    """Return the commits that `git blame -p` attributes to the given line numbers.

    In porcelain output every blamed line is introduced by a header of the form
    ``<sha> <orig_line> <final_line> [<group_size>]`` and followed (after
    optional metadata lines such as ``author`` or ``filename``) by the line
    content prefixed with a tab. The line number is read from the header
    instead of being counted, so metadata lines cannot shift the numbering.

    Args:
        blame_output: Text produced by ``git blame -p``.
        target_lines: Iterable of 1-based line numbers in the blamed file.

    Returns:
        Set of commit hashes responsible for the target lines.
    """
    import re  # local import: the cell's import list does not provide it

    header_pattern = re.compile(r'^([0-9a-f]{40,}) (\d+) (\d+)')
    wanted = set(target_lines)
    commits = set()
    commit_hash = None
    final_line = None

    for line in blame_output.split('\n'):
        if line.startswith('\t'):
            # Content line: belongs to the most recent header.
            if commit_hash and final_line in wanted:
                commits.add(commit_hash)
            continue

        header = header_pattern.match(line)
        if header:
            commit_hash = header.group(1)
            final_line = int(header.group(3))

    return commits


def is_fix_commit(commit_message):
    """Heuristically decide whether a commit message describes a fix.

    The lower-cased message is accepted when it contains a '#' or the word
    'issue', when any whitespace-separated token consists only of digits, or
    when any of the keywords below occurs as a substring.

    Args:
        commit_message: Full commit message text.

    Returns:
        True if any of the heuristics matches, otherwise False.
    """
    text = commit_message.lower()

    # Issue-tracker style references.
    if '#' in text or 'issue' in text:
        return True

    # A bare number is treated as a bug id.
    if any(token.isdigit() for token in text.split()):
        return True

    # Substring match, so short entries such as "str" or "fix" also match
    # inside longer words.
    error_keywords = (
        "fix", "crash", "resolves", "regression", "fall back", "assertion", "coverity",
        "reproducible", "stack-wanted", "steps-wanted", "testcase", "fail", "npe", "except",
        "broken", "bug", "differential testing", "error", "address sanitizer", "hang",
        "perma orange", "random orange", "intermittent", "steps to reproduce", "assertion", "leak",
        "stack trace", "heap overflow", "freez", "str", "problem", "overflow", "avoid", "issue",
        "workaround", "break", "stop",
    )
    for keyword in error_keywords:
        if keyword in text:
            return True
    return False
# Define the release pairs you want to analyze
# Each entry is a (start_tag, end_tag) tuple of git tag names; the analysis
# function unpacks every entry as (start_release, end_release).
# Only the block for the project currently under study is left uncommented
# (here: wolfssl); the other projects' pairs are kept commented out so they can
# be switched in by hand.
release_pairs_to_analyze = [
#    javacpp ("0.5", "0.9"),
#         ("0.9", "1.1"),
#         ("1.1", "1.2"),
#         ("1.2", "1.2.1"),
#         ("1.2.1", "1.2.7"),
#         ("1.2.7", "1.3"),
#         ("1.3", "1.3.2"),
#         ("1.3.2", "1.4"),
#         ("1.4", "1.4.2"),
#         ("1.4.2", "1.4.4"),
#         ("1.4.4", "1.5"),
#         ("1.5", "1.5.1-1"),
#         ("1.5.1-1", "1.5.2"),

    #     ("v5.0.2", "v5.4.6"),
#         ("v5.4.6", "v5.6.2"),
#         ("v5.6.2", "v5.9.2"),
#         ("v5.9.2", "v5.11.2"),
#         ("v5.11.2", "v5.14.3"),
#         ("v5.14.3", "v5.17.2"),
#         ("v5.17.2", "v5.18.3"),
#         ("v5.18.3", "v6.1.1"),
#         ("v6.1.1", "v6.2.2"),
#         ("v6.2.2", "v6.2.4")
#     jpype ("v0.5.4.5", "v0.5.5.1"),
#         ("v0.5.5.1", "v0.5.5.4"),
#         ("v0.5.5.4", "v0.5.6"),
#         ("v0.5.6", "v0.5.7"),
#         ("v0.5.7", "v0.6.0"),
#         ("v0.6.0", "v0.6.1"),
#         ("v0.6.1", "v0.6.2"),
#         ("v0.6.2", "v0.6.3"),
#         ("v0.6.3", "v0.7"),
      #  ("v0.7", "v0.7.1")
   # ("v0.7.1", "v0.7.2")
  
#     realm-java   ("v0.90.0", "v1.2.0"),
#        ("v1.2.0", "v2.3.2"),
#         ("v2.3.2", "v3.7.2"),
#         ("v3.7.2", "v4.4.0"),
#         ("v4.4.0", "v5.4.0"),
#         ("v5.4.0", "v5.7.1"),
#         ("v5.7.1", "v5.9.0"),
#         ("v5.9.0", "v5.11.0"),
#         ("v5.11.0", "v5.15.0"),
#         ("v5.15.0", "v6.0.0"),
#     zstd-jni   ("v0.4.4", "v1.3.0-1"),
#        ("v1.3.0-1", "v1.3.2-2"),
#         ("v1.3.2-2", "v1.3.3-1"),
#         ("v1.3.3-1", "v1.3.4-1"),
#         ("v1.3.4-1", "v1.3.4-8"),
#         ("v1.3.4-8", "v1.3.5-3"),
#         ("v1.3.5-3", "v1.3.7-1"),
#         ("v1.3.7-1", "v1.3.8-1"),
#         ("v1.3.8-1", "v1.4.0-1"),
#         ("v1.4.0-1", "v1.4.2-1"),
#         ("v1.4.2-1", "v1.4.4-3")
#      conscrypt ("1.0.0.RC2", "1.0.0.RC8"),
#         ("1.0.0.RC8", "1.0.0.RC11"),
#         ("1.0.0.RC11", "1.0.0.RC14"),
#         ("1.0.0.RC14", "1.0.1"),
#         ("1.0.1", "1.0.2"),
#         ("1.0.2", "1.1.1"),
#         ("1.1.1", "1.2.0"),
#         ("1.2.0", "1.4.2"),
#         ("1.4.2", "2.1.0"),
#         ("2.1.0", "2.2.1")
# java-smt ("0.1", "0.3"),
#     ("0.3", "0.5"),
 #   ("0.5", "0.60"),
#     ("0.60", "1.0.1"),
#         ("1.0.1", "2.0.0"),
#         ("2.0.0", "2.0.0-alpha"),
#     ("2.0.0-alpha", "2.2.0"),
  #      ("2.2.0", "3.0.0"),
#     ("3.0.0", "3.1.0"),
#         ("3.1.0", "3.3.0")
# vlc ("2.5.4", "3.0.0"),
#     ("3.0.0", "3.0.11"),
#         ("3.0.11", "3.0.13"),
#         ("3.0.13", "3.0.92"),
#         ("3.0.92", "3.0.96"),
#         ("3.0.96", "3.1.0"),
#         ("3.1.0", "3.1.2"),
#         ("3.1.2", "3.1.6"),
#         ("3.1.6", "3.1.7"),
#         ("3.1.7", "3.2.2")
# pljava  ("V1_2_0", "V1_3_0"),
#     ("V1_3_0", "V1_4_0"),
#     ("V1_4_0", "V1_4_2"),
#     ("V1_4_2", "V1_4_3"),
#   ("V1_4_3", "REL1_5_STABLE-BASE"),
#         ("REL1_5_STABLE-BASE", "V1_5_0b3"),
#         ("V1_5_0b3", "V1_5_0"),
#         ("V1_5_0", "V1_5_1b1"),
#         ("V1_5_1b1", "V1_5_1b2"),
#         ("V1_5_1b2", "V1_5_2"),
#         ("V1_5_2", "V1_5_3"),
#         ("V1_5_3", "V1_5_5")
    #sqlite
#     ("3.42.0.1", "3.44.0.0"),
#     ("3.44.0.0", "3.45.0.0"),
#     ("3.45.0.0", "3.45.2.0"),
#     ("3.45.2.0", "3.46.0.0"),
#     ("3.46.0.0", "3.46.1.1"),
#     ("3.46.1.1", "3.47.0.0"),
#     ("3.47.0.0", "3.47.2.0"),
#     ("3.47.2.0", "3.49.0.0"),
#     ("3.49.0.0", "3.50.1.0"),
#     ("3.50.1.0", "3.50.3.0")
    #jni-bind
#     ("Release-0.8.0-alpha","Release-0.9.1-alpha"),
#     ("Release-0.9.1-alpha","Release-0.9.3-alpha"),
#     ("Release-0.9.3-alpha","Release-0.9.6-alpha"),
#     ("Release-0.9.6-alpha","Release-0.9.7-alpha"),
#     ("Release-0.9.7-alpha","Release-0.9.8-alpha"),
#     ("Release-0.9.8-alpha","Release-0.9.9-alpha"),
#     ("Release-0.9.9-alpha","Release-1.0.0-beta"),
#     ("Release-1.0.0-beta","Release-1.1.0-beta"),
#     ("Release-1.1.0-beta","Release-1.1.2-beta"),
#     ("Release-1.1.2-beta","Release-1.2.3")
    #Monero-java
    
#     ("v0.8.9","v0.8.10"),
#     ("v0.8.10","v0.8.13"),
#     ("v0.8.13","v0.8.17"),
#     ("v0.8.17","v0.8.24"),
#     ("v0.8.24","v0.8.31"),
#     ("v0.8.31","v0.8.35"),
#     ("v0.8.35","v0.8.36"),
#     ("v0.8.36","v0.8.37"),
#     ("v0.8.37","v0.8.38"),
#     ("v0.8.38","v0.8.39")
    #webrtc
#     ("v0.2.0", "v0.3.0"),
#     ("v0.3.0", "v0.4.0"),
#     ("v0.4.0", "v0.6.0"),
#     ("v0.6.0", "v0.7.0"),
#     ("v0.7.0", "v0.8.0"),
#     ("v0.8.0", "v0.10.0"),
#     ("v0.10.0", "v0.11.0"),
#     ("v0.11.0", "v0.12.0"),
#     ("v0.12.0", "v0.13.0"),
#     ("v0.13.0", "v0.14.0")
    #wolfcrypt
#     ("v1.0.0-stable", "v1.1.0-stable"),
#     ("v1.1.0-stable", "v1.2.0-stable"),
#     ("v1.2.0-stable", "v1.3.0-stable"),
#     ("v1.3.0-stable", "v1.5.0-stable"),
#     ("v1.5.0-stable", "v1.6.0-stable"),
#     ("v1.6.0-stable", "v1.7.0-stable"),
#     ("v1.7.0-stable", "v1.8.0-stable")
    #wolfssl (active project)
    ("v1.4.0-stable", "v1.5.0-stable"),
    ("v1.5.0-stable", "v1.6.0-stable"),
    ("v1.6.0-stable", "v1.8.0-stable"),
    ("v1.8.0-stable", "v1.9.0-stable"),
    ("v1.9.0-stable", "v1.11.0-stable"),
    ("v1.11.0-stable", "v1.12.0-stable"),
    ("v1.12.0-stable", "v1.12.2"),
    ("v1.12.2", "v1.13.0-stable"),
    ("v1.13.0-stable", "v1.14.0-stable"),
    ("v1.14.0-stable", "v1.15.0-stable"),
    
]

# Run the analysis
# NOTE(review): repo_path is presumably a local clone checked out at the last
# release; the tag names above must exist in that clone — confirm.
# One CSV per release pair is written into output_dir.
results = detect_fault_inducing_commits(
    repo_path="revision projects/wolfssl/v1.15.0-stable",
    release_pairs=release_pairs_to_analyze,
    output_dir="buggy_smelly/revision/general_faulty"
)

# Access results programmatically if needed
# results maps (start_release, end_release) -> set of file paths.
for release_pair, buggy_files in results.items():
    print(f"Between {release_pair[0]} and {release_pair[1]}, found {len(buggy_files)} buggy files")

In [None]:
#filter out jni files from the general changed files and faulty files
import os
import pandas as pd
import csv

# Normalize the JNI file paths: they include the repository folder and project name, which the smelly and faulty file lists do not.
def transform_jni_path(path, project_name):
    """Strip the repository-folder and project-name fragments from a JNI file path.

    Args:
        path: Raw path value (converted with str() and stripped of surrounding whitespace).
        project_name: Project name whose ``<project_name>/`` fragment is removed.

    Returns:
        The path with every occurrence of the known fragments removed.
    """
    cleaned = str(path).strip()
    # Order matters: the longer "extra projects" prefix must go before the
    # shorter "revision projects/" prefix it starts with.
    fragments = (
        "revision projects/extra projects/",
        "revision projects/",
        f"{project_name}/",
    )
    for fragment in fragments:
        cleaned = cleaned.replace(fragment, "")
    return cleaned


# Directories
#changed_dir = "buggy_smelly/abidi/general_changed_faulty/changed_files"
changed_dir = "buggy_smelly/revision/general_faulty"
#buggy_dir = "buggy_smelly/abidi/general_changed_faulty/fault_fixing_changed_files"
jni_dir = "buggy_smelly/revision/JNIfiles"  # <-- JNI files by project
output_dir = "buggy_smelly/revision/FaultyJNI"

# The output directory may not exist yet; open() below would fail without it.
os.makedirs(output_dir, exist_ok=True)

# Get sorted file lists
changed_files = sorted(os.listdir(changed_dir))
jni_files = sorted(os.listdir(jni_dir))

print(len(changed_files))
print(len(jni_files))

# The JNI list is looked up under the same file name as the changed-files
# report, so pair the two by name instead of by position in the listings.
jni_available = set(jni_files)

for changed_file in changed_files:
    if not changed_file.endswith(".csv"):
        # Skip stray directory entries (e.g. notebook checkpoint folders).
        continue
    if changed_file not in jni_available:
        print(f"Skipping {changed_file}: no JNI file list named {changed_file} in {jni_dir}")
        continue

    project_name = changed_file.replace(".csv", "")

    changed_path = os.path.join(changed_dir, changed_file)
    jni_path = os.path.join(jni_dir, changed_file)

    # Read CSVs (first row is treated as the header by pandas' default)
    changed_df = pd.read_csv(changed_path)
    jni_df = pd.read_csv(jni_path)

    # Get sets of file names (first column assumed)
    changed_set = set(changed_df.iloc[:, 0].dropna())
    jni_set = set(transform_jni_path(p, project_name) for p in jni_df.iloc[:, 0].dropna())

    # Keep only the changed files that are also in the JNI list
    changed_jni = changed_set & jni_set

    with open(os.path.join(output_dir, f"{project_name}.csv"), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["File Path"])
        for fpath in sorted(changed_jni):
            writer.writerow([fpath])





In [None]:
# Compute, per project, the counts of smelly, changed/buggy, and smelly-and-buggy JNI files.
# A JNI file counts as smelly (or buggy) when its normalized path is contained in a smelly (or buggy) path, because those paths also carry system folder, project, and release names.
import os
import pandas as pd
import csv

# Normalization used for JNI / smelly / buggy paths
def transform_jni_path(path, project_name):
    """Normalize a JNI / smelly / buggy file path so the three lists can be compared.

    Backslashes become forward slashes, the known machine-specific folder
    prefixes and every ``<project_name>/`` fragment are removed, and leading
    ``/``, ``./`` and ``../`` segments plus trailing slashes are trimmed.
    Leading dots that belong to a name (e.g. ``.github``) are kept.

    Args:
        path: Raw path value (converted with str() and stripped of surrounding whitespace).
        project_name: Project name whose ``<project_name>/`` fragment is removed.

    Returns:
        The normalized path, or "" when the input is blank.
    """
    path = str(path).strip()
    if not path:
        return ""
    # unify separators
    path = path.replace("\\", "/")
    # remove common prefixes (keep / adjust as needed); the longer variants
    # come before the shorter ones they contain
    for prefix in (
        "/home/shahrukh/smellDetection/Detection/revision projects/extra projects/",
        "/home/shahrukh/smellDetection/Detection/cloned_abidi/",
        "/home/abhaya/shahrukh/cloned_abidi/remainingForEILC/",
        "/home/abhaya/shahrukh/cloned_abidi/",
        "cloned_abidi/",
    ):
        path = path.replace(prefix, "")
    # remove project name if present
    path = path.replace(f"{project_name}/", "")
    # Drop leading empty, "." and ".." segments. str.lstrip("./") would strip
    # a character set and also eat the dot of names such as ".github".
    segments = path.split("/")
    while segments and segments[0] in ("", ".", ".."):
        segments.pop(0)
    return "/".join(segments).strip("/")

# Directories
smelly_dir = "buggy_smelly/revision/generalSmelly"
buggy_dir = "buggy_smelly/revision/ChangedJNI"
jni_dir = "buggy_smelly/revision/JNIfiles"  # <-- JNI files by project
output_csv = "buggy_smelly/revision/fisher/data_for_change_analysis.csv"


def _read_first_column(csv_path):
    """Return the non-null first-column values of csv_path as strings ([] if unreadable or empty)."""
    try:
        # header=None: the first row is read as data, not as a header.
        frame = pd.read_csv(csv_path, header=None, dtype=str)
    except Exception as e:
        # Deliberate best effort: an empty or broken CSV counts as "no paths".
        print(f"Warning: couldn't read {csv_path}: {e}")
        return []
    if frame.shape[1] < 1:
        return []
    return frame.iloc[:, 0].dropna().tolist()


# Get sorted file lists (only csv files)
smelly_files = sorted([f for f in os.listdir(smelly_dir) if f.endswith(".csv")])
buggy_files = sorted([f for f in os.listdir(buggy_dir) if f.endswith(".csv")])
jni_files = sorted([f for f in os.listdir(jni_dir) if f.endswith(".csv")])

# The three lists are paired by sorted position. zip() stops at the shortest
# list without complaint, so report a count mismatch instead of silently
# dropping or misaligning projects.
if not (len(smelly_files) == len(buggy_files) == len(jni_files)):
    print(f"Warning: CSV counts differ (smelly={len(smelly_files)}, buggy={len(buggy_files)}, "
          f"jni={len(jni_files)}); files are paired by sorted position, so pairs may be "
          f"misaligned and surplus files are ignored.")

# The output folder may not exist yet; open() below would fail without it.
os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)

with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([
        "Project Name", "JNI Files (count)", "Smelly JNI (count)", "Buggy JNI (count)",
        "Smelly But Not Buggy", "Buggy But Not Smelly", "Both Smelly & Buggy", "Neither"
    ])

    for smelly_file, buggy_file, jni_file in zip(smelly_files, buggy_files, jni_files):
        project_name = smelly_file.replace(".csv", "")

        smelly_path = os.path.join(smelly_dir, smelly_file)
        buggy_path = os.path.join(buggy_dir, buggy_file)
        jni_path = os.path.join(jni_dir, jni_file)

        # Normalize all paths with the same function so containment checks are consistent
        smelly_paths = [transform_jni_path(p, project_name) for p in _read_first_column(smelly_path)]
        buggy_paths = [transform_jni_path(p, project_name) for p in _read_first_column(buggy_path)]
        jni_paths = [transform_jni_path(p, project_name) for p in _read_first_column(jni_path)]

        # Unique, non-empty JNI paths
        jni_set = {p for p in jni_paths if p}

        # A JNI file is smelly / buggy when its path is a substring of any
        # smelly / buggy path.
        smelly_jni = {jni for jni in jni_set if any(jni in sp for sp in smelly_paths)}
        buggy_jni = {jni for jni in jni_set if any(jni in bp for bp in buggy_paths)}

        # Compute metrics
        jni_count = len(jni_set)
        smelly_count = len(smelly_jni)
        buggy_count = len(buggy_jni)
        smelly_not_buggy = len(smelly_jni - buggy_jni)
        buggy_not_smelly = len(buggy_jni - smelly_jni)
        both = len(smelly_jni & buggy_jni)
        neither = jni_count - (smelly_not_buggy + buggy_not_smelly + both)

        print(f"Processed {project_name}: JNI={jni_count}, Smelly={smelly_count}, Buggy={buggy_count}, "
              f"Smelly_Not_Buggy={smelly_not_buggy}, Buggy_Not_Smelly={buggy_not_smelly}, Both={both}, Neither={neither}")

        writer.writerow([
            project_name, jni_count, smelly_count, buggy_count,
            smelly_not_buggy, buggy_not_smelly, both, neither
        ])

print("✅ Processing complete. Results saved to:", output_csv)


In [None]:
#fisher exact test
import pandas as pd
import scipy.stats as stats
import numpy as np

def fisher_test_with_adjustment(a, b, c, d, confidence_level=0.95, alternative='greater'):
    """
    Fisher's exact test plus an odds ratio and Wald confidence interval for a 2x2 table.

    The p-value is always computed from the raw integer counts. The odds ratio
    and its confidence interval use the Haldane-Anscombe correction (0.5 added
    to every cell) whenever at least one cell is zero.

    Args:
        a, b, c, d: Cell counts of the 2x2 contingency table [[a, b], [c, d]].
        confidence_level: CI level (default = 95%).
        alternative: 'greater' (default), 'less', or 'two-sided'.

    Returns:
        Tuple: (odds_ratio, p_value, (ci_low, ci_high)). ``p_value`` is NaN
        when the exact test cannot be evaluated; the CI bounds are NaN when
        the interval cannot be computed.
    """
    # Fisher's exact test is defined on integer counts, so evaluate it on the
    # uncorrected table rather than on the 0.5-shifted values used below.
    try:
        raw_table = [[int(a), int(b)], [int(c), int(d)]]
        _, p_value = stats.fisher_exact(raw_table, alternative=alternative)
    except (ValueError, OverflowError):
        p_value = np.nan

    # Apply Haldane-Anscombe correction if any value is zero
    if 0 in [a, b, c, d]:
        a, b, c, d = a + 0.5, b + 0.5, c + 0.5, d + 0.5

    # Sample odds ratio (cross-product ratio) of the possibly corrected table
    try:
        odds_ratio = (a * d) / (b * c)
    except ZeroDivisionError:
        odds_ratio = np.inf if a * d > 0 else 0

    # Wald interval on the log-odds scale, mapped back with exp()
    try:
        log_or = np.log(odds_ratio)
        se_log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)
        z = stats.norm.ppf(1 - (1 - confidence_level) / 2)
        ci_low = np.exp(log_or - z * se_log_or)
        ci_high = np.exp(log_or + z * se_log_or)
    except (ZeroDivisionError, ValueError, FloatingPointError):
        ci_low, ci_high = np.nan, np.nan

    # A degenerate odds ratio carries no usable interval: report (0, inf)
    if odds_ratio == 0 or np.isinf(odds_ratio):
        ci_low, ci_high = 0, np.inf

    return odds_ratio, p_value, (ci_low, ci_high)


def _to_count(value):
    """Convert a raw CSV cell to a number; missing or non-numeric cells become 0."""
    number = pd.to_numeric(value, errors='coerce')
    # NaN is truthy, so `number or 0` would let it through; test it explicitly.
    return 0 if pd.isna(number) else number


def process_csv(file_path, output_path=None, alternative='greater'):
    """
    Processes a CSV file and applies Fisher's test on each row.

    Each row must provide the four cells of a 2x2 table in the columns
    "Both Smelly & Buggy", "Smelly But Not Buggy", "Buggy But Not Smelly"
    and "Neither". Missing or non-numeric cells are treated as 0. Rows whose
    first or second table row sums to 0 are kept, with empty test results.

    Args:
        file_path: Path to input CSV.
        output_path: Path to save output CSV (optional).
        alternative: Alternative hypothesis passed to the test
            ('greater', 'less' or 'two-sided').

    Returns:
        DataFrame with test results.
    """
    df = pd.read_csv(file_path)

    results = []

    for index, row in df.iterrows():
        project_name = row.get("Project Name", f"Project_{index + 1}")

        # Extract and safely convert contingency table values
        a = _to_count(row.get("Both Smelly & Buggy"))
        b = _to_count(row.get("Smelly But Not Buggy"))
        c = _to_count(row.get("Buggy But Not Smelly"))
        d = _to_count(row.get("Neither"))

        # The test is skipped when either table row is empty
        if (a + b == 0) or (c + d == 0):
            odds_ratio, p_value, ci = None, None, (None, None)
        else:
            odds_ratio, p_value, ci = fisher_test_with_adjustment(a, b, c, d, alternative=alternative)

        results.append([
            project_name, int(a), int(b), int(c), int(d),
            odds_ratio, p_value, ci[0], ci[1]
        ])

    result_df = pd.DataFrame(results, columns=[
        "Project", "a", "b", "c", "d",
        "Odds Ratio", "p-value", "CI Lower", "CI Upper"
    ])

    if output_path:
        result_df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")

    return result_df


# Example usage: run the per-project Fisher analysis (two-sided) and show the table
if __name__ == "__main__":
    csv_file = "buggy_smelly/revision/fisher/data_for_fault_analysis.csv"
    output_file = "buggy_smelly/revision/fisher/results_for_fault_analysis.csv"

    df_result = process_csv(
        file_path=csv_file,
        output_path=output_file,
        alternative='two-sided',
    )
    print(df_result)


In [None]:
#forest plot for fisher results
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

def create_grouped_forest_plot_two_colors(csv_file, title="Multi-language Design Smells: Fault-Proneness by Project"):
    """
    Create a grouped forest plot with ONLY red and gray markers.

    One y-axis row is drawn per system; every release of that system is a
    marker (odds ratio) with a horizontal confidence-interval line, vertically
    jittered so releases do not overlap. The jitter is random and unseeded, so
    marker heights differ between runs.

    Args:
        csv_file: CSV with a 'Project' column (text before the first '-' is
            taken as the system, the remainder as the release), an
            'Odds Ratio' column, and CI / p-value columns named either
            'ci_lower', 'ci_upper', 'p_value' or, as written by process_csv,
            'CI Lower', 'CI Upper', 'p-value'.
        title: Figure title.

    Returns:
        (fig, plot_df): the matplotlib figure and the rows that were plotted
        (rows missing either CI bound are dropped).
    """
    from matplotlib.lines import Line2D

    # Read and preprocess data
    df = pd.read_csv(csv_file)

    # Accept the header names produced by process_csv as aliases, without
    # overriding columns that already use the names read below.
    aliases = {'CI Lower': 'ci_lower', 'CI Upper': 'ci_upper', 'p-value': 'p_value'}
    df = df.rename(columns={
        source: target for source, target in aliases.items()
        if source in df.columns and target not in df.columns
    })

    # reindex guarantees a 'release' column even if no project name contains '-'
    name_parts = df['Project'].str.split('-', n=1, expand=True).reindex(columns=[0, 1])
    df['system'] = name_parts[0]
    df['release'] = name_parts[1]
    plot_df = df.dropna(subset=['ci_lower', 'ci_upper']).copy()

    # Create the plot
    fig, ax = plt.subplots(figsize=(14, 10))

    # Get unique systems and create positions
    systems = plot_df['system'].unique()
    y_positions = np.arange(len(systems))

    # Plot each system - ONLY TWO COLORS
    for i, system in enumerate(systems):
        system_data = plot_df[plot_df['system'] == system]

        # Spread releases of the same system vertically with small random jitter
        n_releases = len(system_data)
        if n_releases > 1:
            jitter = np.random.normal(0, 0.08, n_releases)
        else:
            jitter = [0]

        for j, (_, row) in enumerate(system_data.iterrows()):
            y = i + jitter[j]

            # Confidence interval as a thin horizontal line behind the marker
            ax.plot([row['ci_lower'], row['ci_upper']], [y, y],
                    color='gray', alpha=0.7, linewidth=1, zorder=1)

            # Significant only when p < 0.05 AND the whole CI lies above OR=1
            is_significant = (row['p_value'] < 0.05) and (row['ci_lower'] > 1)
            color = 'red' if is_significant else 'gray'  # ONLY RED AND GRAY
            marker = 'D' if is_significant else 'o'
            size = 120 if is_significant else 80

            ax.scatter(row['Odds Ratio'], y,
                       color=color, s=size, marker=marker,
                       edgecolor='black', alpha=0.9, zorder=2,
                       linewidth=1.5)

    # Customize plot with FIXED RANGE
    ax.set_yticks(y_positions)
    ax.set_yticklabels([f"{name} ({len(plot_df[plot_df['system'] == name])} releases)"
                        for name in systems], fontsize=11)
    ax.set_xlabel('Odds Ratio (log scale)', fontsize=13)
    ax.set_ylabel('Project', fontsize=13)
    ax.set_title(title, fontsize=16, pad=20)
    ax.set_xscale('log')

    # Fixed x-axis range 10^-2 to 10^2; anything outside it is clipped
    ax.set_xlim(0.01, 100)

    # Set custom x-ticks for better readability
    ax.set_xticks([0.01, 0.1, 1, 10, 100])
    ax.set_xticklabels(['0.01', '0.1', '1', '10', '100'])

    # Reference line
    ax.axvline(x=1, color='red', linestyle='--', alpha=0.8,
               label='No effect (OR=1)', linewidth=2.5)

    # Styling
    ax.grid(True, alpha=0.3, axis='x')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Hand-built legend so the entries match the two marker styles exactly
    legend_elements = [
        Line2D([0], [0], marker='o', color='w', markerfacecolor='gray',
               markersize=12, label='Not significant', markeredgecolor='black', markeredgewidth=1),
        Line2D([0], [0], marker='D', color='w', markerfacecolor='red',
               markersize=12, label='Significant (p < 0.05 & CI lower > 1)', markeredgecolor='black', markeredgewidth=1),
        Line2D([0], [0], color='red', linestyle='--', label='No effect (OR=1)', linewidth=2)
    ]
    ax.legend(handles=legend_elements, loc='upper left', fontsize=11)

    plt.tight_layout()
    return fig, plot_df

# Usage
# Build the plot from the combined Fisher results; `processed_df` receives the
# rows that were actually plotted (those with both CI bounds present).
fig, processed_df = create_grouped_forest_plot_two_colors("buggy_smelly/combined/fisher_results_fault_analysis.csv")
# Write the current figure to PDF before displaying it
plt.savefig('buggy_smelly/combined/forest_plot_fault_analysis.pdf', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
#Logistic regression (projectwise)
import pandas as pd
import statsmodels.api as sm

# Load the input table for this cell (the combined fault LR data, per the file
# name) and choose where the regression summary CSV will be written.
nativedata = pd.read_csv("buggy_smelly/combined/fault LR data combined.csv")
output_path = "buggy_smelly/combined/fault LR result combined.csv"

import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Work on a copy so the frame loaded above is left untouched
df = nativedata.copy()

# Predictors: 15 design-smell columns followed by three control metrics
features = [
    'ASM', 'EILC', 'HCL', 'LRA', 'MMM', 'NCOE', 'NHE', 'NSL', 'NURP', 'PEO',
    'TMC', 'TMS', 'UNMD', 'UNMI', 'UP',
    'loc', 'previous_fixes', 'code_churn',

    # Long-form column names that were previously listed here (kept for reference):
    #     'excessiveInterlangCommunication', 'Toomuchclustring', 'ToomuchScattering',
    #     'UnusedMethodDeclaration', 'UnusedMethodImplementation', 'UnusedParameter',
    #     'AssumingSafeReturnValue', 'ExcessiveObjects', 'NotHandlingExceptions',
    #     'NotCachingObjects', 'NotSecuringLibraries', 'HardCodingLibraries',
    #     'NotUsingRelativePath', 'MemoryManagementMismatch', 'LocalReferencesAbuse',
    #     'LOC', 'PrevFixing', 'CodeChurn'
]
target = 'Changed'

# Keep only rows where every predictor is present; y follows the same rows
X = df[features].dropna()
y = df.loc[X.index, target]

# Step 1: discard columns that take a single value (zero variance)
X = X[X.columns[X.nunique() > 1]]

# Step 2: for every pair with |r| > 0.95, drop the later column of the pair.
# Only the strict upper triangle is inspected so each pair is seen once.
corr_matrix = X.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_corr = upper_tri.columns[(upper_tri > 0.95).any()].tolist()
X = X.drop(columns=to_drop_corr)
print("Dropped (correlation):", to_drop_corr)

# Step 3: Drop high-VIF variables iteratively
def calculate_vif(X_):
    """Return a DataFrame ('feature', 'VIF') with one variance inflation factor
    per column of X_ after passing it through sm.add_constant."""
    design = sm.add_constant(X_)
    matrix = design.values
    vif_values = []
    for position in range(design.shape[1]):
        vif_values.append(variance_inflation_factor(matrix, position))
    return pd.DataFrame({"feature": design.columns, "VIF": vif_values})

# Remove the single worst feature per pass until every remaining VIF is <= 10
# (the intercept row is excluded from the comparison).
while True:
    vif_df = calculate_vif(X)
    vif_df = vif_df[vif_df['feature'] != 'const']
    max_vif = vif_df['VIF'].max()
    if max_vif > 10:
        to_drop_vif = vif_df.sort_values('VIF', ascending=False).iloc[0]['feature']
        print(f"Dropped (VIF): {to_drop_vif}, VIF={max_vif:.2f}")
        X = X.drop(columns=[to_drop_vif])
    else:
        break

# Step 4: Standardize features
scaler = StandardScaler()
# Keep X's row labels: y is still labelled with the post-dropna index, and
# statsmodels rejects endog/exog whose pandas indices differ.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_scaled = sm.add_constant(X_scaled)


# Step 5: Use GLM instead of Logit (more stable), with max iterations increased
model = sm.GLM(y, X_scaled, family=sm.families.Binomial())
result = model.fit(maxiter=100, disp=0)

# Step 6: Output results
print(result.summary())

# Step 7: Odds ratios with 95% CI (coefficients and bounds exponentiated)
conf = result.conf_int().rename(columns={0: 'CI Lower', 1: 'CI Upper'})
conf['OR Lower'] = np.exp(conf['CI Lower'])
conf['OR Upper'] = np.exp(conf['CI Upper'])

odds_ratios = np.exp(result.params)
odds_ratios_df = pd.DataFrame({
    'Odds Ratio': odds_ratios,
    'CI Lower': conf['OR Lower'],
    'CI Upper': conf['OR Upper'],
    'p-value': result.pvalues
})

print("\n=== Odds Ratios with 95% CI ===")
print(odds_ratios_df)

# Step 8: Build the export table on the params index so 'const' and every
# variable name stay aligned across columns
index = result.params.index

summary_df = pd.DataFrame(index=index)
summary_df['Coefficient'] = result.params
summary_df['Odds Ratio'] = odds_ratios
summary_df['p-value'] = result.pvalues
summary_df['95% CI'] = conf.apply(lambda row: f"({row['OR Lower']:.4f}, {row['OR Upper']:.4f})", axis=1)

# Step 9: Write to CSV
summary_df.index.name = 'Variable'

summary_df.to_csv(output_path)
print(f"\n✅ All results written to:\n{output_path}")


In [None]:
# LR for pooled analysis for change-proneness
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

def run_mixed_effects_analysis_change(nativedata, target='Changed'):
    """
    Fit a pooled binomial GLM of `target` on the design-smell columns.

    Despite the name this is not a mixed-effects model: project differences
    are absorbed by project dummy columns (fixed effects) and the standard
    errors are clustered by project.

    Args:
        nativedata: DataFrame holding the smell columns, a 'Project' column
            (the text before the first '-' is used as the project name) and
            the target column.
        target: Name of the binary outcome column.

    Returns:
        (result, X_fixed, project_names): the fitted GLM results, the design
        matrix that was used (with constant), and the project name of each
        row of that matrix.
    """
    smell_columns = [
        'ASM', 'EILC', 'HCL', 'LRA', 'MMM', 'NCOE', 'NHE', 'NSL', 'NURP',
        'PEO', 'TMC', 'TMS', 'UNMD', 'UNMI', 'UP'
    ]

    data = nativedata.copy()
    data['project_name'] = data['Project'].str.split('-').str[0]

    # Rows missing any smell value or the project name are discarded
    X = data[smell_columns + ['project_name']].dropna()
    y = data.loc[X.index, target]

    print(f"Total observations: {len(X)}")
    print(f"Projects: {X['project_name'].unique()}")
    print("Observations per project:")
    print(X['project_name'].value_counts())

    # statsmodels has no built-in mixed logit here, so projects enter as
    # dummy columns (first level dropped), cast from bool to int.
    dummies = pd.get_dummies(X['project_name'], prefix='project', drop_first=True).astype(int)
    design = pd.concat([X.drop(columns='project_name'), dummies], axis=1)

    # Force every column to numeric; rows that fail the conversion are dropped
    design = design.apply(pd.to_numeric, errors='coerce').dropna()
    y = y.loc[design.index]

    # No standardization: all predictors are smell columns, which the original
    # author treats as being on similar scales.
    design = sm.add_constant(design)

    print(f"Final X shape: {design.shape}")
    print(f"Final X dtypes: {design.dtypes.unique()}")

    # Cluster-robust covariance, one cluster per project
    clusters = X.loc[design.index, 'project_name']
    fitted = sm.GLM(y, design, family=sm.families.Binomial()).fit(
        cov_type='cluster', cov_kwds={'groups': clusters}, maxiter=100
    )

    return fitted, design, clusters

def save_results_to_csv(result, output_path):
    """
    Write the coefficient table of a fitted model to CSV and return it.

    Args:
        result: Fitted-model results object exposing ``params``, ``bse``,
            ``tvalues``, ``pvalues`` and ``conf_int()`` (columns 0 and 1 are
            read as the lower and upper bound).
        output_path: Destination CSV path.

    Returns:
        DataFrame with one row per model term: coefficient, standard error,
        z value, p-value, significance stars, odds ratio and its interval
        (exponentiated bounds, both numeric and as a formatted string), plus
        boolean flags for p < 0.05 and p < 0.01.
    """
    def stars_for(p_value):
        # Thresholds are checked from strictest to loosest; the first hit wins
        for cutoff, stars in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
            if p_value < cutoff:
                return stars
        return ''

    estimates = result.params
    p_values = result.pvalues.values
    bounds = result.conf_int()
    or_lower = np.exp(bounds[0].values)
    or_upper = np.exp(bounds[1].values)

    # Columns are declared directly in their final, reader-friendly order
    results_df = pd.DataFrame({
        'Variable': estimates.index,
        'Coefficient': estimates.values,
        'Std_Error': result.bse.values,
        'z_Value': result.tvalues.values,
        'P_Value': p_values,
        'Significance': [stars_for(p) for p in p_values],
        'Odds_Ratio': np.exp(estimates.values),
        '95%_CI': [f"[{low:.4f}, {high:.4f}]" for low, high in zip(or_lower, or_upper)],
        'CI_Lower': or_lower,
        'CI_Upper': or_upper,
        'Significant_0.05': p_values < 0.05,
        'Significant_0.01': p_values < 0.01,
    })

    results_df.to_csv(output_path, index=False)
    print(f"\n✅ Comprehensive results saved to: {output_path}")

    return results_df

# Fit the pooled change-proneness model on the combined dataset
nativedata = pd.read_csv("buggy_smelly/combined/change LR data combined.csv")
result_mixed, X_mixed, projects = run_mixed_effects_analysis_change(nativedata, target='Changed')

print("=== MIXED EFFECTS MODEL RESULTS FOR CHANGE-PRONENESS ===")
print("(With project clustering, design smells only - no additional controls)")
print(result_mixed.summary())

# Coefficients next to their odds-ratio form (bounds exponentiated)
coef_df = pd.DataFrame({
    'Coefficient': result_mixed.params,
    'Odds_Ratio': np.exp(result_mixed.params),
    'P_Value': result_mixed.pvalues,
    'CI_Lower': np.exp(result_mixed.conf_int()[0]),
    'CI_Upper': np.exp(result_mixed.conf_int()[1])
})

print("\n=== KEY DESIGN SMELL PREDICTORS (excluding project dummies) ===")
# Hide the intercept and the project dummy terms
key_predictors = coef_df.loc[~(coef_df.index.str.startswith('project_')
                               | coef_df.index.str.startswith('const'))]
print(key_predictors.sort_values(by='P_Value'))

# Full coefficient table, every model term included
output_path = "buggy_smelly/combined/combined_change_proneness_LR_mixed_effects_results.csv"
saved_results = save_results_to_csv(result_mixed, output_path)

# Reduced table: design smells only (no intercept, no project dummies)
print("\n=== SAVING SIMPLIFIED RESULTS (Design Smells Only) ===")
design_smells_results = saved_results[~(saved_results['Variable'].str.startswith('project_')
                                        | saved_results['Variable'].str.startswith('const'))]
design_smells_output_path = "buggy_smelly/combined/combined_change_proneness_design_smells_only.csv"
design_smells_results.to_csv(design_smells_output_path, index=False)
print(f"✅ Design smells results saved to: {design_smells_output_path}")

# Headline fit statistics and significance counts
print("\n=== MODEL SUMMARY FOR CHANGE-PRONENESS ===")
print(f"Number of observations: {result_mixed.nobs}")
print(f"Log-Likelihood: {result_mixed.llf:.2f}")
print(f"Pseudo R-squared: {result_mixed.pseudo_rsquared():.4f}")
print(f"Significant design smells (p < 0.05): {design_smells_results['Significant_0.05'].sum()}")
print(f"Significant design smells (p < 0.01): {design_smells_results['Significant_0.01'].sum()}")

# The ten smells with the smallest p-values
print("\n=== TOP 10 MOST SIGNIFICANT DESIGN SMELLS FOR CHANGE-PRONENESS ===")
top_smells = design_smells_results.nsmallest(10, 'P_Value').loc[
    :, ['Variable', 'Coefficient', 'Odds_Ratio', 'P_Value', 'Significance']
]
print(top_smells.to_string(index=False))

# Additional: Save model summary as text
def save_model_summary(result, output_path):
    """
    Write ``result.summary().as_text()`` to `output_path` and report the path.

    Args:
        result: Object whose ``summary()`` returns something with ``as_text()``.
        output_path: Text file to create or overwrite.
    """
    with open(output_path, 'w') as handle:
        handle.write(result.summary().as_text())

    print(f"✅ Model summary saved to: {output_path}")

# Save raw model summary
# Dumps the plain-text summary of the change-proneness fit (`result_mixed`)
# next to the CSV outputs written earlier in this cell.
summary_output_path = "buggy_smelly/combined/change_proneness_model_summary.txt"
save_model_summary(result_mixed, summary_output_path)

In [None]:
# LR for pooled analysis with control variables for fault-analysis
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

def run_mixed_effects_analysis_fixed(nativedata, target='Changed'):
    """
    Fit a pooled binomial GLM of `target` on design smells plus control metrics.

    Despite the name this is not a mixed-effects model: project differences
    are absorbed by project dummy columns (fixed effects) and the standard
    errors are clustered by project. Only the three control metrics
    ('loc', 'previous_fixes', 'code_churn') are standardized.

    Args:
        nativedata: DataFrame holding the predictor columns, a 'Project'
            column (the text before the first '-' is used as the project
            name) and the target column.
        target: Name of the binary outcome column.

    Returns:
        (result, X_fixed, project_names): the fitted GLM results, the design
        matrix that was used (with constant), and the project name of each
        row of that matrix.
    """
    controls = ['loc', 'previous_fixes', 'code_churn']
    predictors = [
        'ASM', 'EILC', 'HCL', 'LRA', 'MMM', 'NCOE', 'NHE', 'NSL', 'NURP',
        'PEO', 'TMC', 'TMS', 'UNMD', 'UNMI', 'UP', 'loc', 'previous_fixes', 'code_churn'
    ]

    data = nativedata.copy()
    data['project_name'] = data['Project'].str.split('-').str[0]

    # Rows missing any predictor or the project name are discarded
    X = data[predictors + ['project_name']].dropna()
    y = data.loc[X.index, target]

    print(f"Total observations: {len(X)}")
    print(f"Projects: {X['project_name'].unique()}")

    # statsmodels has no built-in mixed logit here, so projects enter as
    # dummy columns (first level dropped), cast from bool to int.
    dummies = pd.get_dummies(X['project_name'], prefix='project', drop_first=True).astype(int)
    design = pd.concat([X.drop(columns='project_name'), dummies], axis=1)

    # Force every column to numeric; rows that fail the conversion are dropped
    design = design.apply(pd.to_numeric, errors='coerce').dropna()
    y = y.loc[design.index]

    # Standardize the control metrics only; smells and dummies stay as they are
    design[controls] = StandardScaler().fit_transform(design[controls])

    design = sm.add_constant(design)

    print(f"Final X shape: {design.shape}")
    print(f"Final X dtypes: {design.dtypes.unique()}")

    # Cluster-robust covariance, one cluster per project
    clusters = X.loc[design.index, 'project_name']
    fitted = sm.GLM(y, design, family=sm.families.Binomial()).fit(
        cov_type='cluster', cov_kwds={'groups': clusters}, maxiter=100
    )

    return fitted, design, clusters

def save_results_to_csv(result, output_path):
    """
    Write the coefficient table of a fitted model to CSV and return it.

    Args:
        result: Fitted-model results object exposing ``params``, ``bse``,
            ``tvalues``, ``pvalues`` and ``conf_int()`` (columns 0 and 1 are
            read as the lower and upper bound).
        output_path: Destination CSV path.

    Returns:
        DataFrame with one row per model term: coefficient, standard error,
        z value, p-value, significance stars, odds ratio and its interval
        (exponentiated bounds, both numeric and as a formatted string), plus
        boolean flags for p < 0.05 and p < 0.01.
    """
    def stars_for(p_value):
        # Thresholds are checked from strictest to loosest; the first hit wins
        for cutoff, stars in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
            if p_value < cutoff:
                return stars
        return ''

    estimates = result.params
    p_values = result.pvalues.values
    bounds = result.conf_int()
    or_lower = np.exp(bounds[0].values)
    or_upper = np.exp(bounds[1].values)

    # Columns are declared directly in their final, reader-friendly order
    results_df = pd.DataFrame({
        'Variable': estimates.index,
        'Coefficient': estimates.values,
        'Std_Error': result.bse.values,
        'z_Value': result.tvalues.values,
        'P_Value': p_values,
        'Significance': [stars_for(p) for p in p_values],
        'Odds_Ratio': np.exp(estimates.values),
        '95%_CI': [f"[{low:.4f}, {high:.4f}]" for low, high in zip(or_lower, or_upper)],
        'CI_Lower': or_lower,
        'CI_Upper': or_upper,
        'Significant_0.05': p_values < 0.05,
        'Significant_0.01': p_values < 0.01,
    })

    results_df.to_csv(output_path, index=False)
    print(f"\n✅ Comprehensive results saved to: {output_path}")

    return results_df

# Fit the pooled fault model (design smells + control metrics) on the combined dataset
nativedata = pd.read_csv("buggy_smelly/combined/fault LR data combined.csv")
result_mixed, X_mixed, projects = run_mixed_effects_analysis_fixed(nativedata, target='Changed')

print("=== MIXED EFFECTS MODEL RESULTS (with project clustering) ===")
print(result_mixed.summary())

# Coefficients next to their odds-ratio form (bounds exponentiated)
coef_df = pd.DataFrame({
    'Coefficient': result_mixed.params,
    'Odds_Ratio': np.exp(result_mixed.params),
    'P_Value': result_mixed.pvalues,
    'CI_Lower': np.exp(result_mixed.conf_int()[0]),
    'CI_Upper': np.exp(result_mixed.conf_int()[1])
})

print("\n=== KEY PREDICTORS (excluding project dummies) ===")
key_predictors = coef_df.loc[~coef_df.index.str.startswith('project_') &
                            ~coef_df.index.str.startswith('const')]
print(key_predictors.sort_values('P_Value'))

# Save results to CSV (every model term, including intercept and project dummies)
output_path = "buggy_smelly/combined/combined_fault_LR_mixed_effects_results.csv"
saved_results = save_results_to_csv(result_mixed, output_path)

# Also save a simplified version with only key predictors (excluding project dummies)
print("\n=== SAVING SIMPLIFIED RESULTS (Key Predictors Only) ===")
key_results = saved_results[~saved_results['Variable'].str.startswith('project_') &
                           ~saved_results['Variable'].str.startswith('const')]
key_output_path = "buggy_smelly/combined/combined_fault_LR_key_predictors.csv"
key_results.to_csv(key_output_path, index=False)
print(f"✅ Key predictors saved to: {key_output_path}")

# Print summary statistics.
# The significance counts use key_results so that the intercept and the
# project dummies are not reported as "predictors"; this matches the filtered
# frame used by the change-proneness cell of this notebook.
print("\n=== MODEL SUMMARY ===")
print(f"Number of observations: {result_mixed.nobs}")
print(f"Log-Likelihood: {result_mixed.llf:.2f}")
print(f"Pseudo R-squared: {result_mixed.pseudo_rsquared():.4f}")
print(f"Significant predictors (p < 0.05): {sum(key_results['Significant_0.05'])}")
print(f"Significant predictors (p < 0.01): {sum(key_results['Significant_0.01'])}")

# Display the most significant results (again without intercept / project dummies)
print("\n=== TOP 10 MOST SIGNIFICANT PREDICTORS ===")
top_predictors = key_results.nsmallest(10, 'P_Value')[['Variable', 'Coefficient', 'Odds_Ratio', 'P_Value', 'Significance']]
print(top_predictors.to_string(index=False))