In [99]:
from pymongo import MongoClient
import pandas as pd
from scipy.stats import chi2_contingency
import numpy as np

In [100]:
# MongoDB connection details
mongo_uri = "mongodb://localhost:27017/"
database_name = "final-first-research"
perfromance_issues_collection_name = "performance-issues"
performance_refactorings_collection_name = "performance-refactorings"
non_performance_refactorings_collection_name = "non-performance-refactorings"

# Connect to MongoDB
client = MongoClient(mongo_uri)
db = client[database_name]
performance_issues_collection = db[perfromance_issues_collection_name]
performance_refactorings_collection = db[performance_refactorings_collection_name]
non_performance_refactorings_collection = db[non_performance_refactorings_collection_name]

In [101]:
# Fetch performance refactorings
performance_refactorings = list(
    performance_refactorings_collection.find(
        {}, {"repo_fullname": 1, "commit_id": 1, "issue_number": 1, "pr_number": 1}
    )
)

# Extract relevant details from performance refactorings
performance_refactorings_commit_ids = {ref["commit_id"] for ref in performance_refactorings}


# Print the total number of performance refactoring commit IDs
print(
    f"Total number of performance refactoring commit IDs: {len(performance_refactorings_commit_ids)}"
)

# Fetch issues related to performance refactorings by matching repository full names, issue numbers, and PR numbers
query = {
    "commit_ids": {"$in": list(performance_refactorings_commit_ids)},
}
issues_related_to_performance_refactorings = list(
    performance_issues_collection.find(query)
)

# Print the total number of issues related to performance refactorings
print(
    f"Total number of issues related to performance refactorings: {len(issues_related_to_performance_refactorings)}"
)

# Extract commit IDs for each issue related to performance refactorings
commit_ids = []
for issue in issues_related_to_performance_refactorings:
    issue_commit_ids = issue.get("commit_ids", [])
    commit_ids.extend(issue_commit_ids)

commit_ids_set = set(commit_ids)

# Print the total number of commit IDs related to performance refactorings
print(
    f"Total number of commit IDs related to performance refactorings: {len(commit_ids_set)}"
)

# Count commit IDs found and not found in performance refactorings
in_perf_ref = len(commit_ids_set & performance_refactorings_commit_ids)
not_in_perf_ref = len(commit_ids_set - performance_refactorings_commit_ids)

print(f"Number of commit IDs with performance refactorings: {in_perf_ref}")
print(f"Number of commit IDs without performance refactorings: {not_in_perf_ref}")

# Additional debugging and validation steps

# Check for issues with empty or null commit_ids
empty_commit_ids_issues = [
    issue
    for issue in issues_related_to_performance_refactorings
    if not issue.get("commit_ids")
]
print(f"Number of issues with empty or null commit_ids: {len(empty_commit_ids_issues)}")



Total number of performance refactoring commit IDs: 2316
Total number of issues related to performance refactorings: 940
Total number of commit IDs related to performance refactorings: 5704
Number of commit IDs with performance refactorings: 2316
Number of commit IDs without performance refactorings: 3388
Number of issues with empty or null commit_ids: 0


In [102]:
# get non performance refactorings without type
non_performance_refactorings = list(non_performance_refactorings_collection.find({}, {'type': 1}))
df = pd.DataFrame(non_performance_refactorings)

In [103]:
column_name = "type"
total_count = df.shape[0]

commit_without_refactorings = df[df[column_name].isnull()].shape[0]
commit_with_refactorings = df[df[column_name].notnull()].shape[0]

print(
    f"Total number of non-performance refactorings: {total_count}"
)

print(
    f"Number of commits with non-performance refactorings: {commit_with_refactorings}"
)
print(
    f"Number of commits without non-performance refactorings: {commit_without_refactorings}"
)

Total number of non-performance refactorings: 9860235
Number of commits with non-performance refactorings: 8273491
Number of commits without non-performance refactorings: 1586744


In [110]:
data = {
    "Performance_related": [2316, 3388],  # [Refactoring Present, Refactoring Absent]
    "Non_performance_related": [8273491, 1586744],  # [Refactoring Present, Refactoring Absent]
}

# Create a DataFrame
df = pd.DataFrame(data, index=["Refactoring_Present", "Refactoring_Absent"])
df["Total"] = df.sum(axis=1)
df.loc["Total"] = df.sum()
df

print(df)

chi2, p, dof, expected = chi2_contingency(df.iloc[0:2, 0:2])

# Calculate the odds ratio
odds_ratio = (2316 / 3388) / (8273491 / 1586744)

log_p_value = (
    -np.log10(p) 
)  # Check for zero p-value


chi2, p, dof, expected, odds_ratio, log_p_value

                     Performance_related  Non_performance_related    Total
Refactoring_Present                 2316                  8273491  8275807
Refactoring_Absent                  3388                  1586744  1590132
Total                               5704                  9860235  9865939


  -np.log10(p)


(7904.110258919567,
 0.0,
 1,
 array([[4.78466400e+03, 8.27102234e+06],
        [9.19336003e+02, 1.58921266e+06]]),
 0.13110317801762714,
 inf)