In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

In [2]:
# --- MongoDB configuration ----------------------------------------------------
# Server address and the database this notebook reads from.
mongo_uri = "mongodb://localhost:27017/"
database_name = "research"

# Names of the three collections this notebook opens handles for.
# NOTE: the "perfromance" spelling of the first variable is kept unchanged so
# that anything referring to it by name keeps working.
perfromance_issues_collection_name = "performance-issues"
performance_refactorings_collection_name = "performance-refactorings"
non_performance_refactorings_collection_name = "non-performance-refactorings"

# --- Connection and collection handles ----------------------------------------
client = MongoClient(mongo_uri)
db = client.get_database(database_name)

performance_issues_collection = db.get_collection(perfromance_issues_collection_name)
performance_refactorings_collection = db.get_collection(
    performance_refactorings_collection_name
)
non_performance_refactorings_collection = db.get_collection(
    non_performance_refactorings_collection_name
)

In [3]:
# Load the refactoring type of every document in both collections.
#
# The same filter and projection are used for both groups so that they are
# selected by identical criteria: only documents whose "type" is present and
# not null are fetched, and only the "type" field (plus MongoDB's default
# "_id") is returned. The frequency counts computed from the "type" column
# are unaffected for the performance group, because value_counts() ignores
# missing values anyway.
typed_documents_filter = {"type": {"$ne": None}}
type_only_projection = {"type": 1}

performance_refactorings = pd.DataFrame(
    list(
        performance_refactorings_collection.find(
            typed_documents_filter, type_only_projection
        )
    )
)
non_performance_refactorings = pd.DataFrame(
    list(
        non_performance_refactorings_collection.find(
            typed_documents_filter, type_only_projection
        )
    )
)

In [4]:
# Count how often each refactoring type occurs in each of the two groups
# (value_counts orders the result from most to least frequent).
performance_refactoring_counts = performance_refactorings["type"].value_counts()
non_performance_refactoring_counts = non_performance_refactorings["type"].value_counts()

# Print both frequency tables, each prefixed by its label
for label, counts in (
    ("Performance Refactoring Types:", performance_refactoring_counts),
    ("Non-Performance Refactoring Types:", non_performance_refactoring_counts),
):
    print(label, counts)

Performance Refactoring Types: Change Variable Type          1807
Add Parameter                 1215
Move Class                     929
Change Parameter Type          880
Rename Method                  875
                              ... 
Merge Catch                      2
Merge Class                      2
Replace Attribute                2
Modify Variable Annotation       1
Parameterize Test                1
Name: type, Length: 96, dtype: int64
Non-Performance Refactoring Types: Change Variable Type          654350
Add Method Annotation         496478
Change Parameter Type         496108
Change Return Type            400498
Add Parameter                 382370
                               ...  
Replace Attribute                571
Split Variable                   550
Merge Method                     506
Parameterize Test                 99
Modify Variable Annotation        95
Name: type, Length: 102, dtype: int64


In [5]:
# Turn each frequency Series into a two-column DataFrame:
# the refactoring type and its count within that group.
performance_df = performance_refactoring_counts.rename_axis(
    "Refactoring_Type"
).reset_index(name="Performance_Freq")

non_performance_df = non_performance_refactoring_counts.rename_axis(
    "Refactoring_Type"
).reset_index(name="Non_Performance_Freq")

# Outer-join on the refactoring type so that a type seen in only one group is
# kept; its missing count in the other group becomes 0 and both count columns
# are cast back to integers (the NaNs introduced by the join make them floats).
merged_df = (
    performance_df.merge(non_performance_df, on="Refactoring_Type", how="outer")
    .fillna(0)
    .astype({"Performance_Freq": int, "Non_Performance_Freq": int})
)

# Show the combined frequency table
print(merged_df)

          Refactoring_Type  Performance_Freq  Non_Performance_Freq
0     Change Variable Type              1807                654350
1            Add Parameter              1215                382370
2               Move Class               929                320860
3    Change Parameter Type               880                496108
4            Rename Method               875                366831
..                     ...               ...                   ...
97      Try With Resources                 0                  8375
98            Move Package                 0                  5634
99          Rename Package                 0                  5013
100          Split Package                 0                  1493
101          Merge Package                 0                  1182

[102 rows x 3 columns]


In [6]:
# # Step 2 (disabled): Perform Chi-square Test of Independence and calculate Cramér's V
# # We'll compare the observed frequencies for each refactoring type between performance and non-performance refactorings.

# # Initialize an empty list to store chi-square test results
# chi2_results = []
# cramers_v_results = []

# for i, row in merged_df.iterrows():
#     # Create the contingency table
#     contingency_table = [
#         [row["Performance_Freq"], row["Non_Performance_Freq"]],
#         [
#             merged_df["Performance_Freq"].sum() - row["Performance_Freq"],
#             merged_df["Non_Performance_Freq"].sum() - row["Non_Performance_Freq"],
#         ],
#     ]

#     # Perform the chi-square test
#     chi2, p, _, _ = chi2_contingency(contingency_table)
#     chi2_results.append((row["Refactoring_Type"], chi2, p))

#     # Calculate Cramér's V
#     n = np.sum(contingency_table)  # Total number of observations
#     min_dim = min(
#         len(contingency_table) - 1, len(contingency_table[0]) - 1
#     )  # min(k-1, r-1)
#     cramers_v = np.sqrt(chi2 / (n * min_dim))
#     cramers_v_results.append((row["Refactoring_Type"], cramers_v))

# # Convert the results to DataFrames
# chi2_df = pd.DataFrame(chi2_results, columns=["Refactoring_Type", "Chi2", "p-value"])
# chi2_df["p-value"] = chi2_df["p-value"].apply(
#     lambda x: "<0.001" if x < 0.001 else f"{x:.3f}"
# )


# cramers_v_df = pd.DataFrame(
#     cramers_v_results, columns=["Refactoring_Type", "Cramers_V"]
# )
# cramers_v_df["Cramers_V"] = cramers_v_df["Cramers_V"].apply(
#     lambda x: "<0.001" if x < 0.001 else f"{x:.3f}"
# )

# # Merge the Chi-Square and Cramér's V results
# results_df = pd.merge(chi2_df, cramers_v_df, on="Refactoring_Type")


# # Display the resulting DataFrame
# print(results_df)

In [7]:
# Step 2: Per-type chi-square test of independence and odds ratio
#
# For every refactoring type a 2x2 contingency table is built:
#
#                      performance            non-performance
#     this type        a                      b
#     all other types  total_perf - a         total_non_perf - b
#
# chi2_contingency then tests whether "being this type" is independent of the
# group. This is a test of independence on a contingency table, not a
# goodness-of-fit test. For 2x2 tables scipy applies Yates' continuity
# correction by default.
#
# NOTE(review): one test is run per refactoring type and the p-values are
# reported unadjusted; consider a multiple-comparison correction when
# interpreting them.

# The group totals are identical for every row, so compute them once.
total_performance = int(merged_df["Performance_Freq"].sum())
total_non_performance = int(merged_df["Non_Performance_Freq"].sum())

# Accumulators: (type, chi2, p) and (type, odds ratio), one entry per type
chi2_results = []
odds_ratio_results = []

for refactoring_type, perf_count, non_perf_count in zip(
    merged_df["Refactoring_Type"],
    merged_df["Performance_Freq"],
    merged_df["Non_Performance_Freq"],
):
    # Plain Python ints keep the cross-products below exact.
    perf_count = int(perf_count)
    non_perf_count = int(non_perf_count)

    # Create the contingency table
    contingency_table = [
        [perf_count, non_perf_count],
        [total_performance - perf_count, total_non_performance - non_perf_count],
    ]

    # Perform the chi-square test
    chi2, p, _, _ = chi2_contingency(contingency_table)
    chi2_results.append((refactoring_type, chi2, p))

    # Odds ratio = (a * d) / (b * c). A zero denominator (the type is absent
    # from the non-performance group, or it is the only type in the
    # performance group) is handled explicitly instead of dividing by zero:
    # the ratio is +inf when the numerator is positive and undefined (NaN)
    # when it is 0/0.
    numerator = contingency_table[0][0] * contingency_table[1][1]
    denominator = contingency_table[0][1] * contingency_table[1][0]
    if denominator == 0:
        odds_ratio = np.inf if numerator > 0 else np.nan
    else:
        odds_ratio = numerator / denominator
    odds_ratio_results.append((refactoring_type, odds_ratio))

# Convert the results to DataFrames.
# p-values are turned into display strings: "<0.001" or three decimals.
chi2_df = pd.DataFrame(chi2_results, columns=["Refactoring_Type", "Chi2", "p-value"])
chi2_df["p-value"] = chi2_df["p-value"].apply(
    lambda x: "<0.001" if x < 0.001 else f"{x:.3f}"
)

# Odds ratios are likewise stored as strings with three decimals.
odds_ratio_df = pd.DataFrame(
    odds_ratio_results, columns=["Refactoring_Type", "Odds_Ratio"]
)
odds_ratio_df["Odds_Ratio"] = odds_ratio_df["Odds_Ratio"].apply(lambda x: f"{x:.3f}")

# Merge the Chi-Square and Odds Ratio results
results_df = pd.merge(chi2_df, odds_ratio_df, on="Refactoring_Type")

# Display the resulting DataFrame
print(results_df)

          Refactoring_Type        Chi2 p-value Odds_Ratio
0     Change Variable Type   44.504755  <0.001      1.179
1            Add Parameter  108.162448  <0.001      1.360
2               Move Class   37.787433  <0.001      1.230
3    Change Parameter Type   79.800730  <0.001      0.735
4            Rename Method    0.014117   0.905      1.005
..                     ...         ...     ...        ...
97      Try With Resources   18.916400  <0.001      0.000
98            Move Package   12.403166  <0.001      0.000
99          Rename Package   10.929063  <0.001      0.000
100          Split Package    2.614185   0.106      0.000
101          Merge Package    1.894134   0.169      0.000

[102 rows x 4 columns]


In [8]:
# Step 3: Express each type's count as a percentage of its group's total
for freq_column, percentage_column in (
    ("Performance_Freq", "Performance_Percentage"),
    ("Non_Performance_Freq", "Non_Performance_Percentage"),
):
    merged_df[percentage_column] = (
        merged_df[freq_column] / merged_df[freq_column].sum() * 100
    )

merged_df.head()

Unnamed: 0,Refactoring_Type,Performance_Freq,Non_Performance_Freq,Performance_Percentage,Non_Performance_Percentage
0,Change Variable Type,1807,654350,9.197801,7.908995
1,Add Parameter,1215,382370,6.184465,4.621628
2,Move Class,929,320860,4.728698,3.878169
3,Change Parameter Type,880,496108,4.479283,5.996356
4,Rename Method,875,366831,4.453833,4.433812


In [11]:
# Step 4: Cohen's d between the two per-type frequency columns
# NOTE(review): this is computed on raw counts, so the difference in means
# largely reflects how many refactorings each group contains overall --
# confirm that this is the intended effect size.
performance_freq = merged_df["Performance_Freq"]
non_performance_freq = merged_df["Non_Performance_Freq"]

mean_performance = performance_freq.mean()
mean_non_performance = non_performance_freq.mean()

# Pooled standard deviation: square root of the mean of the two sample variances
variance_sum = performance_freq.std() ** 2 + non_performance_freq.std() ** 2
pooled_std = np.sqrt(variance_sum / 2)

cohen_d = (mean_performance - mean_non_performance) / pooled_std

cohen_d, mean_performance, mean_non_performance, pooled_std

(-0.9293309905622881, 192.6078431372549, 81112.6568627451, 87073.44298358918)

In [12]:
# Attach the chi-square columns, then the odds-ratio column, to the frequency
# table, joining on 'Refactoring_Type' each time
merged_chi2_df = merged_df.merge(chi2_df, on="Refactoring_Type")
merged_results_df = merged_chi2_df.merge(odds_ratio_df, on="Refactoring_Type")

# Write the combined table to CSV without the row index
merged_results_df.to_csv("../data/rq3_statistical_analysis_results.csv", index=False)

# Preview the first rows of the combined table
print(merged_results_df.head())

        Refactoring_Type  Performance_Freq  Non_Performance_Freq  \
0   Change Variable Type              1807                654350   
1          Add Parameter              1215                382370   
2             Move Class               929                320860   
3  Change Parameter Type               880                496108   
4          Rename Method               875                366831   

   Performance_Percentage  Non_Performance_Percentage        Chi2 p-value  \
0                9.197801                    7.908995   44.504755  <0.001   
1                6.184465                    4.621628  108.162448  <0.001   
2                4.728698                    3.878169   37.787433  <0.001   
3                4.479283                    5.996356   79.800730  <0.001   
4                4.453833                    4.433812    0.014117   0.905   

  Odds_Ratio  
0      1.179  
1      1.360  
2      1.230  
3      0.735  
4      1.005  


In [13]:
# # merge merged_df and chi2_df on 'Refactoring_Type'
# merged_chi2_df = pd.merge(merged_df, chi2_df, on="Refactoring_Type")
# merged_results_df = pd.merge(merged_chi2_df, cramers_v_df, on="Refactoring_Type")
# merged_results_df.to_csv("../data/rq3_statistical_analysis_results.csv")

# merged_results_df.head()