In [22]:
import pymongo
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [23]:
# Connect to the local MongoDB server and grab the two collections used below.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.get_database("final-first-research")
collection = db.get_collection("performance-refactorings")
issues_collection = db.get_collection("performance-issues")

# Values of the `type` field that are sampled, in sampling order.
types_to_include = [
    "Change Variable Type",
    "Add Parameter",
    "Move Class",
    "Change Parameter Type",
    "Rename Method",
    "Add Method Annotation",
    "Change Return Type",
    "Rename Variable",
    "Change Attribute Type",
    "Extract Method",
]

In [24]:
# Load the commit ids already recorded in the CSV so they are not sampled again.
csv_file = "../../data/refactoring_reasons.csv"
try:
    csv_data = pd.read_csv(csv_file, encoding="latin1")
    # Drop blanks and duplicates so the $nin list only carries real commit ids.
    omit_commit_ids = csv_data["commit_id"].dropna().unique().tolist()
except (FileNotFoundError, pd.errors.EmptyDataError):
    # First run (or an empty file): there is nothing to exclude yet.
    csv_data = pd.DataFrame()
    omit_commit_ids = []

# Upper bound on the number of documents drawn per type.
SAMPLES_PER_TYPE = 5

# Draw up to SAMPLES_PER_TYPE random documents for each type, skipping the
# "bsl-language-server" repository and every commit already present in the CSV.
sampled_docs = []
for doc_type in types_to_include:
    pipeline = [
        {
            "$match": {
                "repo_name": {"$ne": "bsl-language-server"},
                "type": doc_type,
                # The exclusion list holds commit ids, so it has to be compared
                # with the `commit_id` field; comparing it with `_id` never
                # matches and silently disables the exclusion.
                "commit_id": {"$nin": omit_commit_ids},
            }
        },
        {"$sample": {"size": SAMPLES_PER_TYPE}},
    ]
    sampled_docs.extend(collection.aggregate(pipeline))

In [25]:
# Function to flatten the data
def flatten_doc(doc, issue_data):
    """Flatten a sampled document and its issue lookup into one flat record.

    Args:
        doc: Mapping that must contain ``_id``, ``commit_id``, ``repo_name``,
            ``type`` and ``issue_title``. ``rightSideLocations`` is optional;
            when present and non-empty only its first entry is used.
        issue_data: Mapping with optional ``issue_url`` and ``repo_url`` keys,
            or a falsy value (``None``/empty) when no issue was found.

    Returns:
        dict: the five base fields; plus ``issue_url``, ``repo_url`` and
        ``commit_url`` when ``issue_data`` is truthy (``repo_url`` and
        ``commit_url`` are ``""`` if the issue has no repo URL); plus
        ``filePath`` formatted as ``"<filePath>/<startLine>-<endLine>"`` when
        the first location carries a ``filePath``.

    Raises:
        KeyError: if a base field is missing from ``doc``, or if the first
            location has ``filePath`` but lacks ``startLine``/``endLine``.
    """
    base_fields = ("_id", "commit_id", "repo_name", "type", "issue_title")
    flattened = {field: doc[field] for field in base_fields}

    # Issue-derived columns are only emitted when an issue was actually found.
    if issue_data:
        repo_url = issue_data.get("repo_url")
        flattened["issue_url"] = issue_data.get("issue_url")
        flattened["repo_url"] = repo_url if repo_url else ""
        flattened["commit_url"] = f"{repo_url}/commit/{doc['commit_id']}" if repo_url else ""

    # A missing `rightSideLocations` key is treated like an empty list.
    locations = doc.get("rightSideLocations")
    if locations:
        first_right_side = locations[0]
        if "filePath" in first_right_side:
            flattened["filePath"] = (
                f"{first_right_side['filePath']}/"
                f"{first_right_side['startLine']}-{first_right_side['endLine']}"
            )

    return flattened

In [26]:
# Look up the issue behind every sampled document and flatten both into one record.
flattened_docs = []
for doc in sampled_docs:
    # The issue is matched on number, title and repository together.
    issue_query = {
        "issue_number": doc["issue_number"],
        "issue_title": doc["issue_title"],
        "repo_fullname": doc["repo_fullname"],
    }
    # Only the two URL fields are projected (plus `_id`, which MongoDB returns by default).
    issue_data = issues_collection.find_one(issue_query, {"issue_url": 1, "repo_url": 1})
    flattened_docs.append(flatten_doc(doc, issue_data))

# Convert the list of flattened documents to a pandas DataFrame
new_data_df = pd.DataFrame(flattened_docs)

# Append the newly sampled rows to the rows loaded from the CSV
updated_csv_data = pd.concat([csv_data, new_data_df], ignore_index=True)

# Overwrite the source CSV in place. The file is read with encoding="latin1",
# so it must be written with the same encoding: writing pandas' default UTF-8
# would make every non-ASCII character come back as mojibake on the next run
# (and get worse on each read/write cycle). Characters that latin1 cannot
# represent are backslash-escaped rather than aborting the save.
updated_csv_data.to_csv(csv_file, index=False, encoding="latin1", errors="backslashreplace")

print(updated_csv_data.head())

                        _id                                 commit_id  \
0  665189cd4eb27c185fe55b7b  7b9efc2ee537d4417c9a59493ec79b3a66169cf1   
1  667715563cbfb03cd0957472  9c1b6ad8e5587391bb090cf33f3b3e8cdd55b878   
2  664e3d1b7b998c13cdb9ecbc  3a1260954b51e976834306c1b4163c527c88f877   
3  664e3d1f7b998c13cdb9ecd0  1e19dada64ee3dde665473ac8a4b33a9e1f8efb8   
4  664f70527b998c13cdbb7cad  176a3d215cd8adb855fb371e03e7d9aeee5118e2   

      repo_name                   type  \
0        Payara  Add Method Annotation   
1  vertx-ignite  Change Attribute Type   
2       quarkus     Change Return Type   
3       quarkus  Add Method Annotation   
4      querydsl  Change Attribute Type   

                                   issue_title  \
0      Upgrade to Hazelcast 3.5 for JCache fix   
1                   Ignite cache cannot expire   
2       Integrate Redis with the Quarkus Cache   
3       Integrate Redis with the Quarkus Cache   
4  querydsl-sql : Joda time should be optional   

       