In [1]:
from pymongo import MongoClient
import pandas as pd

In [2]:
# --- Settings -----------------------------------------------------------
# URI of the MongoDB server and the database this notebook reads from.
mongo_uri = "mongodb://localhost:27017/"
database_name = "final-first-research"

# Names of the three collections that are loaded below.
initial_projects_collection_name = "initial-projects"
issues_related_projects_collection_name = "issues-related-projects"
performance_refactorings_collection_name = "performance-refactorings"

# --- Connection ---------------------------------------------------------
# Open a client and select the database by name.
client = MongoClient(mongo_uri)
db = client[database_name]

# --- Load every document of each collection into a Python list ----------
# Each collection handle is looked up right before it is queried; the three
# queries still run in the order: initial projects, issues-related projects,
# performance refactorings.
initial_projects_collection = db[initial_projects_collection_name]
initial_projects = list(initial_projects_collection.find())

issues_related_projects_collection = db[issues_related_projects_collection_name]
issues_related_projects = list(issues_related_projects_collection.find())

# NOTE(review): the handle name below is spelled "peformance" (missing "r").
# It is left unchanged here so that any other cell referring to it keeps working.
peformance_refactorings_collection = db[performance_refactorings_collection_name]
performance_refactorings = list(peformance_refactorings_collection.find({}))



In [3]:
# Wrap each list of fetched documents in its own DataFrame.
# The three frames are built in one pass and unpacked in the same order as
# the source lists are listed.
initial_projects_df, issues_related_projects_df, performance_refactorings_df = (
    pd.DataFrame(documents)
    for documents in (
        initial_projects,
        issues_related_projects,
        performance_refactorings,
    )
)


In [4]:
# Descriptive statistics (the output of Series.describe()) for four columns
# of the initial-projects frame. Each resulting Series is renamed to the
# human-readable label paired with its column below.
#
#   stargazers_count    -> "Stars"
#   closed_issues_count -> "Closed Issues"
#   open_issues_count   -> "Open Issues"
#   size                -> "Sizes"
stars_stats, closed_issues_stats, open_issues_stats, sizes_stats = (
    initial_projects_df[column].describe().rename(label)
    for column, label in (
        ("stargazers_count", "Stars"),
        ("closed_issues_count", "Closed Issues"),
        ("open_issues_count", "Open Issues"),
        ("size", "Sizes"),
    )
)

In [8]:
# Assemble the four describe() results side by side, one column per metric,
# then replace the "Sizes" column with its values rendered as strings with
# five digits after the decimal point. The other columns are left as they are.
combined_stats_df = pd.DataFrame(
    {
        "Stars": stars_stats,
        "Closed Issues": closed_issues_stats,
        "Open Issues": open_issues_stats,
        "Sizes": sizes_stats,
    }
).assign(Sizes=lambda frame: frame["Sizes"].map("{:.5f}".format))

# Write the table to disk; the index (the statistic names) is kept as the
# first CSV column.
combined_stats_df.to_csv("../data/project_statistics.csv", index=True)

In [9]:
# Row counts of the two project frames.
initial_projects_count = initial_projects_df.shape[0]
issues_related_projects_count = issues_related_projects_df.shape[0]

# Number of distinct values in "repo_fullname". dropna=False keeps a missing
# value (if any) counted as its own entry, matching len(series.unique()).
refactorings_related_projects_count = performance_refactorings_df[
    "repo_fullname"
].nunique(dropna=False)

# Report the three totals, one per line.
print(f"Total number of projects: {initial_projects_count}")
print(f"Total number of issues-related projects: {issues_related_projects_count}")
print(f"Total number of refactorings-related projects: {refactorings_related_projects_count}")

Total number of projects: 31614
Total number of issues-related projects: 1684
Total number of refactorings-related projects: 255
