# Overview

At the time of writing, 1262 projects have been identified worldwide. Of these:
- 1188 projects are hosted on GitHub
- 27 on GitLab, and
- 125 on other platforms or self-hosted websites.

We found 996 active project repositories in total. A project is considered active if the public repository has at least one commit or closed issue within the last year. We have excluded inactive projects from our analysis as their inclusion would distort current trends. This step is important, as many open source software projects are no longer actively maintained. In the table below, the inactive open source projects are those that have become inactive since data collection began two years ago:

In [1]:
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from opensustain_template import *

In [2]:
df_raw = pd.read_csv("../csv/projects.csv")
df_raw.rename(columns={"rubric": "topic"},inplace=True)
df_raw.rename(columns={"topics": "labels"},inplace=True)

In [3]:
# Age plots are better in years
df_raw["project_age_in_years"] = df_raw["project_age_in_days"].apply(lambda x: x / 365)
max_age_in_years = 8.0

In [4]:
fig = go.Figure(
    data=[
        go.Table(
            header=dict(values=["Dimension", "Value"],line_color='#000000',
                        fill_color='#ffffff', font_size=18 ,  ),
            cells=dict(
                        fill_color='#ffffff',
                        line_color='#ffffff',
                        font_size=16,
                        height=30,
                values=[
                    [
                        "Total number of projects",
                        "Github projects",
                        "Gitlab projects",
                        "Other platforms",
                        "Number of projects in personal namespace",
                        "Number of projects in community namespace",
                        "Total stars of all projects",
                        "Total contributors in all projects",
                        "Active GitHub projects",
                        "Inactive GitHub projects",
                        "Projects with contribution guide in %",
                        "Projects with code of conduct in %",
                        "Projects accepting donations in %",
                        "Median number of commits",
                        "Median stargazers",
                        "Median stars last year",
                        "Median Development Distribution Score",
                        "Median number of contributors",
                        "Median closed issues last year",
                        "Median commits last year",
                        "Median age in years",
                    ],
                    [
                        df_raw["project_name"].count(),
                        df_raw["platform"].value_counts()["github"],
                        df_raw["platform"].value_counts()["gitlab"],
                        df_raw["platform"].value_counts()["custom"],
                        df_raw["project_name"].count() - df_raw["organization"].count(),
                        df_raw["organization"].count(),
                        df_raw["stargazers_count"].sum(),
                        df_raw["contributors"].sum(),
                        df_raw["project_active"].value_counts()[True],
                        df_raw["project_active"].value_counts()[False],
                        round(df_raw["contribution_guide"].value_counts(normalize=True)[True]*100,2),
                        round(df_raw["code_of_conduct"].value_counts(normalize=True)[True]*100,2),
                        round(df_raw["accepts_donations"].value_counts(normalize=True)[True]*100,2),
                        df_raw["total_number_of_commits"].median(),
                        df_raw["stargazers_count"].median(),
                        df_raw["stars_last_year"].median(),
                        round(df_raw["development_distribution_score"].median(),4),
                        df_raw["contributors"].median(),
                        df_raw["issues_closed_last_year"].median(),
                        df_raw["total_commits_last_year"].median(),
                        round(df_raw["project_age_in_years"].median(),2),
                        
                    ],
                ]
            ),
        )
    ]
)


fig['layout'].update(margin=dict(l=5,r=0,b=0,t=5))
fig.update_layout(
height=700,
width=750
)
fig.show()

```{figure} data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
:figclass: caption-hack
:name: statistics-all-projects

Statistics on all projects
```

In [5]:
df_active = df_raw.copy()
# Filter out the inactive project for further analysis
df_active = df_active[(df_active["project_active"] == True)]
# Ciruated Lists are no classical open source projects and are not included into the analysis
df_active = df_active[(df_active["topic"] != "Curated Lists")]
# Filter out the projects not on the GitHub platform
df_active = df_active[(df_active["platform"] == "github")]
df_active['project_name'] = df_active['project_name'].replace({'A Global Inventory of Commerical-, Industrial-, and Utility-Scale Photovoltaic Solar Generating Units': 'A Global Inventory of Photovoltaic'})
df_active['project_name'] = df_active['project_name'].replace({'Asset-level Transition Risk in the Global Coal, Oil, and Gas Supply Chains': 'Global Coal, Oil, and Gas Supply Chains'})

def text_to_link(project_name,git_url):
    return "<a href=\""+git_url+"\">"+project_name+"</a>"
    
df_active["project_name"] = df_active.apply(lambda x: text_to_link(x.project_name, x.git_url),axis=1)

In [6]:
## Hack Sector content into dataset

def topic_to_sector(topic):
    if topic in ("Photovoltaics and Solar Energy","Wind Energy","Hydro Energy","Geothermal Energy", "Bioenergy"):
        sector = "Renewable Energy"
    elif topic in ("Battery","Hydrogen"):
        sector = "Energy Storage"
    elif topic in ("Energy Modeling and Optimization","Energy Monitoring and Control","Energy Distribution and Grids","Datasets on Energy Systems"):
        sector = "Energy Systems"
    elif topic in ("Buildings and Heating","Mobility and Transportation","Production and Industry","Computation and Communication"):
        sector = "Consumption of Energy and Resources"
    elif topic in ("Carbon Intensity and Accounting","Carbon Capture and Removal","Emission Observation and Modeling"):
        sector = "Emissions"
    elif topic in ("Life Cycle Assessment","Circular Economy and Waste"):
        sector = "Industrial Ecology"
    elif topic in ("Biosphere","Cryosphere","Hydrosphere","Atmosphere"):
        sector = "Earth Systems"
    elif topic in ("Earth and Climate Modeling","Radiative Transfer","Meteorological Observation and Forecast","Climate Data Processing and Access","Integrated Assessment"):
        sector = "Climate and Earth Science"
    elif topic in ("Air Quality","Water Supply and Quality","Soil and Land","Agriculture and Nutrition","Natural Hazard and Poverty"):
        sector = "Natural Resources"
    elif topic in ("Sustainable Development Goals","Sustainable Investment","Knowledge Platforms","Data Catalogs and Interfaces","Curated Lists"):
        sector = "Sustainable Development"
    else:
        print(topic)
        raise ValueError('Topic not within sectors')
    return sector
df_active["sector"] = df_active['topic'].apply(topic_to_sector)

In [7]:
# Calculate the scores on activity, community and size
df_active["activity"] = (
    df_active["total_commits_last_year"].rank(pct=True)
    + df_active["issues_closed_last_year"].rank(pct=True)
    + df_active["days_until_last_issue_closed"].rank(pct=True)
    + df_active["last_released_date"].rank(pct=True, na_option="top")
)

df_active["community"] = (
    df_active["contributors"].rank(pct=True)
    + df_active["development_distribution_score"].rank(pct=True)
    + df_active["reviews_per_pr"].rank(pct=True)
)

df_active["size"] = (
    df_active["total_number_of_commits"].rank(pct=True)
    + df_active["contributors"].rank(pct=True)
    + df_active["closed_issues"].rank(pct=True)
    + df_active["closed_pullrequests"].rank(pct=True)
)

# All scores are weighted equal and normalized to one
df_active["total_score"] = (
    df_active["activity"] / df_active["activity"].max()
    + df_active["community"] / df_active["community"].max()
    + df_active["size"] / df_active["size"].max()
) / 3

The following scatter plot provides an overview of all projects studied that are less than ten years old. The size of the circles is proportional to the relative scale of the projects, based on total commits and contributions. The colour bar shows the Development Distribution Score (DDS) as a measure of the distribution of work among the individual developers. A high value indicates a high distribution of work and, thus, a strong developer community. More details about this can be found in chapter {ref}`dds_chapter`.

In [1]:
max_age_in_years=10
fig = px.scatter(
    df_active.query("project_age_in_years<@max_age_in_years"),
    x="project_age_in_years",
    y="topic",
    size="size",
    color="development_distribution_score",
    custom_data=["project_name","oneliner","git_url"],
    size_max=20,
)

fig.update_layout(
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    yaxis_title=None,
    xaxis_title="Project Age in Years",
    height=1000,  # Added parameter
    width=1300,
    title="Overview of all Projects",
    hoverlabel=dict(
    bgcolor="white"
)
)
fig.update_traces(
    hovertemplate="<br>".join([
        "Project Name: <b>%{customdata[0]}</b>",
        "Project Info: <b>%{customdata[1]}</b>",
        "Git URL: <b>%{customdata[2]}</b>"
    ])
)
fig.add_layout_image(
    dict(
        source=logo_img,
        xref="paper", yref="paper",
        x=1, y=1,
        sizex=0.05, sizey=0.05,
        xanchor="right", yanchor="top"
    )
)
fig.show()

NameError: name 'px' is not defined

```{figure} data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
:figclass: caption-hack
:name: overview-all-projects

Overview of all projects in various topics in the last 10 years
```

In [9]:
# Save the dataset with the scores
df_active_path = "../csv/project_analysis.csv"
df_active.to_csv(df_active_path)