In [1]:
import os
import pandas as pd

In [2]:
INPUT = os.path.join("..", "input", "merged.csv")
df = pd.read_csv(INPUT, delimiter=";")


In [3]:
# initial cleaning
df['usage_cnt'] = pd.to_numeric(df['usage_cnt'].replace(",", ""))
df['parsed_date'] = pd.to_datetime(df['parsed_date'])
df_orig = df
print(df_orig)
print(df_orig.groupby('dependency_name').count())

                                 dependency_name       version parsed_date  \
0                            org.slf4j.slf4j-api  2.1.0-alpha1  2024-01-02   
1                            org.slf4j.slf4j-api  2.1.0-alpha1  2024-01-02   
2                            org.slf4j.slf4j-api  2.1.0-alpha0  2023-12-28   
3                            org.slf4j.slf4j-api        2.0.16  2024-08-10   
4                            org.slf4j.slf4j-api        2.0.15  2024-08-08   
...                                          ...           ...         ...   
125625  com.esotericsoftware.yamlbeans.yamlbeans          1.12  2017-05-31   
125626  com.esotericsoftware.yamlbeans.yamlbeans          1.11  2017-01-02   
125627  com.esotericsoftware.yamlbeans.yamlbeans          1.09  2015-12-28   
125628  com.esotericsoftware.yamlbeans.yamlbeans          1.08  2015-12-28   
125629  com.esotericsoftware.yamlbeans.yamlbeans          1.06  2015-12-28   

        usage_cnt                                       downloa

In [4]:
# resolve relocations
df['num_relocations'] = 0
relocation_map = dict(zip(df['dependency_name'], df['relocation_name']))

def resolve_relocations(df):
    updated = False

    for index, row in df.iterrows():
        dep_name = row['dependency_name']
        reloc_name = row['relocation_name']

        if pd.notna(reloc_name) and reloc_name in relocation_map:
            new_dep_name = reloc_name
            new_reloc_name = relocation_map.get(new_dep_name, None)

            df.at[index, 'num_relocations'] += 1

            if new_dep_name != dep_name or new_reloc_name != reloc_name:
                df.at[index, 'dependency_name'] = new_dep_name
                df.at[index, 'relocation_name'] = new_reloc_name
                updated = True

    return updated


# Run recursion until no changes occur
while resolve_relocations(df):
    pass  # Keep looping until no more updates are made

In [5]:
df = df.sort_values(by=['num_relocations']).drop_duplicates(subset=['dependency_name', 'version'], keep='first')

In [6]:
# Find non-semantic version values
df['version'] = df['version'].astype(str)

semver_pattern = r'^\d+\.\d+(\.\d+){0,3}$'
release_tags = ['final', 'Final', 'FINAL', 'RELEASE', 'Release', 'release', 'GA', 'jdk', 'renjin', 'shipit']
for release_tag in release_tags:
    df['version'] = df['version'].str.replace("-" + release_tag,'')
    df['version'] = df['version'].str.replace("." + release_tag,'')

# turn milestone releases into subversion

alpha_tags = ['alpha', 'beta', 'b', 'B', 'A', 'a', 'rc', 'RC', 'Rc', 'M', 'm', 'android', 'pre', 
              'Preview', 'preview', 'PREVIEW', 'snapshot', 'SNAPSHOT', 'Snapshot', 'incubating', 'CR', 'cr', 'Cr', 'ALPHA', 'BETA',
             'sec', 'Sec', 'SEC', 'dev', 'Dev', 'DEV', 'SNSAPSHOT', 'android', 'graal', 'rev', 'groovyless', 'support', 'odps', 'FOR', 'for', 'SNAPHOT',
             'wolfc', 'pr', 'PFD', 'wso2v1', 'dse', 'sp', 'SP', 'jre6', 'jre7', 'FD', 'pd', 'rr', 'JS', 'next', 'git', 'EDR', 'full', 'wso2v3']
for alpha_tag in alpha_tags:
    df = df[~df.version.str.contains(alpha_tag)]

# now for guava get rid of the jre in the jre-version (we removed the android version)
df['version'] = df['version'].str.replace(".jre",'')

df['version'] = df['version'].str.replace('_', '.')
df['version'] = df['version'].str.replace('-', '.')

# airlift versioning fix, 0.123 , ... 200, 201, -> no semantic versioning, but is fine
io_airlift_packages = ['io.airlift.concurrent','io.airlift.configuration', 'io.airlift.json', 'io.airlift.log', 'io.airlift.log-manager']

# drop date pattern version numbers if they are not that common (< 20%) e.g., for org.json all versions use the date pattern, so keep it for that
df['is_date_version'] = df['version'].astype(str).str.contains(r'\d{8,}\.*\d*', regex=True)
date_version_ratio = df.groupby('dependency_name')['is_date_version'].mean()
dependencies_to_filter = date_version_ratio[date_version_ratio < 0.20].index
df = df[~((df['dependency_name'].isin(dependencies_to_filter)) & (df['is_date_version']))]
df = df.drop(columns=['is_date_version'])

# reformat version numbers that prepended a 'v':
df['version'] = df['version'].str.replace(r'^v(?=\d+)', '', regex=True)
# get rid of R used as "release" between two digits (e.g. 1.5R4)
df['version'] = df['version'].str.replace(r'(?<=\d)[rR](?=\d)', '.', regex=True)
df['version'] = df['version'].str.replace(r'(?<=\d)\.[rR](?=\d)', '', regex=True)
df['version'] = df['version'].str.replace(r'(?<=\d)[uU](?=\d)', '.', regex=True)
df['version'] = df['version'].str.replace(r'^[rR](\d+)$', r'0.\1', regex=True)

# finally replace anything coming after a semantic version number (e.g., some append the hash into the version ...)
df['version'] = df['version'].str.replace(r'^(\d+\.\d+(?:\.\d+)?(?:\.\d+)?).*$', r'\1', regex=True)

# print out remaining dependencies with non-conform version names, so we can investigate and deduce whether we need to replace
#print(df[~df['version'].astype(str).str.match(semver_pattern, na=False)].groupby('dependency_name').count().to_string())

In [7]:
df = df.drop_duplicates(subset=['version', 'dependency_name'])
print(df)
print(df.groupby('dependency_name').count())

                                dependency_name version parsed_date  \
82186   org.jenkins-ci.plugins.cloudbees-folder   6.0.2  2017-03-08   
82185   org.jenkins-ci.plugins.cloudbees-folder   6.0.3  2017-03-16   
82184   org.jenkins-ci.plugins.cloudbees-folder   6.0.4  2017-05-02   
82183   org.jenkins-ci.plugins.cloudbees-folder   6.1.0  2017-07-17   
82182   org.jenkins-ci.plugins.cloudbees-folder   6.1.1  2017-08-03   
...                                         ...     ...         ...   
124695                        xml-apis.xml-apis  1.3.03  2006-05-12   
124694                        xml-apis.xml-apis  1.3.04  2006-12-22   
124693                        xml-apis.xml-apis  1.4.01  2011-08-20   
124692                        xml-apis.xml-apis   2.0.0  2005-11-19   
124691                        xml-apis.xml-apis   2.0.2  2005-11-19   

        usage_cnt                                       download_url  \
82186       131.0  https://repo.jenkins-ci.org/releases/org/jenki...   
821

In [8]:
df = df.drop(columns=['num_relocations'])
df.to_csv("clean_versions.csv",encoding='utf-8',index=False)