In [8]:
from functools import lru_cache

import numpy as np
import pandas as pd
from packaging import version

In [None]:
# Load the full dependencies table from the parquet file.
# NOTE(review): the file name suggests the Libraries.io 1.6.0 (2020-01-12) dependencies dump — confirm provenance.
df = pd.read_parquet("DB/dependencies-1.6.0-2020-01-12.parquet")

In [7]:
# Preview the loaded table; pandas truncates the display to the first and last rows.
df

Unnamed: 0,ID,Platform,Project Name,Project ID,Version Number,Version ID,Dependency Name,Dependency Platform,Dependency Kind,Optional Dependency,Dependency Requirements,Dependency Project ID
0,8737,Dub,a4g,197,0.1.0,2,a4g:client,Dub,runtime,false,*,
1,8738,Dub,a4g,197,0.1.0,2,a4g:server,Dub,runtime,false,*,
2,8743,Dub,ae,199,1.0.0,6,openssl,Dub,runtime,false,>=1.1.3+1.0.1g,493.0
3,8744,Dub,ae,199,1.0.1,7,openssl,Dub,runtime,true,>=1.1.3+1.0.1g,493.0
4,8752,Dub,anchovy,203,0.2.4,21,anchovy:core,Dub,runtime,false,~master,
...,...,...,...,...,...,...,...,...,...,...,...,...
190388557,192302198,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,lodash-es,NPM,runtime,false,^4.17.15,167122.0
190388558,192302197,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,jh-utils,NPM,runtime,false,^0.0.1-beta.2,4790697.0
190388559,192302196,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,classnames,NPM,runtime,false,^2.2.6,135853.0
190388560,192302195,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,antd,NPM,runtime,false,^3.26.3,844273.0


In [3]:
# Show every dependency row whose project is named "git" (any platform).
df.loc[df["Project Name"].eq("git")]

Unnamed: 0,ID,Platform,Project Name,Project ID,Version Number,Version ID,Dependency Name,Dependency Platform,Dependency Kind,Optional Dependency,Dependency Requirements,Dependency Project ID
41381,36173197,Pub,git,29674,0.1.0,9648,hop,Pub,Development,false,>=0.28.1+2,29849.0
41382,36173198,Pub,git,29674,0.1.0,9648,unittest,Pub,Development,false,>=0.10.0 <0.11.0,30534.0
41383,36173199,Pub,git,29674,0.1.0,9648,bot_test,Pub,Development,false,>=0.1.5,29305.0
41384,36173195,Pub,git,29674,0.1.0,9648,path,Pub,runtime,false,>=1.0.0 <2.0.0,30147.0
41385,36173194,Pub,git,29674,0.1.0,9648,bot_io,Pub,runtime,false,>=0.25.1 <0.26.0,29304.0
...,...,...,...,...,...,...,...,...,...,...,...,...
182682695,158868595,Conda,git,4439306,2.20.1,21817954,perl,Conda,runtime,false,5.*,4439967.0
182682696,158868594,Conda,git,4439306,2.20.1,21817954,openssl,Conda,runtime,false,">=1.1.1a,<1.1.2a",4439892.0
182682697,158868593,Conda,git,4439306,2.20.1,21817954,libgcc-ng,Conda,runtime,false,>=7.3.0,4439528.0
182682698,158868592,Conda,git,4439306,2.20.1,21817954,libcurl,Conda,runtime,false,">=7.63.0,<8.0a0",4439514.0


In [9]:
def parse_version_safe(version_str):
    """Parse a raw version value into a ``packaging.version.Version``.

    The value is coerced to ``str``, surrounding whitespace and leading ``v``
    characters are stripped, and everything from the first ``+`` or ``-``
    onwards is discarded. As a consequence ``"1.2.3-beta.1"`` and
    ``"1.2.3+build"`` both parse as ``1.2.3`` (pre-release and build
    information is not taken into account).

    Args:
        version_str: Raw version value; it is converted with ``str()`` first.

    Returns:
        The parsed ``Version``. When the cleaned string is not a valid
        version, ``Version("0.0.0")`` is returned so the entry gets a very
        low version instead of raising.
    """
    # Strip whitespace and common leading "v" prefixes (e.g. "v1.2.3").
    clean_version = str(version_str).strip().lstrip('v')
    # Drop suffixes such as "+build" or "-alpha".
    clean_version = clean_version.split('+')[0].split('-')[0]
    try:
        return version.Version(clean_version)
    except version.InvalidVersion:
        # Only parsing failures are mapped to the fallback; anything else
        # (e.g. KeyboardInterrupt during a long run) must propagate.
        return version.Version("0.0.0")

# Parse the version of every row. The parser is memoized because the same
# version strings repeat across many dependency rows, so each distinct string
# is parsed only once. The parsed values are kept in a standalone Series
# (aligned with df's index) instead of being added to df as a helper column,
# so the original frame is left untouched.
print("Parsing versions...")
parse_version_cached = lru_cache(maxsize=None)(parse_version_safe)
version_objs = df['Version Number'].apply(parse_version_cached)

# Highest parsed version per (Project Name, Platform), broadcast back to every
# row. Platform is part of the key because the same project name can exist on
# different platforms.
print("Finding latest versions...")
latest_versions = version_objs.groupby([df['Project Name'], df['Platform']]).transform('max')

# Keep only the rows whose version equals the group's latest version.
print("Filtering dataframe...")
df_latest = df[version_objs == latest_versions].copy()

original_rows = len(df)
latest_rows = len(df_latest)
# Guard against an empty input frame (would otherwise divide by zero).
reduction_pct = (original_rows - latest_rows) / original_rows * 100 if original_rows else 0.0

print(f"DataFrame original: {original_rows:,} linhas")
print(f"DataFrame filtrado: {latest_rows:,} linhas")
print(f"Redução: {reduction_pct:.1f}%")

# Sanity check on the "git" project: list the versions that survived the filter,
# ordered by parsed version rather than lexicographically.
print("\nExemplo - versões do git restantes:")
git_versions = df_latest.loc[df_latest["Project Name"] == "git", "Version Number"].unique()
print(f"Git versions: {sorted(git_versions, key=parse_version_cached)}")

Parsing versions...
Finding latest versions...
Filtering dataframe...
DataFrame original: 190,388,562 linhas
DataFrame filtrado: 19,557,940 linhas
Redução: 89.7%

Exemplo - versões do git restantes:
Git versions: ['0.1.5', '0.2.2', '1.0.1', '1.2.2', '1.5.0', '2.17.0', '2.20.1']


In [17]:
# Reload the two "latest versions" parquet files, presumably to compare how each one stores the index.
# NOTE(review): both files are written by the to_parquet cell at the end of the notebook, so that
# cell must have been run (or the files must already exist on disk) before this one.
df_filtrado_no_index = pd.read_parquet("DB/dependencies-1.6.0-2020-01-12-latest_noindex.parquet")
df_filtrado = pd.read_parquet("DB/dependencies-1.6.0-2020-01-12-latest.parquet")

In [18]:
# Display the file saved with its index: row labels are the original ones, with gaps where rows were filtered out.
df_filtrado

Unnamed: 0,ID,Platform,Project Name,Project ID,Version Number,Version ID,Dependency Name,Dependency Platform,Dependency Kind,Optional Dependency,Dependency Requirements,Dependency Project ID
0,8737,Dub,a4g,197,0.1.0,2,a4g:client,Dub,runtime,false,*,
1,8738,Dub,a4g,197,0.1.0,2,a4g:server,Dub,runtime,false,*,
3,8744,Dub,ae,199,1.0.1,7,openssl,Dub,runtime,true,>=1.1.3+1.0.1g,493.0
70,13295773,Dub,anchovy,203,0.8.3,4520756,derelict-ft,Dub,runtime,false,1.0.2,288.0
71,13295774,Dub,anchovy,203,0.8.3,4520756,dlib,Dub,runtime,false,0.7.1,346.0
...,...,...,...,...,...,...,...,...,...,...,...,...
190388557,192302198,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,lodash-es,NPM,runtime,false,^4.17.15,167122.0
190388558,192302197,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,jh-utils,NPM,runtime,false,^0.0.1-beta.2,4790697.0
190388559,192302196,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,classnames,NPM,runtime,false,^2.2.6,135853.0
190388560,192302195,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,antd,NPM,runtime,false,^3.26.3,844273.0


In [19]:
# Display the file saved with index=False: row labels are a fresh consecutive range starting at 0.
df_filtrado_no_index

Unnamed: 0,ID,Platform,Project Name,Project ID,Version Number,Version ID,Dependency Name,Dependency Platform,Dependency Kind,Optional Dependency,Dependency Requirements,Dependency Project ID
0,8737,Dub,a4g,197,0.1.0,2,a4g:client,Dub,runtime,false,*,
1,8738,Dub,a4g,197,0.1.0,2,a4g:server,Dub,runtime,false,*,
2,8744,Dub,ae,199,1.0.1,7,openssl,Dub,runtime,true,>=1.1.3+1.0.1g,493.0
3,13295773,Dub,anchovy,203,0.8.3,4520756,derelict-ft,Dub,runtime,false,1.0.2,288.0
4,13295774,Dub,anchovy,203,0.8.3,4520756,dlib,Dub,runtime,false,0.7.1,346.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19557935,192302198,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,lodash-es,NPM,runtime,false,^4.17.15,167122.0
19557936,192302197,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,jh-utils,NPM,runtime,false,^0.0.1-beta.2,4790697.0
19557937,192302196,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,classnames,NPM,runtime,false,^2.2.6,135853.0
19557938,192302195,NPM,jh-lib,4790798,0.0.1-beta.1,24293147,antd,NPM,runtime,false,^3.26.3,844273.0


In [15]:
# Persist the filtered frame in two variants:
# - index=False: the index is not written, so reloading yields a fresh default index;
# - default: the index is written as well, so reloading keeps the original row labels.
df_latest.to_parquet("DB/dependencies-1.6.0-2020-01-12-latest_noindex.parquet", index=False)
df_latest.to_parquet("DB/dependencies-1.6.0-2020-01-12-latest.parquet")