# Path (dapla)

In [None]:
import os

import dapla as dp
import pandas as pd

os.chdir("../")
from fagfunksjoner import Path, PathSeries

In [None]:
# Build a Path (the class imported from fagfunksjoner) from a path string.
# NOTE(review): presumably a GCS bucket path -- confirm.
folder = Path('ssb-prod-kart-data-delt/kartdata_analyse/klargjorte-data/2023')
# Bare trailing expression so the notebook displays the object.
folder

In [None]:
folder.ls()

In [None]:
# List the folder, select the files, narrow to paths containing
# "ABAS_kommune", then filter on the series' `mb` attribute
# (presumably the size in megabytes -- confirm).
(
    folder.ls()
    .files()
    .containing("ABAS_kommune")
    .loc[lambda paths: paths.mb > 100]
)

## Fungerer som tekst

In [None]:
folder.startswith("ssb")

In [None]:
"ssb" in folder

In [None]:
# The Path is handed directly to the GCS file system's exists(),
# in the place where a plain path string would normally go.
dp.FileClient.get_gcs_file_system().exists(folder)

In [None]:
# Join a file name onto the folder with the `/` operator.
file = folder / "ABAS_kommune_utenhav_p2023_v1.parquet"

# The Path is passed to dapla's reader as-is; only the KOMMUNENR column is requested.
dp.read_pandas(file, columns=["KOMMUNENR"])

## Med metoder og attributter à la pathlib.Path

In [None]:
folder.exists()

In [None]:
folder.is_dir()

In [None]:
file = folder / "ABAS_kommune_utenhav_p2023_v1.parquet"
file

In [None]:
file.parent

## Og noen pandas-attributter

Uten å lese filen

In [None]:
file.columns

In [None]:
file.dtypes

In [None]:
file.shape

## Versjonering

In [None]:
file.version_number

In [None]:
file.versions()

In [None]:
file.latest_version()

In [None]:
file.highest_numbered_version()

In [None]:
# Path for the next version: highest_numbered_version + 1.
file.new_version()

In [None]:
# Always False.
# NOTE(review): presumably because the new version path has not been written yet -- confirm.
file.new_version().exists()

In [None]:
# Regex used to find/remove the version number.
file._version_pattern

## Periode

In [None]:
file.periods

In [None]:
file.with_periods("2023-01")

In [None]:
file.with_periods("2023-01-01")

In [None]:
file.with_periods("2023-01-01").periods

In [None]:
file.with_periods("2023", "2024")

In [None]:
file.with_periods("2023", "2024").periods

In [None]:
# Find the right file starting from its stem: attach the period to the
# stem, then resolve to the highest numbered version.
year = 2023
stem = "ABAS_kommune_utenhav"

file = (folder / stem).with_periods(year).highest_numbered_version()
file

In [None]:
# Same lookup as for an existing stem, but an error is raised
# when the file does not exist.
year = 2023
stem = "ABAS_hav"

file = (folder / stem).with_periods(year).highest_numbered_version()
file

In [None]:
# An error is raised when more than one file matches the stem.
# No period is attached in this lookup, so the bare stem is what gets
# matched; the unused `year` assignment has therefore been dropped.
stem = "ABAS_kommune"

file = (folder / stem).highest_numbered_version()
file

## Mer om ls

Stien kopieres (som med Ctrl+C) når man klikker på den.

In [None]:
files_in_dir = file.parent.ls()
files_in_dir

In [None]:
# The listing is a subclass of pandas.Series.
type(files_in_dir)

In [None]:
files_in_dir.loc[lambda x: x.gb > 10].keep_latest_versions()

In [None]:
# The individual paths in the series are still Path objects.
type(files_in_dir.iloc[0])

In [None]:
# Select the files from the listing.
folder.ls().files().keep_highest_numbered_versions()

In [None]:
folder.parent.dirs()

In [None]:
# Same as using .loc with x.str.contains.
folder.ls().containing("kommune")

In [None]:
# Also the same as folder.ls().containing("kommune"), written as a glob pattern.
folder.glob("*kommune*")

In [None]:
folder.ls().within_days(100)

In [None]:
file.parent.parent.ls(recursive=True).files()

In [None]:
# Listing of the grandparent directory, annotated as a PathSeries.
folders: PathSeries = file.parent.parent.ls()
# NOTE(review): presumably lists the contents of each directory in the series -- confirm.
folders.ls_dirs()

In [None]:
pd.concat(folders.ls_dirs())

## Write to testpath

In [None]:
# Path used for the write demo.
testpath = Path(
    'ssb-prod-dapla-felles-data-delt/path_test/test_df_p2023.parquet'
)

# Delete every existing version first.
for version in testpath.versions():
    version.rm_file()

# Rebind testpath to the next version path (highest_numbered_version + 1).
testpath = testpath.new_version()

# Nothing has been written to the new version path at this point.
testpath.exists()

In [None]:
testpath

In [None]:
# Small demo frame: an integer column and a single-character string column.
df = pd.DataFrame({"x": [1, 2, 3], "y": list("abc")})

# Write the frame to the test path with dapla.
dp.write_pandas(df, testpath)

# Check the path again after the write.
testpath.exists()

In [None]:
testpath.latest_version()

In [None]:
# highest_numbered_version + 1.
# NOTE(review): timeout=None presumably turns off the default timeout check -- confirm.
testpath.new_version(timeout=None)

In [None]:
# By default new_version() has a 30-minute timeout, as a guard against
# saving new versions in a loop or similar.
for _ in range(5):
    dp.write_pandas(df, testpath.new_version())

In [None]:
# Same loop, but with timeout=None passed to new_version().
# NOTE(review): presumably this disables the default timeout guard -- confirm.
for _ in range(5):
    dp.write_pandas(df, testpath.new_version(timeout=None))

In [None]:
# Write to a path with an explicitly chosen version number (101)
# rather than the one new_version() would pick.
dp.write_pandas(df, testpath.with_version(101))

In [None]:
testpath.versions()