From 7f64f023d134bffe53429e72e785bfeba94b6cba Mon Sep 17 00:00:00 2001 From: Kris Beicher <112945740+K-Beicher@users.noreply.github.com> Date: Thu, 6 Mar 2025 13:55:09 +0100 Subject: [PATCH] feat: add scripts to fetch and process data --- poetry.lock | 124 +++++++++++------- pyproject.toml | 2 + scripts/convert-meta.py | 188 +++++++++++++++++++++++++++ scripts/convert-samples.py | 170 +++++++++++++++++++++++++ scripts/download-data.py | 255 +++++++++++++++++++++++++++++++++++++ 5 files changed, 692 insertions(+), 47 deletions(-) create mode 100644 scripts/convert-meta.py create mode 100644 scripts/convert-samples.py create mode 100644 scripts/download-data.py diff --git a/poetry.lock b/poetry.lock index 7578301..fdedad7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -319,6 +319,28 @@ files = [ {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, ] +[[package]] +name = "fastexcel" +version = "0.13.0" +description = "A fast excel file reader for Python, written in Rust" +optional = false +python-versions = ">=3.9" +files = [ + {file = "fastexcel-0.13.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:709f44440312ecd909856f9335d8224c1f3d57ea50a136ec5f88fab86fc153d7"}, + {file = "fastexcel-0.13.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:40978c1aae4888db105332caac8bef9492953ffe8fd4f195e306d8a11464e5a6"}, + {file = "fastexcel-0.13.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eeff6ab300b8affd9b59acc55af53c72f53c7d8cd28d2cd7071dd046d15dd34c"}, + {file = "fastexcel-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d92bc6e8caa6685f2901b2efe173a5cc6e975b62bfc555844373415aee5a255a"}, + {file = "fastexcel-0.13.0-cp39-abi3-win_amd64.whl", hash = "sha256:10297f6c8146691e9d0e6b22bc1b47bae49a522a8edd3150f19b4d5d3eef2a01"}, + {file = "fastexcel-0.13.0.tar.gz", hash = "sha256:308112093a73fb5a1f1e1619df55b72c882ef26234de2e2b8bd76d1781c335e1"}, +] + 
+[package.dependencies] +pyarrow = ">=8.0.0" + +[package.extras] +pandas = ["pandas (>=1.4.4)"] +polars = ["polars (>=0.16.14)"] + [[package]] name = "fastparquet" version = "2024.11.0" @@ -612,17 +634,6 @@ files = [ {file = "ijson-3.3.0.tar.gz", hash = "sha256:7f172e6ba1bee0d4c8f8ebd639577bfe429dee0f3f96775a067b8bae4492d8a0"}, ] -[[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" -optional = false -python-versions = ">=3.7" -files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, -] - [[package]] name = "isodate" version = "0.7.2" @@ -1121,21 +1132,6 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] type = ["mypy (>=1.11.2)"] -[[package]] -name = "pluggy" -version = "1.5.0" -description = "plugin and hook calling mechanisms for python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, - {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, -] - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - [[package]] name = "polars" version = "1.24.0" @@ -1179,6 +1175,60 @@ timezone = ["tzdata"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] xlsxwriter = ["xlsxwriter"] +[[package]] +name = "pyarrow" +version = "19.0.1" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pyarrow-19.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = 
"sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69"}, + {file = "pyarrow-19.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec"}, + {file = "pyarrow-19.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad76aef7f5f7e4a757fddcdcf010a8290958f09e3470ea458c80d26f4316ae89"}, + {file = "pyarrow-19.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d03c9d6f2a3dffbd62671ca070f13fc527bb1867b4ec2b98c7eeed381d4f389a"}, + {file = "pyarrow-19.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:65cf9feebab489b19cdfcfe4aa82f62147218558d8d3f0fc1e9dea0ab8e7905a"}, + {file = "pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:41f9706fbe505e0abc10e84bf3a906a1338905cbbcf1177b71486b03e6ea6608"}, + {file = "pyarrow-19.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:c6cb2335a411b713fdf1e82a752162f72d4a7b5dbc588e32aa18383318b05866"}, + {file = "pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90"}, + {file = "pyarrow-19.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00"}, + {file = "pyarrow-19.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0148bb4fc158bfbc3d6dfe5001d93ebeed253793fff4435167f6ce1dc4bddeae"}, + {file = "pyarrow-19.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f24faab6ed18f216a37870d8c5623f9c044566d75ec586ef884e13a02a9d62c5"}, + {file = "pyarrow-19.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:4982f8e2b7afd6dae8608d70ba5bd91699077323f812a0448d8b7abdff6cb5d3"}, + {file = "pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6"}, + {file = "pyarrow-19.0.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466"}, + {file = "pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:80b2ad2b193e7d19e81008a96e313fbd53157945c7be9ac65f44f8937a55427b"}, + {file = "pyarrow-19.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:ee8dec072569f43835932a3b10c55973593abc00936c202707a4ad06af7cb294"}, + {file = "pyarrow-19.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5d1ec7ec5324b98887bdc006f4d2ce534e10e60f7ad995e7875ffa0ff9cb14"}, + {file = "pyarrow-19.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ad4c0eb4e2a9aeb990af6c09e6fa0b195c8c0e7b272ecc8d4d2b6574809d34"}, + {file = "pyarrow-19.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d383591f3dcbe545f6cc62daaef9c7cdfe0dff0fb9e1c8121101cabe9098cfa6"}, + {file = "pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b4c4156a625f1e35d6c0b2132635a237708944eb41df5fbe7d50f20d20c17832"}, + {file = "pyarrow-19.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:5bd1618ae5e5476b7654c7b55a6364ae87686d4724538c24185bbb2952679960"}, + {file = "pyarrow-19.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e45274b20e524ae5c39d7fc1ca2aa923aab494776d2d4b316b49ec7572ca324c"}, + {file = "pyarrow-19.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d9dedeaf19097a143ed6da37f04f4051aba353c95ef507764d344229b2b740ae"}, + {file = "pyarrow-19.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ebfb5171bb5f4a52319344ebbbecc731af3f021e49318c74f33d520d31ae0c4"}, + {file = "pyarrow-19.0.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a21d39fbdb948857f67eacb5bbaaf36802de044ec36fbef7a1c8f0dd3a4ab2"}, + {file = "pyarrow-19.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:99bc1bec6d234359743b01e70d4310d0ab240c3d6b0da7e2a93663b0158616f6"}, + {file = "pyarrow-19.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash 
= "sha256:1b93ef2c93e77c442c979b0d596af45e4665d8b96da598db145b0fec014b9136"}, + {file = "pyarrow-19.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:d9d46e06846a41ba906ab25302cf0fd522f81aa2a85a71021826f34639ad31ef"}, + {file = "pyarrow-19.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c0fe3dbbf054a00d1f162fda94ce236a899ca01123a798c561ba307ca38af5f0"}, + {file = "pyarrow-19.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:96606c3ba57944d128e8a8399da4812f56c7f61de8c647e3470b417f795d0ef9"}, + {file = "pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f04d49a6b64cf24719c080b3c2029a3a5b16417fd5fd7c4041f94233af732f3"}, + {file = "pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a9137cf7e1640dce4c190551ee69d478f7121b5c6f323553b319cac936395f6"}, + {file = "pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:7c1bca1897c28013db5e4c83944a2ab53231f541b9e0c3f4791206d0c0de389a"}, + {file = "pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8"}, + {file = "pyarrow-19.0.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:b9766a47a9cb56fefe95cb27f535038b5a195707a08bf61b180e642324963b46"}, + {file = "pyarrow-19.0.1-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:6c5941c1aac89a6c2f2b16cd64fe76bcdb94b2b1e99ca6459de4e6f07638d755"}, + {file = "pyarrow-19.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8"}, + {file = "pyarrow-19.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:335d170e050bcc7da867a1ed8ffb8b44c57aaa6e0843b156a501298657b1e972"}, + {file = "pyarrow-19.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:1c7556165bd38cf0cd992df2636f8bcdd2d4b26916c6b7e646101aff3c16f76f"}, + {file = "pyarrow-19.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = 
"sha256:699799f9c80bebcf1da0983ba86d7f289c5a2a5c04b945e2f2bcf7e874a91911"}, + {file = "pyarrow-19.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:8464c9fbe6d94a7fe1599e7e8965f350fd233532868232ab2596a71586c5a429"}, + {file = "pyarrow-19.0.1.tar.gz", hash = "sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e"}, +] + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pydantic" version = "2.10.6" @@ -1352,26 +1402,6 @@ engineering = ["unyt"] spark = ["pyspark"] test = ["hypothesis (>=4.4.0)", "interrogate", "pandas-vet", "polars", "py (>=1.10.0)", "pytest (>=3.4.2)", "pytest-cov", "pytest-xdist"] -[[package]] -name = "pytest" -version = "8.3.5" -description = "pytest: simple powerful testing with Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, - {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=1.5,<2" - -[package.extras] -dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] - [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -2044,4 +2074,4 @@ docs = ["Sphinx", "elementpath (>=4.4.0,<5.0.0)", "jinja2", "sphinx-rtd-theme"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "e0a6ec98d0ae5e0b527c1115f243b741c2184b8f20b14c196ed4f914565627ef" +content-hash = "012e3e32e40fecd537318715c06c627697764ac0f1120d98ca8f375ee1c3781a" diff --git a/pyproject.toml b/pyproject.toml index 75b7227..f77b3a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,8 @@ python = "^3.12" polars = "^1.24.0" pyjanitor = "^0.30.0" seedcase-sprout = {git = 
"https://github.com/seedcase-project/seedcase-sprout.git"} +requests = "^2.32.3" +fastexcel = "^0.13.0" [tool.poetry.group.dev.dependencies] ruff = "^0.6.2" diff --git a/scripts/convert-meta.py b/scripts/convert-meta.py new file mode 100644 index 0000000..d928e58 --- /dev/null +++ b/scripts/convert-meta.py @@ -0,0 +1,188 @@ +from pathlib import Path + +import janitor.polars +import polars as pl + +# Set the folder path for raw-data +resource_dir = Path(__file__).resolve().parent.parent +folder_path = resource_dir / "data-raw" +unzip_path = folder_path / "downloaded" + +# Create the infant metadata files +# FIO All infants in meta_infant_ur are included in meta_infant_bl. +# There is one extra infant in meta_infant_bl. +df_meta_infant_bl = pl.read_csv(unzip_path / "metadata_infant_blood.csv") +df_meta_infant_ur = pl.read_csv(unzip_path / "metadata_infant_urine.csv") + +df_infant = ( + df_meta_infant_bl.select( + ["Infant_ID", "Mother_ID", "GD_Delivery", "Fostered", "Foster_ID"] + ) + .rename({"GD_Delivery": "gestation_day_delivery"}) + .unique() + .clean_names() + .write_csv(folder_path / "data_infant_meta.csv", separator=";") +) +# infant gender is in data_adult at present, all subjects are male, +# females are not included in the study + +df_infant_w1 = ( + df_meta_infant_bl.select(["Infant_ID", "Weight_PD7", "ActualDay_PD7"]) + .rename({"Weight_PD7": "weight_at_x_days_old", "ActualDay_PD7": "x_days_old"}) + .clean_names() +) +df_infant_w2 = ( + df_meta_infant_ur.select(["Infant_ID", "Infant_weight", "PD"]) + .rename({"Infant_weight": "weight_at_x_days_old", "PD": "x_days_old"}) + .clean_names() +) +df_infant_weight = ( + pl.concat([df_infant_w1, df_infant_w2]) + .unique() + .clean_names() + .write_csv(folder_path / "data_infant_weight.csv", separator=";") +) # Concatenate (long) + +# Create the infant linking tables + +matter1 = "Blood" +matter2 = "Urine" + +df_link_infant1 = df_meta_infant_bl.select( + ["Infant_ID", "Exp", "PD", "Batch"] 
+).with_columns(pl.lit(matter1).alias("type_of_matter")) +df_link_infant2 = df_meta_infant_ur.select( + ["Infant_ID", "Exp", "PD", "Batch"] +).with_columns(pl.lit(matter2).alias("type_of_matter")) + +df_infant_link = ( + pl.concat([df_link_infant1, df_link_infant2]) # Concatenate long + .rename({"PD": "day_sample_taken", "Exp": "infant_sample_id"}) + .clean_names() + .write_csv(folder_path / "data_infant_sample_meta.csv", separator=";") +) + +# Create the adult metadata files +# FIO There are no additional Mother_ID in file 11 +df_metadata_maternal_bl = pl.read_csv(unzip_path / "metadata_maternal_blood.csv") +df_metadata_maternal_ur = pl.read_csv(unzip_path / "metadata_maternal_urine.csv") +df_metadata_maternal_pl = pl.read_csv(unzip_path / "metadata_maternal_placenta.csv") + +# Adult metadata file +df_adult_meta1 = df_metadata_maternal_bl.select( + [ + "Mother_ID", + "Mother_age", + "GD_Delivery", + "Group", + "Mode_birth", + "Reject", + "Infant_sex", + ] +) +df_adult_meta2 = df_metadata_maternal_ur.select( + [ + "Mother_ID", + "Mother_age", + "GD_Delivery", + "Group", + "Mode_birth", + "Reject", + "Infant_sex", + ] +) + +df_adult = ( + pl.concat([df_adult_meta1, df_adult_meta2]) # Concatenate long + .rename( + { + "Mother_age": "Age_at_conception", + "GD_Delivery": "gestation_day_at_delivery", + "Group": "obesity_classification", + "Mode_birth": "mode_of_birth", + } + ) + .clean_names() + .unique() + .write_csv(folder_path / "data_adult_meta.csv", separator=";") +) +# Adult weight file +df_adult_weight1 = df_metadata_maternal_bl.select( + ["Mother_ID", "GD_day", "GD_targeted", "Mother_Weight", "BCS"] +).rename( + { + "GD_day": "sample_gestation_day", + "GD_targeted": "target_gestation_day", + "Mother_Weight": "weight_at_gestation_day", + "BCS": "body_condition_score", + } +) +df_adult_weight2 = df_metadata_maternal_ur.select( + ["Mother_ID", "GD", "Target_GD", "Mother_Weight", "BCS"] +).rename( + { + "GD": "sample_gestation_day", + "Target_GD": 
"target_gestation_day", + "Mother_Weight": "weight_at_gestation_day", + "BCS": "body_condition_score", + } +) +df_adult_weight = ( + pl.concat([df_adult_weight1, df_adult_weight2]) + .unique() + .write_csv(folder_path / "data_adult_weight.csv", separator=";") +) # Concatenate long + +# Create the adult linking tables + +# matter1 = "Blood" define above in infant linking tables +# matter2 = "Urine" define above in infant linking tables +matter3 = "Placenta" + +df_link_adult1 = ( + df_metadata_maternal_bl.select( + ["Mother_ID", "Exp", "GD_day", "GD_targeted", "Batch", "Dilution_factor"] + ) + .rename({"GD_day": "GD", "GD_targeted": "Target_GD"}) + .with_columns(pl.lit(matter1).alias("type_of_matter")) +) +df_link_adult2 = df_metadata_maternal_ur.select( + ["Mother_ID", "Exp", "GD", "Target_GD", "Batch", "Dilution_factor"] +).with_columns(pl.lit(matter2).alias("type_of_matter")) +df_link_adult3 = df_metadata_maternal_pl.select( + ["Mother_ID", "Exp", "GD", "Target_GD", "Batch", "Dilution_factor"] +).with_columns(pl.lit(matter3).alias("type_of_matter")) + +df_adult_link = ( + pl.concat([df_link_adult1, df_link_adult2, df_link_adult3]) # Concatenate long + .rename( + { + "GD": "day_sample_taken", + "Target_GD": "target_sampling_day", + "Exp": "adult_sample_id", + } + ) + .write_csv(folder_path / "data_adult_sample_meta.csv", separator=";") +) + +# Create the placenta file +# Both from metadata_maternal_bl (Placenta_Width,Placenta_Height, +# Placenta_Thickness,EPV) and from 11 (see below) +# More than one measure in metadata_maternal_bl? +""" +11 - Metadata_Maternal_placenta.csv +* Variables +o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.placenta.csv file. +o Mother_ID: IDs of mothers. +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o GD: Exact gestational day (GD) when samples were collected. +o Target_GD: Target GD for sample collection. 
+o Dilution_factor: Dilution factor used to prepare NMR samples. +o BCS: Body Condition Score (BCS) +o Tissue_weight: Weight of placental tissue sample. +o V1: Volume of solvent used to extract (uL). Used to correct the metabolite concentration. +o V2: Volume of polar layer (methanol + water) collected (uL). Used to correct the metabolite concentration. +o V3: Buffer added to reconstitute the sample after freeze drying (uL). Used to correct the metabolite concentration. +* Missing data codes: Indicated by NAs. +""" diff --git a/scripts/convert-samples.py b/scripts/convert-samples.py new file mode 100644 index 0000000..b6717d1 --- /dev/null +++ b/scripts/convert-samples.py @@ -0,0 +1,170 @@ +from pathlib import Path + +import janitor.polars +import polars as pl +import polars.selectors as cs + +# Set the folder path for raw-data +resource_dir = Path(__file__).resolve().parent.parent +folder_path = resource_dir / "data-raw" +unzip_path = folder_path / "downloaded" + +# Create the metabolite data files + + +# Transpose a single file +def transpose_data(load_file_name: Path, save_file_name: Path, monkey_type: str): + """Reads a CSV file, transposes it, and writes the result to another CSV.""" + pl.read_csv(unzip_path / load_file_name).transpose( + include_header=True, header_name=monkey_type, column_names="Metabolite" + ).write_csv(folder_path / save_file_name) + + +# Find files and generate corresponding output filenames +load_file_names = list(unzip_path.glob("concentration_*.csv")) +save_file_names = [folder_path / f"data_{file.name}" for file in load_file_names] + +# Extract 'maternal' or 'infant' from the filenames +monkey_types = [file.stem.split("_")[1] + "_sample_id" for file in load_file_names] + +list(map(transpose_data, load_file_names, save_file_names, monkey_types)) + +# Cortisol data +df_cortisol_infant_bl = pl.read_excel(unzip_path / "cortisol_infant_blood.xlsx") + +df_cortisol = df_cortisol_infant_bl.unpivot( + ["samp1", "samp2", "samp3", "samp4"], 
index=["Infant_ID", "PD"]
+)
+# Hours since the 9 am separation; samp3 is 8:30 am next day, samp4 30 min later
+time_mapping = {"samp1": "02:00", "samp2": "07:00", "samp3": "23:30", "samp4": "24:00"}
+
+df_cortisol = (
+    df_cortisol.with_columns(
+        pl.col("variable").replace_strict(time_mapping).alias("hours_into_sampling")
+    )
+    .select(["Infant_ID", "PD", "hours_into_sampling", "value"])
+    .rename(
+        {
+            "value": "cortisol_level",
+            "Infant_ID": "infant_id",
+            "PD": "post_gestation_day",
+        }
+    )
+    .clean_names()
+    .write_csv(folder_path / "data_cortisol_infant_blood.csv")
+)
+# Cytokine data both sets
+df_cytokine_infant_bl = pl.read_csv(unzip_path / "cytokine_infant_blood.csv")
+df_cytokine_infant_bl = (
+    df_cytokine_infant_bl.drop(
+        [
+            "Group",
+            "PD",
+            "Target_PD",
+            "Mother_ID",
+            "GD_Delivery",
+            "Mode_birth",
+            "Fostered",
+            "Foster_ID",
+        ]
+    )
+    .clean_names()
+    .write_csv(folder_path / "data_cytokine_infant_bl.csv", separator=";")
+)
+
+df_cytokine_maternal_bl = pl.read_csv(unzip_path / "cytokine_maternal_blood.csv")
+
+df_cytokine_maternal_bl = (
+    df_cytokine_maternal_bl.drop(["Group", "GD", "Target_GD"])
+    .clean_names()
+    .write_csv(folder_path / "data_cytokine_maternal_blood.csv", separator=";")
+)
+# May want to have a closer look at 14 at some point
+
+# Human intruder testing
+df_infant_behavior = pl.read_excel(unzip_path / "hi_infant_behavior.xlsx")
+
+profile_mapping = {
+    "pfscratch": "Profile-Far",
+    "pnscratch": "Profile-Near",
+    "sfscratch": "Stare-Far",
+    "snscratch": "Stare-Near",
+}
+
+df_behavior = df_infant_behavior.unpivot(
+    ["pfscratch", "pnscratch", "sfscratch", "snscratch"], index=["Infant_ID", "PD"]
+)
+
+df_behavior = (
+    df_behavior.with_columns(
+        pl.col("variable")
+        .replace_strict(profile_mapping)
+        .alias("presentation_of_profile")
+    )
+    .select(["Infant_ID", "PD", "presentation_of_profile", "value"])
+    .rename(
+        {"value": "scratches", "Infant_ID": "infant_id", "PD": "post_gestation_day"}
+    )
+    .clean_names()
+    .write_csv(folder_path / "data_hi_infant_behavior.csv")
+)
+
+# Cognition testing of
infants +df_infant_cognitive = pl.read_excel(unzip_path / "vpc_infant_cognitive.xlsx") + +df_cognitive = df_infant_cognitive.unpivot( + [ + "P1T1 Total number Look RIGHT FAM", + "P1T1 Total number Look LEFT NOVEL", + "P1T1 Total number Look Away", + "P1T2 Total number Look RIGHT NOVEL", + "P1T2 Total number Look LEFT FAM", + "P1T2 Total number Look Away", + "P2T1 Total number Look RIGHT NOVEL", + "P2T1 Total number Look LEFT FAM", + "P2T1 Total number Look Away", + "P2T2 Total number Look RIGHT FAM", + "P2T2 Total number Look LEFT NOVEL", + "P2T2 Total number Look Away", + "P3T1 Total number Look RIGHT NOVEL", + "P3T1 Total number Look LEFT FAM", + "P3T1 Total number Look Away", + "P3T2 Total number Look RIGHT FAM", + "P3T2 Total number Look LEFT NOVEL", + "P3T2 Total number Look Away", + "P4T1 Total number Look RIGHT FAM", + "P4T1 Total number Look LEFT NOVEL", + "P4T1 Total number Look Away", + "P4T2 Total number Look RIGHT NOVEL", + "P4T2 Total number Look LEFT FAM", + "P4T2 Total number Look Away", + "no.look_N", + "no.look_F", + "no.look_N+F", + "no.look_N/N+F", + ], + index=["Infant_ID", "GD_delivery", "Batch", "PCD"], +) + +df_cognitive = ( + df_cognitive.with_columns(pl.col("variable")) + .select(["Infant_ID", "GD_delivery", "Batch", "PCD", "variable", "value"]) + .rename( + { + "value": "Looks", + "variable": "type_of_test", + "Infant_ID": "infant_id", + "GD_delivery": "gestation_day_at_delivery", + "PCD": "post_conception_day", + } + ) + .clean_names() + .write_csv(folder_path / "data_vpc_infant_cognitive.csv") +) + +# TO DO +# 9 Meta maternal blood +# Batch,Dilution_factor +# 10 Meta maternal urine +# Batch,Dilution_factor +# Check if there are more weight measures to be found in 15 diff --git a/scripts/download-data.py b/scripts/download-data.py new file mode 100644 index 0000000..8905155 --- /dev/null +++ b/scripts/download-data.py @@ -0,0 +1,255 @@ +"""Downloads a data file from a given URL. + +This is the most basic of the download data scripts. 
The url for
+the data is hard-coded below, and the script assumes that the data-raw folder
+and its "downloaded" subfolder already exist (data-raw holds a .gitignore file).
+"""
+
+import os
+from pathlib import Path
+from zipfile import ZipFile
+
+import requests
+
+resource_dir = Path(__file__).resolve().parent.parent
+folder_path = resource_dir / "data-raw"
+unzip_path = folder_path / "downloaded"
+
+# Download and save the zip file
+all_files = requests.get("https://zenodo.org/api/records/7055715/files-archive")
+all_files.raise_for_status()  # stop here rather than saving an HTTP error page
+all_files_path = unzip_path / "all_files.zip"
+with open(all_files_path, "wb") as file:
+    file.write(all_files.content)
+
+# Extract the zip file
+with ZipFile(all_files_path, "r") as zip_ref:
+    zip_ref.extractall(unzip_path)
+
+# Rename the files to snake_case (stem only, so the extension stays intact)
+for file in unzip_path.iterdir():
+    if file.name.endswith((".csv", ".xlsx")):
+        new_name = file.stem.lower().replace(".", "_") + file.suffix
+        file.rename(unzip_path / new_name)
+
+"""
+1 - Concentration_Infant.blood.csv
+* Columns: Samples. The same “Exp” represent the same sample in Metadata_Infant.blood.csv & Cytokine_Infant.blood.csv files.
+* Rows: Metabolites
+* The unit is uM.
+
+2 - Concentration_Infant.urine.csv
+* Columns: Samples. The same “Exp” represent the same sample in Metadata_Infant.urine.csv file.
+* Rows: Metabolites
+* The unit is uM.
+
+3 - Concentration_Maternal.blood.csv
+* Columns: Sample IDs. The same “Exp” represent the same sample in Concentration_Maternal.blood.csv, Cytokine_Infant.blood.csv, and Metadata_Maternal.blood.csv files.
+* Rows: Metabolites
+* The unit is uM.
+
+4 - Concentration_Maternal.placenta.csv
+* Columns: Samples. The same “Exp” represent the same sample in Metadata_Maternal.placenta.csv file.
+* Rows: Metabolites
+* The unit is uM.
+
+5 - Concentration_Maternal.urine.csv
+* Columns: Samples. The same “Exp” represent the same sample in Metadata_Maternal.urine.csv file.
+* Rows: Metabolites
+* The unit is uM.
+ +6 - Cortisol_Infant.blood.csv +* Variables +o Infant_ID: IDs of infants. +o Group: Lean or Obese. +o PD: Exact postnatal day (PD) when samples were collected. +o Mother_ID: IDs of mothers. +o Foster_ID: IDs of foster mothers. +o Infant_weight: Infant weights (kg) recorded at samples collection. +o samp1: Cortisol level from the 1st blood samples collected from infants at 11am, after being separated from mothers at 9am. +o samp2: Cortisol level from the 2nd blood samples collected 4 pm. After the blood collection, 500 ?g/kg dexamethasone was injected intramuscularly. +o samp3: Cortisol level from the 3rd blood samples collected at 8:30 am of the following day. After the blood collection, 2.5 IU of adrenocorticotropic hormone (ACTH) was injected intramuscularly. +o samp4: Cortisol level from the 4th blood samples collected 30 min after ACTH injection. +o pctsuppression: Values obtained by dividing samp3 by samp2. +* Missing data codes: Indicated by NAs. + +7 - Metadata_Infant.blood.csv +* Variables +o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Infant.blood.csv & Cytokine_Infant.blood.csv files. +o Infant_ID: IDs of infants. +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o PD: Exact postnatal day (PD) when samples were collected. +o Target_PD: Target PD for sample collection. +o Dilution_factor: Dilution factor used to prepare NMR samples. +o Mother_ID: IDs of mothers. +o GD_Delivery: Gestational day at delivery. +o Mode_birth: Mode of delivery. +o Fostered: Yes (fostered) or No (was not fostered). +o Foster_ID: IDs of foster mothers. +o Weight_PD7: Infant weights recorded at around PD7. +o ActualDay_PD7: Actual day for “Weight_PD7”. +o Infant_weight: Infant weights (kg) recorded at samples collection. +* Missing data codes: Indicated by NAs. + +8 - Metadata_Infant.urine.csv +* Variables +o Exp: Samples IDs. 
The same “Exp” represent the same sample in Concentration_Infant.urine.csv file. +o Infant_ID: IDs of infants. +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o PD: Exact postnatal day (PD) when samples were collected. +o Target_PD: Target PD for sample collection. +o Dilution_factor: Dilution factor used to prepare NMR samples. +o Mother_ID: IDs of mothers. +o GD_Delivery: Gestational day at delivery. +o Mode_birth: Mode of delivery. +o Fostered: Yes (fostered) or No (was not fostered). +o Foster_ID: IDs of foster mothers. +o Infant_weight: Infant weights (kg) recorded at samples collection. +* Missing data codes: Indicated by NAs. + +9 - Metadata_Maternal.blood.csv +* Variables +o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.blood.csv, Cytokine_Infant.blood.csv, and Metadata_Maternal.blood.csv files. +o Mother_ID: IDs of mothers. +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o GD: Exact gestational day (GD) when samples were collected. +o Target_GD: Target GD for sample collection. +o Dilution_factor: Dilution factor used to prepare NMR samples. +o BCS: Body Condition Score (BCS) +o Mother_Weight: Maternal weight in kg at sample collection. +o Mother_age: Maternal age at conception. +o GD_Delivery: Gestational day at delivery. +o Infant_sex: Infant sex. +o Mode_birth: Mode of delivery. +o Reject: Whether mothers rejected infants or not. +o Placenta_Width: Width of placenta at sample collection. +o Placenta_Height: Height of placenta at sample collection. +o Placenta_Thickness: Thickness of placenta at sample collection. +o EPV: Estimated placental volume. +* Missing data codes: Indicated by NAs. + +10 - Metadata_Maternal.urine.csv +* Variables +o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.urine.csv file. +o Mother_ID: IDs of mothers. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o GD: Exact gestational day (GD) when samples were collected. +o Target_GD: Target GD for sample collection. +o Dilution_factor: Dilution factor used to prepare NMR samples. +o BCS: Body Condition Score (BCS) +o Mother_Weight: Maternal weight in kg at sample collection. +o Mother_age: Maternal age at conception. +o GD_Delivery: Gestational day at delivery. +o Infant_sex: Infant sex. +o Mode_birth: Mode of delivery. +* Missing data codes: Indicated by NAs. + +11 - Metadata_Maternal.placenta.csv +* Variables +o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.placenta.csv file. +o Mother_ID: IDs of mothers. +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o GD: Exact gestational day (GD) when samples were collected. +o Target_GD: Target GD for sample collection. +o Dilution_factor: Dilution factor used to prepare NMR samples. +o BCS: Body Condition Score (BCS) +o Tissue_weight: Weight of placental tissue sample. +o V1: Volume of solvent used to extract (uL). Used to correct the metabolite concentration. +o V2: Volume of polar layer (methanol + water) collected (uL). Used to correct the metabolite concentration. +o V3: Buffer added to reconstitute the sample after freeze drying (uL). Used to correct the metabolite concentration. +* Missing data codes: Indicated by NAs. + +12 - Cytokine_Infant.blood +* Variables: +o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Infant.blood.csv & Metadata_Infant.blood.csv files. +o Infant_ID: IDs of infants. +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o PD: Exact postnatal day (PD) when samples were collected. +o Target_PD: Target PD for sample collection. +o Mother_ID: IDs of mothers. +o GD_Delivery: Gestational day at delivery. +o Mode_birth: Mode of delivery. 
+o Fostered: Yes (fostered) or No (was not fostered). +o Foster_ID: IDs of foster mothers. +o Weight_PD7: Infant weights recorded at around PD7. +o hsCPP, GM_CSF, IFN_g, IL_1b, IL_ra, IL_2, IL_4, IL_5, IL_6, IL_8, IL_10, IL_12.23_p40, IL_13, IL_15, +IL_17a, MCP_1, MIP_1b, sCD40L_38, TGFa, TNFa, VEGF, C_Peptide, GIP, Inflammatory markers. +o Insulin: Insulin level (pg/mL). +* Missing data codes: Indicated by NAs. +* Specialized formats or other abbreviations used: GD, gestational day; BCS, Body Condition Score; +GM-CSF, granulocyte-macrophage colony-stimulating factor; IFN- ?, interferon ?; TNF-?, tumor necrosis factor-?; +TGF-?, transforming growth factor-?; MCP-1, monocyte chemoattractant protein-1; MIP-1?, macrophage inflammatory protein-1?; +hs-CRP, high-sensitivity C-reactive protein; IL, interleukin. + +13 - Cytokine_Maternal.blood +* Variables: +o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.blood.csv, Cytokine_Infant.blood.csv, and Metadata_Maternal.blood.csv files. +o Mother_ID: IDs of mothers. +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o GD: Exact gestational day (GD) when samples were collected. +o Target_GD: Target GD for sample collection. +o BCS: Body Condition Score (BCS) +o hsCPP, GM_CSF, IFN_g, IL_1b, IL_ra, IL_2, IL_6, IL_8, IL_10, IL_12/23_p40, IL_13, IL_15, +IL_17a, MCP_1, MIP_1b, sCD40L, TGFa, VEGF, C_Peptide, GIP, PP_53, PYY_54: Inflammatory markers. +o Insulin: Insulin level (uU/mL) +* Missing data codes: Indicated by NAs. 
+* Specialized formats or other abbreviations used: GD, gestational day; BCS, Body Condition Score; +hs-CRP, high-sensitivity C-reactive protein; GM_CSF, granulocyte-macrophage colony-stimulating factor; +IFN-γ, interferon-γ; TNF-α, tumor necrosis factor-α; TGF-α, transforming growth factor-α; +MCP-1, monocyte chemoattractant protein-1; MIP-1β, macrophage inflammatory protein-1β; IL, interleukin; +IL-1ra, IL-1 receptor antagonist. + +14 - WB.infant.brain +* Variables +o Infant_ID: IDs of infants. +o Brain_region: Amygdala, Hippocampus, Hypothalamus, P.Cortex (= prefrontal cortex) +o Group: Lean or Obese. +o Akt, p.Akt, AMPK, p.AMPK, S6K, p.S6K: Normalized relative intensity levels. + +15 - HI_infant.behavior +* Variables +o Infant_ID: IDs of infants. +o Group: Lean or Obese. +o PD: Exact postnatal day (PD) when samples were collected. +o Mother_ID: IDs of mothers. +o Foster_ID: IDs of foster mothers. +o Infant_weight: Infant weights (kg) recorded at samples collection. +o pfscratch: Profile-Far (technician presented the left profile from ~1 m away from an infant in a cage) +o pnscratch: Profile-Near (presented left profile from ~0.3 m) +o sfscratch: Stare-Far (made direct eye contact with the animal from far) +o snscratch: Stare-Near (direct eye contact from near position) +* Missing data codes: Indicated by NAs. + +16 - VPC_infant_cognitive +* Variables +o Infant_ID: IDs of infants. +o Mother_ID: IDs of mothers. +o Foster_ID: IDs of foster mothers. +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o GD_Delivery: Gestational day at delivery. +o PCD: post-conception day. +o PxTx Total number Look RIGHT/LEFT FAM: Total number of looks at a familiar object that was placed right or left side. P stands for the number of problem (1 to 4) and T stands for the number of Trial (1 or 2). +o PxTx Total number Look RIGHT/LEFT NOVEL: Total number of looks at a novel object that was placed right or left side.
P stands for the number of problem (1 to 4) and T stands for the number of Trial (1 or 2). +o PxTx Total number Look Away: Total number of looks away from either of the object. P stands for the number of problem (1 to 4) and T stands for the number of Trial (1 or 2). +o no.looks_N: Total number of looks at novel object throughout the problems and trials. +o no.looks_F: Total number of looks at familiar object throughout the problems and trials. +o no.looks_N+F: Total number of looks at novel and familiar object throughout the problems and trials. +o no.looks_N/N+F: Novelty preference calculated as: number of fixations at the novel stimulus (no.looks_N)/number of fixations at both the novel and familiar stimulus (no.looks_N+F). + +17 - Gestational.weight.gain.rate +* Variables +o Mother_ID: IDs of mothers. +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) +o Group: Lean or Obese. +o Infant_ID: IDs of infants. +o Foster_ID: IDs of foster mothers. +o GWG: Gestational weight gain rate. +"""