From 7f64f023d134bffe53429e72e785bfeba94b6cba Mon Sep 17 00:00:00 2001
From: Kris Beicher <112945740+K-Beicher@users.noreply.github.com>
Date: Thu, 6 Mar 2025 13:55:09 +0100
Subject: [PATCH] feat: add scripts to fetch and process data

---
 poetry.lock                | 124 +++++++++++-------
 pyproject.toml             |   2 +
 scripts/convert-meta.py    | 188 +++++++++++++++++++++++++++
 scripts/convert-samples.py | 170 +++++++++++++++++++++++++
 scripts/download-data.py   | 255 +++++++++++++++++++++++++++++++++++++
 5 files changed, 692 insertions(+), 47 deletions(-)
 create mode 100644 scripts/convert-meta.py
 create mode 100644 scripts/convert-samples.py
 create mode 100644 scripts/download-data.py

diff --git a/poetry.lock b/poetry.lock
index 7578301..fdedad7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -319,6 +319,28 @@ files = [
     {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
 ]
 
+[[package]]
+name = "fastexcel"
+version = "0.13.0"
+description = "A fast excel file reader for Python, written in Rust"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "fastexcel-0.13.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:709f44440312ecd909856f9335d8224c1f3d57ea50a136ec5f88fab86fc153d7"},
+    {file = "fastexcel-0.13.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:40978c1aae4888db105332caac8bef9492953ffe8fd4f195e306d8a11464e5a6"},
+    {file = "fastexcel-0.13.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eeff6ab300b8affd9b59acc55af53c72f53c7d8cd28d2cd7071dd046d15dd34c"},
+    {file = "fastexcel-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d92bc6e8caa6685f2901b2efe173a5cc6e975b62bfc555844373415aee5a255a"},
+    {file = "fastexcel-0.13.0-cp39-abi3-win_amd64.whl", hash = "sha256:10297f6c8146691e9d0e6b22bc1b47bae49a522a8edd3150f19b4d5d3eef2a01"},
+    {file = "fastexcel-0.13.0.tar.gz", hash = "sha256:308112093a73fb5a1f1e1619df55b72c882ef26234de2e2b8bd76d1781c335e1"},
+]
+
+[package.dependencies]
+pyarrow = ">=8.0.0"
+
+[package.extras]
+pandas = ["pandas (>=1.4.4)"]
+polars = ["polars (>=0.16.14)"]
+
 [[package]]
 name = "fastparquet"
 version = "2024.11.0"
@@ -612,17 +634,6 @@ files = [
     {file = "ijson-3.3.0.tar.gz", hash = "sha256:7f172e6ba1bee0d4c8f8ebd639577bfe429dee0f3f96775a067b8bae4492d8a0"},
 ]
 
-[[package]]
-name = "iniconfig"
-version = "2.0.0"
-description = "brain-dead simple config-ini parsing"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
-    {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
-]
-
 [[package]]
 name = "isodate"
 version = "0.7.2"
@@ -1121,21 +1132,6 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a
 test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
 type = ["mypy (>=1.11.2)"]
 
-[[package]]
-name = "pluggy"
-version = "1.5.0"
-description = "plugin and hook calling mechanisms for python"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
-    {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
-]
-
-[package.extras]
-dev = ["pre-commit", "tox"]
-testing = ["pytest", "pytest-benchmark"]
-
 [[package]]
 name = "polars"
 version = "1.24.0"
@@ -1179,6 +1175,60 @@ timezone = ["tzdata"]
 xlsx2csv = ["xlsx2csv (>=0.8.0)"]
 xlsxwriter = ["xlsxwriter"]
 
+[[package]]
+name = "pyarrow"
+version = "19.0.1"
+description = "Python library for Apache Arrow"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "pyarrow-19.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69"},
+    {file = "pyarrow-19.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec"},
+    {file = "pyarrow-19.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad76aef7f5f7e4a757fddcdcf010a8290958f09e3470ea458c80d26f4316ae89"},
+    {file = "pyarrow-19.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d03c9d6f2a3dffbd62671ca070f13fc527bb1867b4ec2b98c7eeed381d4f389a"},
+    {file = "pyarrow-19.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:65cf9feebab489b19cdfcfe4aa82f62147218558d8d3f0fc1e9dea0ab8e7905a"},
+    {file = "pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:41f9706fbe505e0abc10e84bf3a906a1338905cbbcf1177b71486b03e6ea6608"},
+    {file = "pyarrow-19.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:c6cb2335a411b713fdf1e82a752162f72d4a7b5dbc588e32aa18383318b05866"},
+    {file = "pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90"},
+    {file = "pyarrow-19.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00"},
+    {file = "pyarrow-19.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0148bb4fc158bfbc3d6dfe5001d93ebeed253793fff4435167f6ce1dc4bddeae"},
+    {file = "pyarrow-19.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f24faab6ed18f216a37870d8c5623f9c044566d75ec586ef884e13a02a9d62c5"},
+    {file = "pyarrow-19.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:4982f8e2b7afd6dae8608d70ba5bd91699077323f812a0448d8b7abdff6cb5d3"},
+    {file = "pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6"},
+    {file = "pyarrow-19.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466"},
+    {file = "pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:80b2ad2b193e7d19e81008a96e313fbd53157945c7be9ac65f44f8937a55427b"},
+    {file = "pyarrow-19.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:ee8dec072569f43835932a3b10c55973593abc00936c202707a4ad06af7cb294"},
+    {file = "pyarrow-19.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5d1ec7ec5324b98887bdc006f4d2ce534e10e60f7ad995e7875ffa0ff9cb14"},
+    {file = "pyarrow-19.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ad4c0eb4e2a9aeb990af6c09e6fa0b195c8c0e7b272ecc8d4d2b6574809d34"},
+    {file = "pyarrow-19.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d383591f3dcbe545f6cc62daaef9c7cdfe0dff0fb9e1c8121101cabe9098cfa6"},
+    {file = "pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b4c4156a625f1e35d6c0b2132635a237708944eb41df5fbe7d50f20d20c17832"},
+    {file = "pyarrow-19.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:5bd1618ae5e5476b7654c7b55a6364ae87686d4724538c24185bbb2952679960"},
+    {file = "pyarrow-19.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e45274b20e524ae5c39d7fc1ca2aa923aab494776d2d4b316b49ec7572ca324c"},
+    {file = "pyarrow-19.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d9dedeaf19097a143ed6da37f04f4051aba353c95ef507764d344229b2b740ae"},
+    {file = "pyarrow-19.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ebfb5171bb5f4a52319344ebbbecc731af3f021e49318c74f33d520d31ae0c4"},
+    {file = "pyarrow-19.0.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a21d39fbdb948857f67eacb5bbaaf36802de044ec36fbef7a1c8f0dd3a4ab2"},
+    {file = "pyarrow-19.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:99bc1bec6d234359743b01e70d4310d0ab240c3d6b0da7e2a93663b0158616f6"},
+    {file = "pyarrow-19.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1b93ef2c93e77c442c979b0d596af45e4665d8b96da598db145b0fec014b9136"},
+    {file = "pyarrow-19.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:d9d46e06846a41ba906ab25302cf0fd522f81aa2a85a71021826f34639ad31ef"},
+    {file = "pyarrow-19.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c0fe3dbbf054a00d1f162fda94ce236a899ca01123a798c561ba307ca38af5f0"},
+    {file = "pyarrow-19.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:96606c3ba57944d128e8a8399da4812f56c7f61de8c647e3470b417f795d0ef9"},
+    {file = "pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f04d49a6b64cf24719c080b3c2029a3a5b16417fd5fd7c4041f94233af732f3"},
+    {file = "pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a9137cf7e1640dce4c190551ee69d478f7121b5c6f323553b319cac936395f6"},
+    {file = "pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:7c1bca1897c28013db5e4c83944a2ab53231f541b9e0c3f4791206d0c0de389a"},
+    {file = "pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8"},
+    {file = "pyarrow-19.0.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:b9766a47a9cb56fefe95cb27f535038b5a195707a08bf61b180e642324963b46"},
+    {file = "pyarrow-19.0.1-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:6c5941c1aac89a6c2f2b16cd64fe76bcdb94b2b1e99ca6459de4e6f07638d755"},
+    {file = "pyarrow-19.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd44d66093a239358d07c42a91eebf5015aa54fccba959db899f932218ac9cc8"},
+    {file = "pyarrow-19.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:335d170e050bcc7da867a1ed8ffb8b44c57aaa6e0843b156a501298657b1e972"},
+    {file = "pyarrow-19.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:1c7556165bd38cf0cd992df2636f8bcdd2d4b26916c6b7e646101aff3c16f76f"},
+    {file = "pyarrow-19.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:699799f9c80bebcf1da0983ba86d7f289c5a2a5c04b945e2f2bcf7e874a91911"},
+    {file = "pyarrow-19.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:8464c9fbe6d94a7fe1599e7e8965f350fd233532868232ab2596a71586c5a429"},
+    {file = "pyarrow-19.0.1.tar.gz", hash = "sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e"},
+]
+
+[package.extras]
+test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
+
 [[package]]
 name = "pydantic"
 version = "2.10.6"
@@ -1352,26 +1402,6 @@ engineering = ["unyt"]
 spark = ["pyspark"]
 test = ["hypothesis (>=4.4.0)", "interrogate", "pandas-vet", "polars", "py (>=1.10.0)", "pytest (>=3.4.2)", "pytest-cov", "pytest-xdist"]
 
-[[package]]
-name = "pytest"
-version = "8.3.5"
-description = "pytest: simple powerful testing with Python"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"},
-    {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"},
-]
-
-[package.dependencies]
-colorama = {version = "*", markers = "sys_platform == \"win32\""}
-iniconfig = "*"
-packaging = "*"
-pluggy = ">=1.5,<2"
-
-[package.extras]
-dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
-
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -2044,4 +2074,4 @@ docs = ["Sphinx", "elementpath (>=4.4.0,<5.0.0)", "jinja2", "sphinx-rtd-theme"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "e0a6ec98d0ae5e0b527c1115f243b741c2184b8f20b14c196ed4f914565627ef"
+content-hash = "012e3e32e40fecd537318715c06c627697764ac0f1120d98ca8f375ee1c3781a"
diff --git a/pyproject.toml b/pyproject.toml
index 75b7227..f77b3a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,8 @@ python = "^3.12"
 polars = "^1.24.0"
 pyjanitor = "^0.30.0"
 seedcase-sprout = {git = "https://github.com/seedcase-project/seedcase-sprout.git"}
+requests = "^2.32.3"
+fastexcel = "^0.13.0"
 
 [tool.poetry.group.dev.dependencies]
 ruff = "^0.6.2"
diff --git a/scripts/convert-meta.py b/scripts/convert-meta.py
new file mode 100644
index 0000000..d928e58
--- /dev/null
+++ b/scripts/convert-meta.py
@@ -0,0 +1,188 @@
+from pathlib import Path
+
+import janitor.polars
+import polars as pl
+
+# Set the folder path for raw-data
+resource_dir = Path(__file__).resolve().parent.parent
+folder_path = resource_dir / "data-raw"
+unzip_path = folder_path / "downloaded"
+
+# Create the infant metadata files
+# FIO All infants in meta_infant_ur are included in meta_infant_bl.
+# There is one extra infant in meta_infant_bl.
+df_meta_infant_bl = pl.read_csv(unzip_path / "metadata_infant_blood.csv")
+df_meta_infant_ur = pl.read_csv(unzip_path / "metadata_infant_urine.csv")
+# Infant-level columns, de-duplicated; GD_Delivery named as in data_adult_meta.csv
+df_infant = (
+    df_meta_infant_bl.select(
+        ["Infant_ID", "Mother_ID", "GD_Delivery", "Fostered", "Foster_ID"]
+    )
+    .rename({"GD_Delivery": "gestation_day_at_delivery"})
+    .unique()
+    .clean_names()
+    .write_csv(folder_path / "data_infant_meta.csv", separator=";")
+)
+# infant gender is in data_adult at present, all subjects are male,
+# females are not included in the study
+
+df_infant_w1 = (
+    df_meta_infant_bl.select(["Infant_ID", "Weight_PD7", "ActualDay_PD7"])
+    .rename({"Weight_PD7": "weight_at_x_days_old", "ActualDay_PD7": "x_days_old"})
+    .clean_names()
+)
+df_infant_w2 = (
+    df_meta_infant_ur.select(["Infant_ID", "Infant_weight", "PD"])
+    .rename({"Infant_weight": "weight_at_x_days_old", "PD": "x_days_old"})
+    .clean_names()
+)
+df_infant_weight = (
+    pl.concat([df_infant_w1, df_infant_w2])
+    .unique()
+    .clean_names()
+    .write_csv(folder_path / "data_infant_weight.csv", separator=";")
+)  # Concatenate (long)
+
+# Create the infant linking tables
+# Each sample row is tagged with the material it came from before stacking
+matter1 = "Blood"
+matter2 = "Urine"
+
+df_link_infant1 = df_meta_infant_bl.select(
+    ["Infant_ID", "Exp", "PD", "Batch"]
+).with_columns(pl.lit(matter1).alias("type_of_matter"))
+df_link_infant2 = df_meta_infant_ur.select(
+    ["Infant_ID", "Exp", "PD", "Batch"]
+).with_columns(pl.lit(matter2).alias("type_of_matter"))
+# write_csv returns None when given a path, so df_infant_link is not a DataFrame
+df_infant_link = (
+    pl.concat([df_link_infant1, df_link_infant2])  # Concatenate long
+    .rename({"PD": "day_sample_taken", "Exp": "infant_sample_id"})
+    .clean_names()
+    .write_csv(folder_path / "data_infant_sample_meta.csv", separator=";")
+)
+
+# Create the adult metadata files
+# FIO There are no additional Mother_ID in file 11
+df_metadata_maternal_bl = pl.read_csv(unzip_path / "metadata_maternal_blood.csv")
+df_metadata_maternal_ur = pl.read_csv(unzip_path / "metadata_maternal_urine.csv")
+df_metadata_maternal_pl = pl.read_csv(unzip_path / "metadata_maternal_placenta.csv")
+
+# Adult metadata file
+df_adult_meta1 = df_metadata_maternal_bl.select(
+    [
+        "Mother_ID",
+        "Mother_age",
+        "GD_Delivery",
+        "Group",
+        "Mode_birth",
+        "Reject",
+        "Infant_sex",
+    ]
+)
+df_adult_meta2 = df_metadata_maternal_ur.select(
+    [
+        "Mother_ID",
+        "Mother_age",
+        "GD_Delivery",
+        "Group",
+        "Mode_birth",
+        "Reject",
+        "Infant_sex",
+    ]
+)
+
+df_adult = (
+    pl.concat([df_adult_meta1, df_adult_meta2])  # Concatenate long
+    .rename(
+        {
+            "Mother_age": "Age_at_conception",
+            "GD_Delivery": "gestation_day_at_delivery",
+            "Group": "obesity_classification",
+            "Mode_birth": "mode_of_birth",
+        }
+    )
+    .clean_names()
+    .unique()
+    .write_csv(folder_path / "data_adult_meta.csv", separator=";")
+)
+# Adult weight file
+df_adult_weight1 = df_metadata_maternal_bl.select(
+    ["Mother_ID", "GD_day", "GD_targeted", "Mother_Weight", "BCS"]
+).rename(
+    {
+        "GD_day": "sample_gestation_day",
+        "GD_targeted": "target_gestation_day",
+        "Mother_Weight": "weight_at_gestation_day",
+        "BCS": "body_condition_score",
+    }
+)
+df_adult_weight2 = df_metadata_maternal_ur.select(
+    ["Mother_ID", "GD", "Target_GD", "Mother_Weight", "BCS"]
+).rename(
+    {
+        "GD": "sample_gestation_day",
+        "Target_GD": "target_gestation_day",
+        "Mother_Weight": "weight_at_gestation_day",
+        "BCS": "body_condition_score",
+    }
+)
+df_adult_weight = (
+    pl.concat([df_adult_weight1, df_adult_weight2])
+    .unique()
+    .write_csv(folder_path / "data_adult_weight.csv", separator=";")
+)  # Concatenate long
+
+# Create the adult linking tables
+# Stacks the blood, urine and placenta sample rows, each tagged by sample type
+# matter1 = "Blood" defined above in infant linking tables
+# matter2 = "Urine" defined above in infant linking tables
+matter3 = "Placenta"
+# The blood file names its day columns differently; align them before stacking
+df_link_adult1 = (
+    df_metadata_maternal_bl.select(
+        ["Mother_ID", "Exp", "GD_day", "GD_targeted", "Batch", "Dilution_factor"]
+    )
+    .rename({"GD_day": "GD", "GD_targeted": "Target_GD"})
+    .with_columns(pl.lit(matter1).alias("type_of_matter"))
+)
+df_link_adult2 = df_metadata_maternal_ur.select(
+    ["Mother_ID", "Exp", "GD", "Target_GD", "Batch", "Dilution_factor"]
+).with_columns(pl.lit(matter2).alias("type_of_matter"))
+df_link_adult3 = df_metadata_maternal_pl.select(
+    ["Mother_ID", "Exp", "GD", "Target_GD", "Batch", "Dilution_factor"]
+).with_columns(pl.lit(matter3).alias("type_of_matter"))
+# NOTE(review): no clean_names() here, unlike the infant table - confirm intended
+df_adult_link = (
+    pl.concat([df_link_adult1, df_link_adult2, df_link_adult3])  # Concatenate long
+    .rename(
+        {
+            "GD": "day_sample_taken",
+            "Target_GD": "target_sampling_day",
+            "Exp": "adult_sample_id",
+        }
+    )
+    .write_csv(folder_path / "data_adult_sample_meta.csv", separator=";")
+)
+
+# Create the placenta file
+# Both from metadata_maternal_bl (Placenta_Width,Placenta_Height,
+#   Placenta_Thickness,EPV) and from 11 (see below)
+# More than one measure in metadata_maternal_bl?
+"""
+11 - Metadata_Maternal_placenta.csv
+* Variables 
+o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.placenta.csv file.
+o Mother_ID: IDs of mothers. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o GD: Exact gestational day (GD) when samples were collected.
+o Target_GD: Target GD for sample collection. 
+o Dilution_factor: Dilution factor used to prepare NMR samples. 
+o BCS: Body Condition Score (BCS) 
+o Tissue_weight: Weight of placental tissue sample. 
+o V1: Volume of solvent used to extract (uL). Used to correct the metabolite concentration.
+o V2: Volume of polar layer (methanol + water) collected (uL). Used to correct the metabolite concentration.
+o V3: Buffer added to reconstitute the sample after freeze drying (uL). Used to correct the metabolite concentration. 
+* Missing data codes: Indicated by NAs.
+"""
diff --git a/scripts/convert-samples.py b/scripts/convert-samples.py
new file mode 100644
index 0000000..b6717d1
--- /dev/null
+++ b/scripts/convert-samples.py
@@ -0,0 +1,170 @@
+from pathlib import Path
+
+import janitor.polars
+import polars as pl
+import polars.selectors as cs
+
+# Set the folder path for raw-data
+resource_dir = Path(__file__).resolve().parent.parent
+folder_path = resource_dir / "data-raw"
+unzip_path = folder_path / "downloaded"
+
+# Create the metabolite data files
+
+
+# Transpose one file: "Metabolite" values become columns, old columns become rows
+def transpose_data(load_file_name: Path, save_file_name: Path, monkey_type: str):
+    """Reads a CSV file, transposes it, and writes the result to another CSV."""
+    pl.read_csv(unzip_path / load_file_name).transpose(
+        include_header=True, header_name=monkey_type, column_names="Metabolite"
+    ).write_csv(folder_path / save_file_name)
+
+
+# Find files and generate corresponding output filenames
+load_file_names = list(unzip_path.glob("concentration_*.csv"))
+save_file_names = [folder_path / f"data_{file.name}" for file in load_file_names]
+
+# Extract 'maternal' or 'infant' from the filenames; it names the sample id column
+monkey_types = [file.stem.split("_")[1] + "_sample_id" for file in load_file_names]
+for load, save, monkey in zip(load_file_names, save_file_names, monkey_types):
+    transpose_data(load, save, monkey)
+
+# Cortisol data
+df_cortisol_infant_bl = pl.read_excel(unzip_path / "cortisol_infant_blood.xlsx")
+
+df_cortisol = df_cortisol_infant_bl.unpivot(
+    ["samp1", "samp2", "samp3", "samp4"], index=["Infant_ID", "PD"]
+)
+# Hours since the 9 am separation: 11 am, 4 pm, 8:30 am next day, then 30 min later
+time_mapping = {"samp1": "02:00", "samp2": "07:00", "samp3": "23:30", "samp4": "24:00"}
+
+df_cortisol = (
+    df_cortisol.with_columns(
+        pl.col("variable").replace_strict(time_mapping).alias("hours_into_sampling")
+    )
+    .select(["Infant_ID", "PD", "hours_into_sampling", "value"])
+    .rename(
+        {
+            "value": "cortisol_level",
+            "Infant_ID": "infant_id",
+            "PD": "post_gestation_day",
+        }
+    )
+    .clean_names()
+    .write_csv(folder_path / "data_cortisol_infant_blood.csv")
+)
+# Cytokine data both sets
+df_cytokine_infant_bl = pl.read_csv(unzip_path / "cytokine_infant_blood.csv")
+df_cytokine_infant_bl = (
+    df_cytokine_infant_bl.drop(
+        [
+            "Group",
+            "PD",
+            "Target_PD",
+            "Mother_ID",
+            "GD_Delivery",
+            "Mode_birth",
+            "Fostered",
+            "Foster_ID",
+        ]
+    )
+    .clean_names()
+    .write_csv(folder_path / "data_cytokine_infant_bl.csv", separator=";")
+)
+
+df_cytokine_maternal_bl = pl.read_csv(unzip_path / "cytokine_maternal_blood.csv")
+
+df_cytokine_maternal_bl = (
+    df_cytokine_maternal_bl.drop(["Group", "GD", "Target_GD"])
+    .clean_names()
+    .write_csv(folder_path / "data_cytokine_maternal_blood.csv", separator=";")
+)
+# May want to have a closer look at 14 at some point
+
+# Human intruder testing
+df_infant_behavior = pl.read_excel(unzip_path / "hi_infant_behavior.xlsx")
+
+profile_mapping = {
+    "pfscratch": "Profile-Far",
+    "pnscratch": "Profile-Near",
+    "sfscratch": "Stare-Far",
+    "snscratch": "Stare-Near",
+}
+
+df_behavior = df_infant_behavior.unpivot(
+    ["pfscratch", "pnscratch", "sfscratch", "snscratch"], index=["Infant_ID", "PD"]
+)
+
+df_behavior = (
+    df_behavior.with_columns(
+        pl.col("variable")
+        .replace_strict(profile_mapping)
+        .alias("presentation_of_profile")
+    )
+    .select(["Infant_ID", "PD", "presentation_of_profile", "value"])
+    .rename(
+        {"value": "scratches", "Infant_ID": "infant_id", "PD": "post_gestation_day"}
+    )
+    .clean_names()
+    .write_csv(folder_path / "data_hi_infant_behavior.csv")
+)
+
+# Cognition testing of infants
+df_infant_cognitive = pl.read_excel(unzip_path / "vpc_infant_cognitive.xlsx")
+
+df_cognitive = df_infant_cognitive.unpivot(
+    [
+        "P1T1 Total number Look RIGHT FAM",
+        "P1T1 Total number Look LEFT NOVEL",
+        "P1T1 Total number Look Away",
+        "P1T2 Total number Look RIGHT NOVEL",
+        "P1T2 Total number Look LEFT FAM",
+        "P1T2 Total number Look Away",
+        "P2T1 Total number Look RIGHT NOVEL",
+        "P2T1 Total number Look LEFT FAM",
+        "P2T1 Total number Look Away",
+        "P2T2 Total number Look RIGHT FAM",
+        "P2T2 Total number Look LEFT NOVEL",
+        "P2T2 Total number Look Away",
+        "P3T1 Total number Look RIGHT NOVEL",
+        "P3T1 Total number Look LEFT FAM",
+        "P3T1 Total number Look Away",
+        "P3T2 Total number Look RIGHT FAM",
+        "P3T2 Total number Look LEFT NOVEL",
+        "P3T2 Total number Look Away",
+        "P4T1 Total number Look RIGHT FAM",
+        "P4T1 Total number Look LEFT NOVEL",
+        "P4T1 Total number Look Away",
+        "P4T2 Total number Look RIGHT NOVEL",
+        "P4T2 Total number Look LEFT FAM",
+        "P4T2 Total number Look Away",
+        "no.look_N",
+        "no.look_F",
+        "no.look_N+F",
+        "no.look_N/N+F",
+    ],
+    index=["Infant_ID", "GD_delivery", "Batch", "PCD"],
+)
+
+df_cognitive = (
+    df_cognitive.with_columns(pl.col("variable"))
+    .select(["Infant_ID", "GD_delivery", "Batch", "PCD", "variable", "value"])
+    .rename(
+        {
+            "value": "Looks",
+            "variable": "type_of_test",
+            "Infant_ID": "infant_id",
+            "GD_delivery": "gestation_day_at_delivery",
+            "PCD": "post_conception_day",
+        }
+    )
+    .clean_names()
+    .write_csv(folder_path / "data_vpc_infant_cognitive.csv")
+)
+
+# TO DO
+# 9 Meta maternal blood
+# Batch,Dilution_factor
+# 10 Meta maternal urine
+# Batch,Dilution_factor
+# Check if there are more weight measures to be found in 15
diff --git a/scripts/download-data.py b/scripts/download-data.py
new file mode 100644
index 0000000..8905155
--- /dev/null
+++ b/scripts/download-data.py
@@ -0,0 +1,255 @@
+"""Downloads a data file from a given URL.
+
+This is the most basic of the download data scripts. The URL of the Zenodo
+files archive is hard-coded in the script, and it assumes that a data-raw folder
+has been created containing a .gitignore file.
+"""
+
+import os
+from pathlib import Path
+from zipfile import ZipFile
+
+import requests
+
+resource_dir = Path(__file__).resolve().parent.parent
+folder_path = resource_dir / "data-raw"
+unzip_path = folder_path / "downloaded"
+unzip_path.mkdir(parents=True, exist_ok=True)  # writing the zip fails without it
+
+# Download the archive; fail on an HTTP error instead of saving an error page
+all_files = requests.get("https://zenodo.org/api/records/7055715/files-archive")
+all_files.raise_for_status()
+all_files_path = unzip_path / "all_files.zip"
+all_files_path.write_bytes(all_files.content)
+
+# Extract the zip file
+with ZipFile(all_files_path, "r") as zip_ref:
+    zip_ref.extractall(unzip_path)
+
+# Rename to snake_case; only dots in the stem are replaced, so re-runs are safe
+for file in unzip_path.iterdir():
+    if file.name.endswith((".csv", ".xlsx")):
+        new_name = file.stem.lower().replace(".", "_") + file.suffix
+        file.replace(unzip_path / new_name)
+
+"""
+1 - Concentration_Infant.blood.csv
+* Columns: Samples. The same “Exp” represent the same sample in Metadata_Infant.blood.csv & Cytokine_Infant.blood.csv files.
+* Rows: Metabolites
+* The unit is uM. 
+
+2 - Concentration_Infant.urine.csv
+* Columns: Samples. The same “Exp” represent the same sample in Metadata_Infant.urine.csv file.
+* Rows: Metabolites
+* The unit is uM. 
+
+3 - Concentration_Maternal.blood.csv
+* Columns: Sample IDs. The same “Exp” represent the same sample in Concentration_Maternal.blood.csv, Cytokine_Infant.blood.csv, and Metadata_Maternal.blood.csv files. 
+* Rows: Metabolites
+* The unit is uM. 
+
+4 - Concentration_Maternal.placenta.csv
+* Columns: Samples. The same “Exp” represent the same sample in Metadata_Maternal.placenta.csv file.
+* Rows: Metabolites
+* The unit is uM.
+
+5 - Concentration_Maternal.urine.csv
+* Columns: Samples. The same “Exp” represent the same sample in Metadata_Maternal.urine.csv file.
+* Rows: Metabolites
+* The unit is uM. 
+
+6 - Cortisol_Infant.blood.csv
+* Variables 
+o Infant_ID: IDs of infants. 
+o Group: Lean or Obese.
+o PD: Exact postnatal day (PD) when samples were collected.
+o Mother_ID: IDs of mothers. 
+o Foster_ID: IDs of foster mothers. 
+o Infant_weight: Infant weights (kg) recorded at samples collection. 
+o samp1: Cortisol level from the 1st blood samples collected from infants at 11am, after being separated from mothers at 9am. 
+o samp2: Cortisol level from the 2nd blood samples collected 4 pm. After the blood collection, 500 µg/kg dexamethasone was injected intramuscularly. 
+o samp3: Cortisol level from the 3rd blood samples collected at 8:30 am of the following day. After the blood collection, 2.5 IU of adrenocorticotropic hormone (ACTH) was injected intramuscularly.
+o samp4: Cortisol level from the 4th blood samples collected 30 min after ACTH injection. 
+o pctsuppression: Values obtained by dividing samp3 by samp2.
+* Missing data codes: Indicated by NAs.
+
+7 - Metadata_Infant.blood.csv
+* Variables 
+o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Infant.blood.csv & Cytokine_Infant.blood.csv files.
+o Infant_ID: IDs of infants. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o PD: Exact postnatal day (PD) when samples were collected.
+o Target_PD: Target PD for sample collection. 
+o Dilution_factor: Dilution factor used to prepare NMR samples. 
+o Mother_ID: IDs of mothers. 
+o GD_Delivery: Gestational day at delivery. 
+o Mode_birth: Mode of delivery.
+o Fostered: Yes (fostered) or No (was not fostered).
+o Foster_ID: IDs of foster mothers. 
+o Weight_PD7: Infant weights recorded at around PD7.
+o ActualDay_PD7: Actual day for “Weight_PD7”. 
+o Infant_weight: Infant weights (kg) recorded at samples collection. 
+* Missing data codes: Indicated by NAs.
+
+8 - Metadata_Infant.urine.csv
+* Variables 
+o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Infant.urine.csv file.
+o Infant_ID: IDs of infants. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o PD: Exact postnatal day (PD) when samples were collected.
+o Target_PD: Target PD for sample collection. 
+o Dilution_factor: Dilution factor used to prepare NMR samples. 
+o Mother_ID: IDs of mothers. 
+o GD_Delivery: Gestational day at delivery. 
+o Mode_birth: Mode of delivery.
+o Fostered: Yes (fostered) or No (was not fostered).
+o Foster_ID: IDs of foster mothers. 
+o Infant_weight: Infant weights (kg) recorded at samples collection. 
+* Missing data codes: Indicated by NAs.
+
+9 - Metadata_Maternal.blood.csv
+* Variables 
+o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.blood.csv, Cytokine_Infant.blood.csv, and Metadata_Maternal.blood.csv files.
+o Mother_ID: IDs of mothers. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o GD: Exact gestational day (GD) when samples were collected.
+o Target_GD: Target GD for sample collection. 
+o Dilution_factor: Dilution factor used to prepare NMR samples. 
+o BCS: Body Condition Score (BCS) 
+o Mother_Weight: Maternal weight in kg at sample collection. 
+o Mother_age: Maternal age at conception. 
+o GD_Delivery: Gestational day at delivery. 
+o Infant_sex: Infant sex.
+o Mode_birth: Mode of delivery.
+o Reject: Whether mothers rejected infants or not. 
+o Placenta_Width: Width of placenta at sample collection.
+o Placenta_Height: Height of placenta at sample collection.
+o Placenta_Thickness: Thickness of placenta at sample collection.
+o EPV: Estimated placental volume. 
+* Missing data codes: Indicated by NAs.
+
+10 - Metadata_Maternal.urine.csv
+* Variables 
+o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.urine.csv file.
+o Mother_ID: IDs of mothers. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o GD: Exact gestational day (GD) when samples were collected.
+o Target_GD: Target GD for sample collection. 
+o Dilution_factor: Dilution factor used to prepare NMR samples. 
+o BCS: Body Condition Score (BCS) 
+o Mother_Weight: Maternal weight in kg at sample collection. 
+o Mother_age: Maternal age at conception. 
+o GD_Delivery: Gestational day at delivery. 
+o Infant_sex: Infant sex.
+o Mode_birth: Mode of delivery.
+* Missing data codes: Indicated by NAs.
+
+11 - Metadata_Maternal.placenta.csv
+* Variables 
+o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.placenta.csv file.
+o Mother_ID: IDs of mothers. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o GD: Exact gestational day (GD) when samples were collected.
+o Target_GD: Target GD for sample collection. 
+o Dilution_factor: Dilution factor used to prepare NMR samples. 
+o BCS: Body Condition Score (BCS) 
+o Tissue_weight: Weight of placental tissue sample. 
+o V1: Volume of solvent used to extract (uL). Used to correct the metabolite concentration.
+o V2: Volume of polar layer (methanol + water) collected (uL). Used to correct the metabolite concentration.
+o V3: Buffer added to reconstitute the sample after freeze drying (uL). Used to correct the metabolite concentration. 
+* Missing data codes: Indicated by NAs.
+
+12 - Cytokine_Infant.blood
+* Variables: 
+o Exp: Sample IDs. The same “Exp” represents the same sample in the Concentration_Infant.blood.csv & Metadata_Infant.blood.csv files.
+o Infant_ID: IDs of infants. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o PD: Exact postnatal day (PD) when samples were collected.
+o Target_PD: Target PD for sample collection. 
+o Mother_ID: IDs of mothers.
+o GD_Delivery: Gestational day at delivery. 
+o Mode_birth: Mode of delivery.
+o Fostered: Yes (fostered) or No (was not fostered).
+o Foster_ID: IDs of foster mothers. 
+o Weight_PD7: Infant weights recorded at around PD7.
+o hsCPP, GM_CSF, IFN_g, IL_1b, IL_ra, IL_2, IL_4, IL_5, IL_6, IL_8, IL_10, IL_12.23_p40, IL_13, IL_15, 
+IL_17a, MCP_1, MIP_1b, sCD40L_38, TGFa, TNFa, VEGF, C_Peptide, GIP: Inflammatory markers. 
+o Insulin: Insulin level (pg/mL).
+* Missing data codes: Indicated by NAs.
+* Specialized formats or other abbreviations used: GD, gestational day; BCS, Body Condition Score; 
+GM-CSF, granulocyte-macrophage colony-stimulating factor; IFN-γ, interferon-γ; TNF-α, tumor necrosis factor-α; 
+TGF-α, transforming growth factor-α; MCP-1, monocyte chemoattractant protein-1; MIP-1β, macrophage inflammatory protein-1β; 
+hs-CRP, high-sensitivity C-reactive protein; IL, interleukin.
+
+13 - Cytokine_Maternal.blood
+* Variables: 
+o Exp: Sample IDs. The same “Exp” represents the same sample in the Concentration_Maternal.blood.csv, Cytokine_Infant.blood.csv, and Metadata_Maternal.blood.csv files.
+o Mother_ID: IDs of mothers. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o GD: Exact gestational day (GD) when samples were collected.
+o Target_GD: Target GD for sample collection. 
+o BCS: Body Condition Score (BCS) 
+o hsCPP, GM_CSF, IFN_g, IL_1b, IL_ra, IL_2, IL_6, IL_8, IL_10, IL_12/23_p40, IL_13, IL_15, 
+IL_17a, MCP_1, MIP_1b, sCD40L, TGFa, VEGF, C_Peptide, GIP, PP_53, PYY_54: Inflammatory markers. 
+o Insulin: Insulin level (uU/mL)
+* Missing data codes: Indicated by NAs.
+* Specialized formats or other abbreviations used: GD, gestational day; BCS, Body Condition Score; 
+hs-CRP, high-sensitivity C-reactive protein; GM_CSF, granulocyte-macrophage colony-stimulating factor; 
+IFN-γ, interferon-γ; TNF-α, tumor necrosis factor-α; TGF-α, transforming growth factor-α; 
+MCP-1, monocyte chemoattractant protein-1; MIP-1β, macrophage inflammatory protein-1β; IL, interleukin; 
+IL-1ra, IL-1 receptor antagonist.
+
+14 - WB.infant.brain
+* Variables
+o Infant_ID: IDs of infants.
+o Brain_region: Amygdala, Hippocampus, Hypothalamus, P.Cortex (= prefrontal cortex)
+o Group: Lean or Obese.
+o Akt, p.Akt, AMPK, p.AMPK, S6K, p.S6K: Normalized relative intensity levels.
+
+15 - HI_infant.behavior
+* Variables 
+o Infant_ID: IDs of infants. 
+o Group: Lean or Obese.
+o PD: Exact postnatal day (PD) when samples were collected.
+o Mother_ID: IDs of mothers. 
+o Foster_ID: IDs of foster mothers. 
+o Infant_weight: Infant weights (kg) recorded at sample collection. 
+o pfscratch: Profile-Far (technician presented the left profile from ~1 m away from an infant in a cage)
+o pnscratch: Profile-Near (presented left profile from ~0.3 m)
+o sfscratch: Stare-Far (made direct eye contact with the animal from far)
+o snscratch: Stare-Near (direct eye contact from near position)
+* Missing data codes: Indicated by NAs.
+
+16 - VPC_infant_cognitive
+* Variables 
+o Infant_ID: IDs of infants. 
+o Mother_ID: IDs of mothers. 
+o Foster_ID: IDs of foster mothers. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o GD_Delivery: Gestational day at delivery. 
+o PCD: post-conception day.
+o PxTx Total number Look RIGHT/LEFT FAM: Total number of looks at a familiar object placed on the right or left side. P stands for the problem number (1 to 4) and T stands for the trial number (1 or 2). 
+o PxTx Total number Look RIGHT/LEFT NOVEL: Total number of looks at a novel object placed on the right or left side. P stands for the problem number (1 to 4) and T stands for the trial number (1 or 2).
+o PxTx Total number Look Away: Total number of looks away from either object. P stands for the problem number (1 to 4) and T stands for the trial number (1 or 2).
+o no.looks_N: Total number of looks at novel object throughout the problems and trials. 
+o no.looks_F: Total number of looks at familiar object throughout the problems and trials.
+o no.looks_N+F: Total number of looks at novel and familiar object throughout the problems and trials.
+o no.looks_N/N+F: Novelty preference calculated as: number of fixations at the novel stimulus (no.looks_N)/number of fixations at both the novel and familiar stimulus (no.looks_N+F).
+
+17 - Gestational.weight.gain.rate
+* Variables 
+o Mother_ID: IDs of mothers. 
+o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
+o Group: Lean or Obese.
+o Infant_ID: IDs of infants. 
+o Foster_ID: IDs of foster mothers. 
+o GWG: Gestational weight gain rate. 
+"""