In [24]:
import json
import glob
import os

def fetch_all_keys(json_path):
    """Recursively fetch all keys from a JSON file."""
    keys = set()

    def _recursive_extract(obj, prefix=""):
        if isinstance(obj, dict):
            for k, v in obj.items():
                full_key = f"{prefix}.{k}" if prefix else k
                keys.add(full_key)
                _recursive_extract(v, prefix=full_key)
        elif isinstance(obj, list):
            for item in obj:
                _recursive_extract(item, prefix=prefix)

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        _recursive_extract(data)

    return keys

# 🔥 Dynamically fetch keys from all MODEL_PROVENANCE run summaries
all_json_files = glob.glob("MODEL_PROVENANCE/*/*_run_summary.json")

collected_keys = set()
for json_file in all_json_files:
    keys = fetch_all_keys(json_file)
    collected_keys.update(keys)

# ✅ Now build the mapping
mapping = {key: {"@id": key} for key in collected_keys}

# Special case for timestamps
if "start_time" in mapping:
    mapping["start_time"]["@id"] = "prov:startedAtTime"
    mapping["start_time"]["@type"] = "xsd:dateTime"
if "end_time" in mapping:
    mapping["end_time"]["@id"] = "prov:endedAtTime"
    mapping["end_time"]["@type"] = "xsd:dateTime"

# 🔥 Save mapping dynamically
os.makedirs("mappings", exist_ok=True)
with open("mappings/full_mapping.json", "w", encoding="utf-8") as f:
    json.dump(mapping, f, indent=2)

print(f"✅ Full dynamic mapping file created: mappings/full_mapping.json with {len(mapping)} fields!")


✅ Full dynamic mapping file created: mappings/full_mapping.json with 149 fields!


In [25]:
import os
import glob
import json
from datetime import datetime, timezone
from rdflib import Graph

import os
def iso8601(ms):
    """Convert milliseconds since epoch to ISO8601 UTC."""
    return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).isoformat()
# Load the context mapping
with open("mappings/full_mapping.json", "r", encoding="utf-8") as f:
    ctx = json.load(f)

# Loop through your run_summary files
for json_path in glob.glob("MODEL_PROVENANCE/*/*_run_summary.json"):
    basename   = os.path.basename(json_path)
    model_name = basename.rsplit("_run_summary.json", 1)[0]

    with open(json_path, "r", encoding="utf-8") as f:
        summary = json.load(f)

    doc = {
        "@context": ctx,
        "run_id": summary.get("run_id", ""),
        "run_name": summary.get("run_name", ""),
        "experiment_id": summary.get("experiment_id", ""),
        "params": summary.get("params", {}),
        "metrics": summary.get("metrics", {}),
        "artifacts": summary.get("artifacts", []),
        "tags": summary.get("tags", {}),
        "start_time": iso8601(summary["start_time"])
    }

    if summary.get("end_time") is not None:
        doc["end_time"] = iso8601(summary["end_time"])

    doc["used"] = summary.get("tags", {}).get("dataset_uri") or []
    doc["generated"] = [
        art.get("uri") or art.get("path")
        for art in summary.get("artifacts", [])
    ]

    # Save .jsonld
    out_jsonld = os.path.join("MODEL_PROVENANCE", model_name, f"{model_name}.jsonld")
    with open(out_jsonld, "w", encoding="utf-8") as f:
        json.dump(doc, f, indent=2)

    # Save .ttl
    g = Graph().parse(data=json.dumps(doc), format="json-ld")
    out_ttl = os.path.join("MODEL_PROVENANCE", model_name, f"{model_name}.ttl")
    g.serialize(destination=out_ttl, format="turtle")

    print(f"✅ Converted {basename} → {os.path.basename(out_jsonld)}, {os.path.basename(out_ttl)}")


✅ Converted RandomForest_Iris_v20250425_121328_run_summary.json → RandomForest_Iris_v20250425_121328.jsonld, RandomForest_Iris_v20250425_121328.ttl
✅ Converted RandomForest_Iris_v20250425_125653_run_summary.json → RandomForest_Iris_v20250425_125653.jsonld, RandomForest_Iris_v20250425_125653.ttl
✅ Converted RandomForest_Iris_v20250425_131407_run_summary.json → RandomForest_Iris_v20250425_131407.jsonld, RandomForest_Iris_v20250425_131407.ttl
✅ Converted RandomForest_Iris_v20250425_132526_run_summary.json → RandomForest_Iris_v20250425_132526.jsonld, RandomForest_Iris_v20250425_132526.ttl
✅ Converted RandomForest_Iris_v20250425_135553_run_summary.json → RandomForest_Iris_v20250425_135553.jsonld, RandomForest_Iris_v20250425_135553.ttl
✅ Converted RandomForest_Iris_v20250425_135900_run_summary.json → RandomForest_Iris_v20250425_135900.jsonld, RandomForest_Iris_v20250425_135900.ttl


In [19]:
import os
import json
import glob
import pandas as pd
from rdflib import Graph

# ---------- Helper functions -------------

def load_as_dict(path):
    """Load a JSON or JSON-LD/Turtle file as dictionary."""
    if path.endswith((".ttl", ".turtle")):
        g = Graph()
        g.parse(path, format="turtle")
        return json.loads(g.serialize(format="json-ld", indent=2))
    else:
        with open(path, encoding="utf-8") as f:
            return json.load(f)

def compare_json(a, b, path=""):
    """Recursively compare two JSON structures."""
    diffs = []
    if isinstance(a, dict) and isinstance(b, dict):
        a = {k: v for k, v in a.items() if k != "@context"}
        b = {k: v for k, v in b.items() if k != "@context"}
        all_keys = set(a) | set(b)
        for k in all_keys:
            new_path = f"{path}/{k}" if path else k
            if k not in a:
                diffs.append({"path": new_path, "type": "added", "a": None, "b": b[k]})
            elif k not in b:
                diffs.append({"path": new_path, "type": "removed", "a": a[k], "b": None})
            else:
                diffs.extend(compare_json(a[k], b[k], new_path))
    elif isinstance(a, list) and isinstance(b, list):
        for i, (ia, ib) in enumerate(zip(a, b)):
            diffs.extend(compare_json(ia, ib, f"{path}[{i}]"))
        if len(a) < len(b):
            for i in range(len(a), len(b)):
                diffs.append({"path": f"{path}[{i}]", "type": "added", "a": None, "b": b[i]})
        elif len(a) > len(b):
            for i in range(len(b), len(a)):
                diffs.append({"path": f"{path}[{i}]", "type": "removed", "a": a[i], "b": None})
    else:
        if a != b:
            diffs.append({"path": path, "type": "changed", "a": a, "b": b})
    return diffs

# ---------- Main comparison -------------

# 1. Scan for all run folders
base_dir = "MODEL_PROVENANCE"
runs = [d for d in glob.glob(os.path.join(base_dir, "*")) if os.path.isdir(d)]

# 2. Compare only JSON vs JSON-LD
all_diffs = []

for run_dir in runs:
    model_name = os.path.basename(run_dir)
    
    json_path = os.path.join(run_dir, f"{model_name}_run_summary.json")
    jsonld_path = os.path.join(run_dir, f"{model_name}.jsonld")

    if os.path.exists(json_path) and os.path.exists(jsonld_path):
        try:
            json_obj = load_as_dict(json_path)
            jsonld_obj = load_as_dict(jsonld_path)

            diffs = compare_json(json_obj, jsonld_obj)
            if diffs:
                print(f"\n🔎 Differences for {model_name}: {len(diffs)} differences found")
                all_diffs.extend(diffs)
            else:
                print(f"✅ {model_name}: No differences detected")
        
        except Exception as e:
            print(f"❌ Error comparing {model_name}: {e}")

    else:
        print(f"⚠️ Missing files in {model_name}: Skipping.")

# 3. Summarize if needed
if all_diffs:
    df_diffs = pd.DataFrame(all_diffs)
    print("\nSummary of all differences:")
    print(df_diffs['type'].value_counts())
else:
    print("\n🎉 All JSON and JSON-LD files match perfectly!")




🔎 Differences for RandomForest_Iris_v20250425_121328: 4 differences found

🔎 Differences for RandomForest_Iris_v20250425_125653: 5 differences found

🔎 Differences for RandomForest_Iris_v20250425_131407: 4 differences found

🔎 Differences for RandomForest_Iris_v20250425_132526: 4 differences found

🔎 Differences for RandomForest_Iris_v20250425_135553: 5 differences found

🔎 Differences for RandomForest_Iris_v20250425_135900: 5 differences found

Summary of all differences:
type
added      12
removed     9
changed     6
Name: count, dtype: int64


In [21]:
import json
import glob
import os

def fetch_all_keys(json_path):
    """Recursively fetch all keys from a JSON file."""
    keys = set()

    def _recursive_extract(obj, prefix=""):
        if isinstance(obj, dict):
            for k, v in obj.items():
                full_key = f"{prefix}.{k}" if prefix else k
                keys.add(full_key)
                _recursive_extract(v, prefix=full_key)
        elif isinstance(obj, list):
            for item in obj:
                _recursive_extract(item, prefix=prefix)

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        _recursive_extract(data)

    return keys

def create_mapping_from_keys(keys):
    """Create a simple mapping where each key maps to itself, with special rules for timestamps."""
    mapping = {}

    for key in sorted(keys):
        if "." not in key:
            # Top-level fields
            if key in ["start_time", "end_time"]:
                mapping[key] = {
                    "@id": f"prov:{'startedAtTime' if key == 'start_time' else 'endedAtTime'}",
                    "@type": "xsd:dateTime"
                }
            elif key in ["run_id", "run_name", "experiment_id"]:
                mapping[key] = {"@id": key}
            else:
                mapping[key] = {"@id": key}
        else:
            # Nested fields
            mapping[key] = {"@id": key}

    # Attach namespaces
    mapping["@context"] = {
        "prov": "http://www.w3.org/ns/prov#",
        "xsd":  "http://www.w3.org/2001/XMLSchema#"
    }

    return mapping

# --- Main execution ---

# 1. Fetch keys from all JSONs
all_keys = set()
for json_path in glob.glob("MODEL_PROVENANCE/*/*_run_summary.json"):
    keys = fetch_all_keys(json_path)
    all_keys.update(keys)

# 2. Create mapping
mapping_dict = create_mapping_from_keys(all_keys)

# 3. Save mapping
output_dir = "mapping_files"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "dynamic_mapping.json")

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(mapping_dict, f, indent=2)

print(f"✅ Mapping file created: {output_path}")


✅ Mapping file created: mapping_files\dynamic_mapping.json


In [22]:
import json
import os
import glob
from rdflib import Graph
from datetime import datetime, timezone

# === Utility functions ===

def iso8601(ms):
    """Convert milliseconds since epoch to ISO8601 UTC."""
    return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).isoformat()

def load_mapping(mapping_path="mapping_files/dynamic_mapping.json"):
    """Load dynamic mapping file."""
    with open(mapping_path, "r", encoding="utf-8") as f:
        return json.load(f)

def map_json_fields(summary, mapping):
    """Apply dynamic field mapping to summary dict."""
    doc = {"@context": mapping["@context"]}

    for key, map_info in mapping.items():
        if key == "@context":
            continue

        value = get_nested(summary, key)
        if value is not None:
            mapped_key = map_info["@id"]
            # Apply ISO8601 if type is datetime
            if map_info.get("@type") == "xsd:dateTime":
                value = iso8601(value)
            doc[mapped_key] = value

    return doc

def get_nested(data, dotted_key):
    """Safely get nested keys like artifacts.uri."""
    parts = dotted_key.split(".")
    for part in parts:
        if isinstance(data, dict):
            data = data.get(part)
        else:
            return None
    return data

# === Main execution ===

# Load the dynamic mapping
mapping = load_mapping()

# Process all summaries
all_json_files = glob.glob("MODEL_PROVENANCE/*/*_run_summary.json")

for json_path in all_json_files:
    base_dir = os.path.dirname(json_path)
    basename = os.path.basename(json_path)
    model_name = basename.replace("_run_summary.json", "")

    # Load JSON
    with open(json_path, "r", encoding="utf-8") as f:
        summary = json.load(f)

    # Map using dynamic mapping
    jsonld_doc = map_json_fields(summary, mapping)

    # Save as .jsonld
    jsonld_path = os.path.join(base_dir, f"{model_name}.jsonld")
    with open(jsonld_path, "w", encoding="utf-8") as f:
        json.dump(jsonld_doc, f, indent=2)

    # Convert to .ttl
    g = Graph()
    g.parse(data=json.dumps(jsonld_doc), format="json-ld")
    ttl_path = os.path.join(base_dir, f"{model_name}.ttl")
    g.serialize(destination=ttl_path, format="turtle")

    print(f"✅ {model_name}: JSON-LD and TTL generated")

print("🚀 All model runs converted successfully!")


✅ RandomForest_Iris_v20250425_121328: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_125653: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_131407: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_132526: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_135553: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_135900: JSON-LD and TTL generated
🚀 All model runs converted successfully!


In [23]:
import json
from rdflib import Graph

def convert_jsonld_to_rdfxml(jsonld_path, rdfxml_out_path):
    # 1. Load your JSON-LD
    with open(jsonld_path, "r", encoding="utf-8") as f:
        jsonld_data = json.load(f)

    # 2. Parse it into an RDF Graph
    g = Graph()
    g.parse(data=json.dumps(jsonld_data), format="json-ld")

    # 3. Serialize it into RDF/XML
    g.serialize(destination=rdfxml_out_path, format="xml")

    print(f"✅ RDF/XML written to {rdfxml_out_path}")

# Example usage
convert_jsonld_to_rdfxml(
    "MODEL_PROVENANCE/RandomForest_Iris_v20250425_135900/RandomForest_Iris_v20250425_135900.jsonld",
    "MODEL_PROVENANCE/RandomForest_Iris_v20250425_135900/RandomForest_Iris_v20250425_135900.rdf"
)


✅ RDF/XML written to MODEL_PROVENANCE/RandomForest_Iris_v20250425_135900/RandomForest_Iris_v20250425_135900.rdf


In [28]:
import os
import glob
import json

def fetch_all_keys(json_path):
    """Recursively fetch all JSON keys in dot notation."""
    keys = set()

    def _recursive_extract(obj, prefix=""):
        if isinstance(obj, dict):
            for k, v in obj.items():
                full_key = f"{prefix}.{k}" if prefix else k
                keys.add(full_key)
                _recursive_extract(v, prefix=full_key)
        elif isinstance(obj, list):
            for item in obj:
                _recursive_extract(item, prefix=prefix)

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        _recursive_extract(data)

    return keys

# 🔥 Fetch all keys from MODEL_PROVENANCE
all_json_files = glob.glob("MODEL_PROVENANCE/*/*_run_summary.json")
collected_keys = set()

for json_file in all_json_files:
    keys = fetch_all_keys(json_file)
    collected_keys.update(keys)

print(f"🔎 Found {len(collected_keys)} unique fields across {len(all_json_files)} run summaries.")

# ✨ Auto-build the mapping: JSON key ➔ RDF property
mapping = {}

for key in sorted(collected_keys):
    rdf_key = key.replace(".", "_")  # replace dot with underscore
    mapping[key] = f"prov:{rdf_key}"

# 📂 Save to file
os.makedirs("mappings", exist_ok=True)
mapping_file = os.path.join("mappings", "json_to_rdf_mapping.json")

with open(mapping_file, "w", encoding="utf-8") as f:
    json.dump(mapping, f, indent=2)

print(f"✅ Mapping created and saved at: {mapping_file}")


🔎 Found 149 unique fields across 6 run summaries.
✅ Mapping created and saved at: mappings\json_to_rdf_mapping.json


In [29]:
import json
import glob
import os
from rdflib import Graph, URIRef, Literal, Namespace, RDF
from rdflib.namespace import XSD

# Load your dynamic JSON ➔ RDF mapping
with open("mappings/json_to_rdf_mapping.json", "r", encoding="utf-8") as f:
    field_mapping = json.load(f)

prov = Namespace("http://www.w3.org/ns/prov#")

def flatten_json(obj, parent_key=''):
    """Flatten nested JSON with dot notation."""
    items = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            new_key = f"{parent_key}.{k}" if parent_key else k
            items.extend(flatten_json(v, new_key))
    elif isinstance(obj, list):
        for i, v in enumerate(obj):
            new_key = f"{parent_key}[{i}]"
            items.extend(flatten_json(v, new_key))
    else:
        items.append((parent_key, obj))
    return items

def create_rdf_from_json(json_path):
    """Given a run_summary JSON file, create RDF/XML."""
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Flatten the JSON
    flat_data = dict(flatten_json(data))

    # Build RDF graph
    g = Graph()
    g.bind("prov", prov)

    # Create a blank subject (could also use the run_id if you want)
    subj = URIRef(f"urn:uuid:{data.get('run_id', 'unknown-run')}")

    for key, value in flat_data.items():
        if key in field_mapping:
            pred = URIRef(field_mapping[key].replace("prov:", str(prov)))
            if isinstance(value, (int, float)):
                obj = Literal(value)
            else:
                obj = Literal(str(value))
            g.add((subj, pred, obj))
        else:
            # Keys that don't have mapping: (skip or warn)
            pass

    return g

# 🔥 Process all run summaries
os.makedirs("rdf_exports", exist_ok=True)

for json_file in glob.glob("MODEL_PROVENANCE/*/*_run_summary.json"):
    model_name = os.path.basename(json_file).replace("_run_summary.json", "")

    rdf_graph = create_rdf_from_json(json_file)

    # Save as RDF/XML
    out_path = os.path.join("rdf_exports", f"{model_name}.rdf")
    rdf_graph.serialize(destination=out_path, format="xml")

    print(f"✅ RDF/XML created: {out_path}")


✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_121328.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_125653.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_131407.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_132526.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_135553.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_135900.rdf


In [37]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [1 lines of output]
  ERROR: Can not execute `setup.py` since setuptools is not available in the build environment.
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

Encountered error while generating package metadata.

See above for output.

note: This is an issue with the package mentioned above, not pip.
hint: See above for details.


In [34]:
!pip install --upgrade pip setuptools
!pip install python-docx


Collecting pip
  Obtaining dependency information for pip from https://files.pythonhosted.org/packages/c9/bc/b7db44f5f39f9d0494071bddae6880eb645970366d0a200022a1a93d57f5/pip-25.0.1-py3-none-any.whl.metadata
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Obtaining dependency information for setuptools from https://files.pythonhosted.org/packages/0d/6d/b4752b044bf94cb802d88a888dc7d288baaf77d7910b7dedda74b5ceea0c/setuptools-79.0.1-py3-none-any.whl.metadata
  Downloading setuptools-79.0.1-py3-none-any.whl.metadata (6.5 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB 2.0 MB/s eta 0:00:01
   --------- ------------------------------ 0.5/1.8 MB 7.1 MB/s eta 0:00:01
   ------------------------ --------------- 1.1/1.8 MB 10.2 MB/s eta 0:00:01
   ---------------------------------------  1.8/1.8 MB 11.7 MB/s eta 0:00:01
   -----------

ERROR: To modify pip, please run the following command:
C:\Users\reema\anaconda3\python.exe -m pip install --upgrade pip setuptools


Collecting python-docx
  Obtaining dependency information for python-docx from https://files.pythonhosted.org/packages/3e/3d/330d9efbdb816d3f60bf2ad92f05e1708e4a1b9abe80461ac3444c83f749/python_docx-1.1.2-py3-none-any.whl.metadata
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
   ---------------------------------------- 0.0/244.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/244.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/244.3 kB ? eta -:--:--
   ----- ---------------------------------- 30.7/244.3 kB ? eta -:--:--
   -------------------------------------- - 235.5/244.3 kB 4.8 MB/s eta 0:00:01
   ---------------------------------------- 244.3/244.3 kB 3.0 MB/s eta 0:00:00
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2




In [36]:
from docx import Document
from fpdf import FPDF

# Create Word Document
doc = Document()
doc.add_heading('Reema Dass', 0)
doc.add_paragraph('Data Scientist | Data Engineer | Data Analyst\n')
doc.add_paragraph('Email: reema.g.dass@gmail.com | Phone: +43 676 7317181 | Location: Vienna, Austria')
doc.add_paragraph('LinkedIn | GitHub | Website\n')

doc.add_heading('Profile Summary:', level=1)
doc.add_paragraph(
    "Master's student in Data Science with four years of professional experience in data engineering, "
    "cloud computing, and machine learning applications. Skilled at transforming complex data into actionable "
    "insights through innovative, scalable solutions. Passionate about solving real-world problems using sophisticated "
    "analytics and driving meaningful impact across organizations."
)

doc.add_heading('Education:', level=1)
doc.add_paragraph('Master\'s in Data Science\nTechnical University of Vienna, Austria | Mar 2022 – Present\n- Majoring in Machine Learning and Visual Analytics')
doc.add_paragraph('Bachelor\'s in Computer Science\nVisvesvaraya Technological University (VTU), India | Sep 2013 – Dec 2017\n- Thesis: Image Processing for Medical Disease Diagnosis')

doc.add_heading('Professional Experience:', level=1)
experiences = [
    ("Data Science Research Intern (FFG-Femtech)", "SBA Research, Vienna | Sep 2023 – Feb 2024",
     "- Investigated security improvements for edge-deployed Deep Neural Networks (DNNs), proposing alternative obfuscation techniques that reduced vulnerability risk by 10%.\n"
     "- Evaluated Trusted Execution Environments (TEEs) and encryption methods to validate and enhance data security.\n"
     "- Developed obfuscation methods that decreased reverse engineering risk by 25%."),
    ("Software Developer", "Nextpart, Linz | Nov 2022 – Jul 2023",
     "- Enhanced a malicious analysis tool, improving threat detection accuracy by 20%.\n"
     "- Integrated external software connectors, boosting issue identification and evaluation by 30%.\n"
     "- Conducted vulnerability testing, increasing overall system resilience."),
    ("Cloud Consultant", "Deloitte, India | Aug 2021 – Feb 2022",
     "- Designed and implemented PII redaction and anonymization solutions, reducing exposure risk by 85%.\n"
     "- Engineered optimized AWS Fargate data pipelines, achieving a 30% processing speed increase and 20% cost reduction.\n"
     "- Collaborated with cross-functional teams to attain a 95% data privacy compliance rate."),
    ("Data Engineer", "Stats Perform, India | Mar 2020 – Jul 2021",
     "- Processed and analyzed over 10 TB of sports analytics data using AWS Glue, Spark, Redshift, and DynamoDB.\n"
     "- Integrated AWS services with external APIs, enhancing pipeline scalability by 60% and cutting data retrieval time by 35%.\n"
     "- Built resilient, dynamic-load handling ETL pipelines, improving system uptime by 25%."),
    ("Full-Stack Developer", "Infosys Pvt Ltd, India | Dec 2018 – Feb 2020",
     "- Led backend development for Belgium Post, reducing query response time by 50% and improving storage efficiency by 40% using Azure Blob Storage, SQL, and C#.\n"
     "- Developed dynamic, multilingual Angular UI to enhance user experience and accessibility.")
]
for role, company, details in experiences:
    doc.add_paragraph(f'{role}\n{company}\n{details}')

doc.add_heading('Skills:', level=1)
doc.add_paragraph('- Programming: Python (Pandas, NumPy), R, SQL, Scala\n'
                  '- Machine Learning: TensorFlow, scikit-learn, PyTorch, Keras, Random Forests, SVM, Gradient Boosting, Time Series Analysis, Hypothesis Testing\n'
                  '- Data Engineering: Hadoop, Spark, Kafka, MongoDB, Redshift, AWS Glue, S3\n'
                  '- Visualization: Tableau, Power BI, Excel, ggplot2, Matplotlib, Plotly\n'
                  '- Cloud Platforms: AWS (S3, Glue, EMR, SageMaker), Azure (Data Factory, Machine Learning)\n'
                  '- DevOps: Docker, Kubernetes')

doc.add_heading('Certifications:', level=1)
doc.add_paragraph('- AWS Certified Cloud Practitioner\n'
                  '- Microsoft Certified: Azure Developer Associate\n'
                  '- English (C1 Level)\n'
                  '- German (B1 Level)')

doc.add_heading('Projects:', level=1)
projects = [
    ("Testing Identity Theft",
     "Tools: Transfer Learning, Record Linkage, Differential Privacy, ARX Data Anonymization\n- Conducted ethical Record Linkage attacks to evaluate data vulnerability."),
    ("Reinforcement Learning-based Breakout Game",
     "Tools: Pygame, reward/penalty tuning, evaluated across thousands of iterations."),
    ("Sentiment Analysis on Twitter Feeds",
     "Tools: NLP (BERT, GPT), Tableau, NLTK, Python."),
    ("Influenza Prediction",
     "Tools: Pandas, NumPy, Scikit-learn, Matplotlib, Seaborn.")
]
for project, desc in projects:
    doc.add_paragraph(f'{project}\n{desc}')

doc.add_heading('Languages:', level=1)
doc.add_paragraph('- English (Fluent - C1)\n- German (Intermediate - B1)')

# Save Word Document
word_path = '/mnt/data/Reema_Dass_CV_Enhanced.docx'
doc.save(word_path)

# Create PDF
class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 16)
        self.cell(0, 10, 'Reema Dass', ln=True, align='C')
        self.set_font('Arial', '', 12)
        self.cell(0, 10, 'Data Scientist | Data Engineer | Data Analyst', ln=True, align='C')
        self.ln(10)

pdf = PDF()
pdf.add_page()
pdf.set_font("Arial", size=12)

sections = [
    ("Profile Summary", "Master's student in Data Science with four years of professional experience in data engineering, cloud computing, and machine learning applications. Skilled at transforming complex data into actionable insights through innovative, scalable solutions. Passionate about solving real-world problems using sophisticated analytics and driving meaningful impact across organizations."),
    ("Education", "Master's in Data Science\nTechnical University of Vienna, Austria | Mar 2022 – Present\n- Majoring in Machine Learning and Visual Analytics\n\nBachelor's in Computer Science\nVisvesvaraya Technological University (VTU), India | Sep 2013 – Dec 2017\n- Thesis: Image Processing for Medical Disease Diagnosis"),
    ("Professional Experience", "\n\n".join([f"{role}\n{company}\n{details}" for role, company, details in experiences])),
    ("Skills", "Programming: Python (Pandas, NumPy), R, SQL, Scala\nMachine Learning: TensorFlow, scikit-learn, PyTorch, Keras, Random Forests, SVM, Gradient Boosting, Time Series Analysis, Hypothesis Testing\nData Engineering: Hadoop, Spark, Kafka, MongoDB, Redshift, AWS Glue, S3\nVisualization: Tableau, Power BI, Excel, ggplot2, Matplotlib, Plotly\nCloud Platforms: AWS (S3, Glue, EMR, SageMaker), Azure (Data Factory, Machine Learning)\nDevOps: Docker, Kubernetes"),
    ("Certifications", "AWS Certified Cloud Practitioner\nMicrosoft Certified: Azure Developer Associate\nEnglish (C1 Level)\nGerman (B1 Level)"),
    ("Projects", "\n\n".join([f"{project}\n{desc}" for project, desc in projects])),
    ("Languages", "English (Fluent - C1)\nGerman (Intermediate - B1)")
]

for title, content in sections:
    pdf.set_font('Arial', 'B', 14)
    pdf.cell(0, 10, title, ln=True)
    pdf.set_font('Arial', '', 12)
    for line in content.split('\n'):
        pdf.multi_cell(0, 10, line)
    pdf.ln(5)

pdf_path = '/mnt/data/Reema_Dass_CV_Enhanced.pdf'
pdf.output(pdf_path)

word_path, pdf_path


ModuleNotFoundError: No module named 'fpdf'