# Resume -> JSON with Template Generation and Chaining

### Create a `.env` file containing `ANTHROPIC_API_KEY=sk_...`
### On Google Colab, upload the `.env` file under Files

In [None]:
!pip install -q python-dotenv
import os
from dotenv import load_dotenv
load_dotenv()

!pip install -Uqq ipdb
import ipdb

import json
!pip install -q json_repair
from json_repair import repair_json
!pip install -q genson
import genson

!pip install -q pymupdf
import fitz

!pip install -q markdownify
import markdownify

# print(os.environ['ANTHROPIC_API_KEY'])
!pip install -q anthropic
import anthropic

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.6 MB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.6 MB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/1.6 MB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m1.1/1.6 MB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m1.5/1.6 MB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for genson (s

In [None]:
class Agent(anthropic.Anthropic):
    """Helper around the Anthropic client for prompts that must return JSON.

    When a schema is set, a single tool named ``JSON`` is registered whose
    input schema is that schema, and the prompt is wrapped with instructions
    telling the model to answer through that tool.

    The ``set_*`` methods return ``self`` so calls can be chained, e.g.
    ``Agent().set_model().set_schema(schema).get_response(prompt)``.
    """

    client = None  # anthropic.Anthropic instance, created by set_model()
    model = "claude-3-haiku-20240307"
    max_tokens = 3500
    prompt = None  # for debug: the last prompt actually sent
    schema = None
    # Class-level default; set_schema()/set_tools() rebind it on the instance
    # instead of mutating this shared list.
    tools = []
    response = None  # for debug: the last raw API response

    def set_model(self, model=None):
        """Select the model and create the API client.

        Args:
            model: Model name; when ``None`` the current ``self.model`` is kept.

        Returns:
            ``self``, for chaining.

        Raises:
            AssertionError: if the model is not one of the allowed models.
        """
        if model is None:
            model = self.model
        allowed_models = ["claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"]
        # The message interpolates the rejected model name (the previous text
        # contained a literal "%s" that was never filled in).
        assert model in allowed_models, f"model {model} not in allowed models={allowed_models}"
        self.model = model
        self.client = anthropic.Anthropic()
        return self

    @staticmethod
    def get_json_tool(schema):
        """Return the tool definition named ``JSON`` whose input schema is *schema*."""
        return {
            "name": "JSON",
            "description": "Convert requested input or fill schema into strict JSON.",
            "input_schema": schema,
        }

    def set_schema(self, schema):
        """Store *schema* and make the JSON tool the only tool (or clear tools if falsy).

        Returns:
            ``self``, for chaining.
        """
        self.schema = schema
        self.tools = [self.get_json_tool(schema)] if self.schema else []
        return self

    def set_tools(self, tools):
        """Replace the tool list with *tools* and return ``self`` for chaining."""
        self.tools = tools
        return self

    def get_response(self, prompt="Give a comprehensive and highly detailed example employee profile.", schema=None):
        """Send *prompt* to the model and return its tool and text output.

        Args:
            prompt: User prompt. When the ``JSON`` tool is registered it is
                wrapped with instructions to answer through that tool.
            schema: Optional schema; when given, ``set_schema`` is called first.

        Returns:
            A dict with two keys: ``tool_response`` (the input of the last
            content block when tools are set, otherwise ``{}``) and
            ``text_response`` (the text of the first content block, or
            ``None`` if that block has no ``text`` attribute).

        Raises:
            AssertionError: if no client is set, or if tools are set and the
                last content block is not a tool use / the stop reason is
                unexpected.
        """
        if schema is not None:
            self.set_schema(schema)

        assert self.client is not None, "Client not set, call set_model()"
        have_tools = bool(self.tools)
        if have_tools:
            # NOTE(review): this is the beta tools endpoint; newer SDK releases
            # may expose tool use directly on messages.create -- confirm
            # against the installed anthropic version.
            response_func = self.client.beta.tools.messages.create
            model_kwargs = dict(tools=self.tools)
        else:
            response_func = self.client.messages.create
            model_kwargs = dict()

        if any(x['name'] == 'JSON' for x in self.tools):
            # Wrap the prompt so the model is told, before and after, to reply via the JSON tool.
            pre_prompt = f"""  Always absolutely give your response in valid JSON using this schema:
<schema>
{self.schema}
</schema>
Assume all keys in the JSON schema are required, ensure all values are filled with information (do not leave schema descriptions).
Include some brief thinking in <thinking> </thinking> xml tags, but do not generate the JSON inside the thinking block (i.e. stick to natural language and general considerations inside the thinking block).
Then, absolutely ensure you use the JSON tool to respond to handle the prompt:"""
            post_prompt = "Remember, absolutely ensure you use the JSON tool to respond to the above prompt."
            prompt = pre_prompt + '\n<prompt>\n' + prompt + '\n</prompt>' + '\n' + post_prompt
        self.prompt = prompt

        response = response_func(
            model=self.model,
            max_tokens=self.max_tokens,
            messages=[
                {"role": "user", "content": prompt}
            ],
            **model_kwargs,
        )

        self.response = response
        first_block = response.content[0]
        text_response = first_block.text if hasattr(first_block, 'text') else None
        last_response = response.content[-1]
        if have_tools:
            assert last_response.type == 'tool_use', "expected tool use: %s %s" % (last_response.type, last_response)
            assert response.stop_reason in ['end_turn', 'tool_use'], "expected tool use or end of turn in some cases: %s %s" % (response.stop_reason, last_response)
        tool_response = last_response.input if have_tools else {}
        # Unwrap a result nested under a lone "object" key.
        if 'object' in tool_response and len(tool_response) == 1:
            tool_response = tool_response['object']
        # Unwrap a result shaped like a schema wrapper (only "type" and "properties" keys).
        if 'properties' in tool_response and 'type' in tool_response and len(tool_response) == 2:
            tool_response = tool_response['properties']
        return dict(tool_response=tool_response, text_response=text_response)

    @staticmethod
    def check_json(obj):
        """Assert that *obj* is unchanged after a dump -> repair_json -> load round trip."""
        assert obj == json.loads(repair_json(json.dumps(obj)))

    @staticmethod
    def json_to_schema(obj):
        """Return a JSON schema generated by genson's ``SchemaBuilder`` from *obj*."""
        from genson import SchemaBuilder
        builder = SchemaBuilder()
        builder.add_object(obj)
        return builder.to_schema()

    @staticmethod
    def pretty_json_print(obj):
        """Print *obj* as JSON indented by two spaces."""
        print(json.dumps(obj, indent=2))

    @staticmethod
    def pdf_to_text(file):
        """Return the concatenated text of every page of the PDF at *file*."""
        with fitz.open(file) as doc:
            # join avoids repeated string reallocation from += in a loop
            text = "".join(page.get_text() for page in doc)
        return text

    @staticmethod
    def url_to_md(url):
        """Fetch *url* (following redirects) and convert the response body to Markdown."""
        import requests
        r = requests.get(url, allow_redirects=True)
        html = r.content
        md = markdownify.markdownify(html)
        return md

# Manual Schema

In [None]:
# Hand-written JSON schema for an employee profile: name, age, at least three
# short skill strings, and a work history of company/duration/position
# entries. Key order is kept stable because the dict is rendered into the
# prompt text.
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "skills": {
            "type": "array",
            "items": {"type": "string", "maxLength": 10},
            "minItems": 3,
        },
        "workhistory": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "company": {"type": "string"},
                    "duration": {"type": "string"},
                    "position": {"type": "string"},
                },
                "required": ["company", "position"],
            },
        },
    },
    "required": ["name", "age", "skills", "workhistory"],
}

In [None]:
# Generate a sample employee profile constrained by the hand-written schema,
# verify it survives a JSON round trip, then print it.
prompt = "Give a comprehensive and highly detailed example employee profile."
agent = Agent()
agent.set_model()          # creates the API client
agent.set_schema(schema)   # registers the JSON tool for this schema
job_json = agent.get_response(prompt)['tool_response']
agent.check_json(job_json)
agent.pretty_json_print(job_json)

{
  "name": "Emily Gonzalez",
  "age": 32,
  "skills": [
    "project management",
    "data analysis",
    "team leadership"
  ],
  "workhistory": [
    {
      "company": "Acme Corp",
      "duration": "3 years",
      "position": "Marketing Coordinator"
    },
    {
      "company": "Widgets Inc",
      "duration": "5 years",
      "position": "Product Manager"
    }
  ]
}


### Automatic Schema

In [None]:
# Ask the model (without tools) to write a candidate schema tailored to a job
# posting, then parse its free-text reply into a dict.
url = 'https://openai.com/careers/research-scientist'
job_posting_md = agent.url_to_md(url)
prompt = f"<job>{job_posting_md}</job>\nGenerate an employee candidate JSON schema that would capture the relevant information from a user resume. \
           Focus on a JSON schema that would extract information from the resume that would be used to identify the best candiate for the job posting given above. \
           Ensure the JSON schema is not too deeply nested with no more than 10 keys. \
           Ensure the schema also includes verbose descriptions for each item. \
           Ensure all property keys only include alphanumeric, underscore, or hyphen, and keys are less than 64 characters. \
           Only generate strict valid JSON starting with {{"
text_response = agent.set_tools([]).get_response(prompt)['text_response']
print(text_response)
# get_response() yields None when the first content block carries no text;
# fail with a clear message instead of a TypeError inside json.loads.
if text_response is None:
    raise ValueError("model returned no text content; cannot parse a JSON schema")
# The reply is free text, so it is passed through repair_json before strict
# parsing; for already-valid JSON the parsed result is the same.
job_schema = json.loads(repair_json(text_response))
if not isinstance(job_schema, dict):
    raise ValueError(f"expected a JSON object for the schema, got {type(job_schema).__name__}")
agent.check_json(job_schema)
agent.pretty_json_print(job_schema)

{
  "full_name": {
    "description": "The full name of the candidate",
    "type": "string"
  },
  "email": {
    "description": "The email address of the candidate",
    "type": "string",
    "format": "email"
  },
  "phone_number": {
    "description": "The phone number of the candidate",
    "type": "string"
  },
  "location": {
    "description": "The current location of the candidate",
    "type": "string"
  },
  "education": {
    "description": "The educational background of the candidate",
    "type": "array",
    "items": {
      "type": "object",
      "properties": {
        "degree": {
          "description": "The degree obtained by the candidate",
          "type": "string"
        },
        "institution": {
          "description": "The institution where the degree was obtained",
          "type": "string"
        },
        "graduation_year": {
          "description": "The year the degree was obtained",
          "type": "integer"
        }
      }
    }
  },
  "work

# Apply Automatic Schema

In [None]:
import requests

# Wrap the generated property definitions in an object schema for the JSON tool.
schema = {
    "type": "object",
    "properties": job_schema,
}

# Download the sample resume. The request is made (and its status checked)
# before the file is opened, so a failed download neither leaves an empty
# file behind nor writes an HTTP error page to disk as if it were a PDF.
pdf_file = 'resume.pdf'
pdf_download = requests.get('https://danijar.com/material/danijar-hafner-resume.pdf', timeout=30)
pdf_download.raise_for_status()
with open(pdf_file, 'wb') as f:
    f.write(pdf_download.content)

resume_text = agent.pdf_to_text(pdf_file)
prompt = f"<resume>{resume_text}</resume>\nTranscribe the resume given to valid JSON."

# Extract the resume into the generated schema, verify the JSON round trip, and print it.
resume_json = agent.set_schema(schema).get_response(prompt)['tool_response']
agent.check_json(resume_json)
agent.pretty_json_print(resume_json)

{
  "education": [
    {
      "degree": "PhD",
      "institution": "University of Toronto",
      "graduation_year": 2023
    },
    {
      "degree": "MRes",
      "institution": "University College London",
      "graduation_year": 2018
    },
    {
      "degree": "BSc",
      "institution": "Hasso Plattner Institute",
      "graduation_year": 2017
    }
  ],
  "email": "mail@danijar.com",
  "full_name": "Danijar Hafner",
  "location": "Google Scholar, Website, Github, Twitter",
  "phone_number": "Google Scholar, Website, Github, Twitter",
  "publications": [
    {
      "title": "Deep Hierarchical Planning from Pixels",
      "venue": "NeurIPS 2022",
      "year": 2022
    },
    {
      "title": "Learning Robust Dynamics through Variational Sparse Gating",
      "venue": "NeurIPS 2022",
      "year": 2022
    },
    {
      "title": "World Models for Physical Robot Learning",
      "venue": "CoRL 2022",
      "year": 2022
    },
    {
      "title": "Masked World Models for Visu