In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, concat_ws, row_number, desc
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

In [2]:
# Create the SparkSession for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('aws_connection')
    .getOrCreate()
)

In [3]:
# AWS credentials used by invoke_claude_model_with_prompt() in the next cell.
# Provide valid keys through environment variables of the same names so that
# secrets never have to be typed into (or saved with) the notebook. When a
# variable is not set, the original " " placeholder is kept as the fallback;
# the model call will not authenticate until real keys are supplied.
import os

AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", " ")

AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", " ")

AWS_SESSION_TOKEN = os.environ.get("AWS_SESSION_TOKEN", " ")

In [4]:
# claude Model, Tested Working, Never change this function
import boto3
import json

def invoke_claude_model_with_prompt(prompt: str):
    """
    Send a single user prompt to the Claude 3 Sonnet model on AWS Bedrock.

    A new boto3 session and 'bedrock-runtime' client (region us-east-1) are
    created on every call, using the module-level AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN values.

    Args:
        prompt: Text placed in the "content" of the one user message sent
            to the model.

    Returns:
        The value stored under "content" in the JSON-decoded response body.
        In the sample run below this is a list of dicts such as
        [{'type': 'text', 'text': '...'}]. Presumably None when the key is
        absent (dict.get semantics) -- TODO confirm the response is always
        a JSON object.
    """
    # Construct the payload with the provided prompt, Never Change
    payload = {
        "max_tokens": 5000,
        "messages": [{"role": "user", "content": prompt}],
        "anthropic_version": "bedrock-2023-05-31"
    }

    # Create a Boto3 session from the credentials defined in the previous cell
    session = boto3.Session(
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_session_token=AWS_SESSION_TOKEN
    )

    # Create a Bedrock Runtime client
    client = session.client('bedrock-runtime', region_name='us-east-1')

    # Invoke the model with the payload serialized as a JSON string
    response = client.invoke_model(body=json.dumps(payload), modelId="anthropic.claude-3-sonnet-20240229-v1:0")

    # The body is a stream-like object: read it fully, then decode the JSON
    response_body = json.loads(response.get("body").read())

    # Return the content from the response
    return response_body.get("content")

# Smoke test: one live call to the model, printing the raw content it returns.
prompt = (
    "Hello, world. What is the capital of India? "
    "Give the result in JSON format."
)
result = invoke_claude_model_with_prompt(prompt)
print(result)

[{'type': 'text', 'text': '{\n  "capital": "New Delhi"\n}'}]


In [5]:
# Read Tag CSV file into a DataFrame
tag_df = spark.read.csv("/home/jovyan/work/utility/tags.csv", header=True, inferSchema=True)
# Show the DataFrame schema and first few rows
tag_df.printSchema()
tag_df.show()

# Collect the tag names on the driver as a list of strings.
# The column is nullable, so null rows are dropped first: str.join()
# raises TypeError as soon as it meets a None entry.
tag_rows = tag_df.select("full_ancestral_name").dropna().collect()
tag_list = [row["full_ancestral_name"] for row in tag_rows]

# Join the tags into the single comma-separated string used in the prompt
tag_str = ", ".join(tag_list)
print(len(tag_str))

# Rough size check of the tag text that will be sent with every prompt
word_count = len(tag_str.split())
print(f"tag_str word len -> {word_count}")


root
 |-- full_ancestral_name: string (nullable = true)

+--------------------+
| full_ancestral_name|
+--------------------+
|Software - Busine...|
|  Software - Ed Tech|
|  Software - FinTech|
|   Healthcare - HCIT|
|Software - IT Inf...|
|Software - Intern...|
| Software - MegaTech|
|   Software - Mobile|
| Software - Payments|
|Software - SaaS/C...|
| Software - Security|
|Internet - Advert...|
|CEO/BOD - General...|
|Internet - E-Comm...|
|  Internet - Ed Tech|
|        10/10 Gender|
|Internet - Market...|
| Internet - MegaTech|
|   Internet - Mobile|
|       CT - Biofuels|
+--------------------+
only showing top 20 rows

16572
tag_str word len -> 2238


In [6]:
import json

def read_json_file(file_path):
    """
    Read JSON objects from a JSON Lines file (one JSON document per line).

    Blank and whitespace-only lines are skipped, so an empty line between
    records or at the end of the file does not abort the load. The file is
    decoded as UTF-8 regardless of the platform's default encoding.

    Args:
        file_path (str): The path of the JSON file to read.

    Returns:
        List: A list containing JSON objects read from the file, in file
        order. Empty when the file has no non-blank lines.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    json_objects = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            record_text = line.strip()
            if not record_text:
                # json.loads("") raises, so skip empty lines explicitly
                continue
            json_objects.append(json.loads(record_text))
    return json_objects

# Usage: load every person profile from the input file. json_objects is a
# module-level list that the tagging cell further down iterates over.
json_objects = read_json_file("input_people_data.json")
# Preview the first parsed record as a sanity check

print(json_objects[0])


{'person_id': 949256266, 'name': 'Vivian Weng', 'education': [{'institution_name': 'University of California, Berkeley - Walter A. Haas School of Business', 'degree': 'NaN', 'subject': 'MBA, Finance', 'started_on': '2007-01-01', 'ended_on': '2009-01-01'}, {'institution_name': 'National Taiwan University', 'degree': 'NaN', 'subject': 'BA, International Relations', 'started_on': '1998-01-01', 'ended_on': '2002-01-01'}, {'institution_name': 'Yale University', 'degree': 'NaN', 'subject': 'MA, International Developmental Economics', 'started_on': '2002-01-01', 'ended_on': '2003-01-01'}], 'employments': [{'company_name': 'Frog', 'seniority_level': 'Director', 'title': 'Associate Strategy Director, Innovation Strategy Group', 'position_description': 'NaN', 'started_on': '02/01/2012', 'ended_on': '06/01/2016'}, {'company_name': 'DBS Bank', 'seniority_level': 'VP', 'title': 'Vice President, Innovation Group', 'position_description': 'NaN', 'started_on': '07/01/2016', 'ended_on': '04/01/2019'}, 

In [11]:
import csv
import json
num_objects = len(json_objects)
print("Number of objects:", num_objects)

# raw_decode() parses the first JSON value found at a given offset and ignores
# whatever follows it, so explanatory text after the JSON does not break parsing.
json_decoder = json.JSONDecoder()

# Open a CSV file in write mode to store the tagged information (overwriting existing content).
# Layout: one row [person_id, name, "Tags_assigned"] per person, followed by one
# row [empty, empty, tag] for every tag the model reported as true.
with open(
    '/home/jovyan/work/output_data/tagged_people.csv',
    'w',
    newline='',
    encoding='utf-8',
) as csvfile:
    writer = csv.writer(csvfile)

    # Iterate over each JSON object in the list
    for person in json_objects:
        person_id = person['person_id']
        name = person['name']
        print(f"person_id->{person_id}")
        print(f"name->{name}")

        # Leading row for this person: id, name and the literal column marker
        person_tagged = [person_id, name, "Tags_assigned"]

        prompt = f"""
        From this person profile: {person}
        Find if they have experience in any of the following areas or domains and 
        return the result strictly always as JSON, no summary, only true cases. Areas belong to: {tag_str}
        """

        print("Model output")
        result = invoke_claude_model_with_prompt(prompt)

        # An empty or missing content list cannot be indexed; report and move on
        if not result:
            print("Error: The model returned no content.")
            continue

        json_text = result[0]['text']

        # Find the start of JSON data, skipping any text the model put before it
        json_start = json_text.find('{')
        if json_start == -1:
            print("Error: No JSON object found in the model output.")
            continue

        # Parse the JSON object that starts at json_start into a dictionary
        try:
            text_dict, _ = json_decoder.raw_decode(json_text, json_start)
        except json.JSONDecodeError:
            print("Error: The extracted text is not in a valid JSON format.")
            continue

        print("Parsed dictionary:", text_dict)

        # Extract true cases from the dictionary (any truthy value counts)
        true_cases = [key for key, value in text_dict.items() if value]
        print("True cases:", true_cases)

        # Write the tagged information to the CSV file
        writer.writerow(person_tagged)
        writer.writerows([None, None, case] for case in true_cases)

Number of objects: 999
person_id->949256266
name->Vivian Weng
Model output
Parsed dictionary: {'Financial Services': True, 'Services - Management Consulting': True, 'Internet - Advertising Tech': True, 'Software - Big Data': True}
True cases: ['Financial Services', 'Services - Management Consulting', 'Internet - Advertising Tech', 'Software - Big Data']


In [None]:
# Marker printed when the notebook has run through to its last cell
print(" ***********Completed ***************")