In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, concat_ws, row_number, desc
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

In [2]:
# Obtain the notebook's Spark session (getOrCreate reuses a session that is
# already running in this kernel, otherwise it builds one named 'aws_connection').
spark = SparkSession.builder.appName('aws_connection').getOrCreate()

In [3]:
# AWS credentials used by invoke_claude_model_with_prompt in the next cell.
#
# The values are read from the standard AWS environment variables so that real
# keys never have to be pasted into (and saved with) the notebook. Export
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN before starting
# the kernel. When a variable is not set, the original " " placeholder is kept,
# so the three names always exist as strings.
import os

AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", " ")

AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", " ")

AWS_SESSION_TOKEN = os.environ.get("AWS_SESSION_TOKEN", " ")

In [4]:
# claude Model, Tested Working, Never change this function
import boto3
import json

def invoke_claude_model_with_prompt(prompt):
    """Send one user prompt to Claude 3 Sonnet through Bedrock Runtime.

    The prompt is wrapped in a single-message payload (role "user",
    max_tokens 5000, anthropic_version "bedrock-2023-05-31") and sent with
    ``invoke_model`` to model ``anthropic.claude-3-sonnet-20240229-v1:0``
    using a ``bedrock-runtime`` client in ``us-east-1``.

    Credentials come from the module-level names AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN, which must be defined
    before this function is called.

    Args:
        prompt: Value placed in the "content" field of the user message.

    Returns:
        The value stored under "content" in the decoded response body, or
        None when that key is absent. In the recorded run of this cell it
        was a list of the form ``[{'type': 'text', 'text': '...'}]``.

    Nothing is caught here: any error raised by boto3 or by JSON decoding
    propagates to the caller.
    """
    # Construct the payload with the provided prompt, Never Change
    payload = {
        "max_tokens": 5000,
        "messages": [{"role": "user", "content": prompt}],
        "anthropic_version": "bedrock-2023-05-31"
    }

    # Create a Boto3 session from the module-level credential constants
    session = boto3.Session(
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_session_token=AWS_SESSION_TOKEN
    )

    # Create a Bedrock Runtime client (a new session and client are built on every call)
    client = session.client('bedrock-runtime', region_name='us-east-1')

    # Invoke the model with the JSON-serialised payload
    response = client.invoke_model(body=json.dumps(payload), modelId="anthropic.claude-3-sonnet-20240229-v1:0")

    # Parse the response body: "body" is read fully and decoded as JSON
    # NOTE(review): assumes the decoded body is a JSON object (dict) - confirm
    response_body = json.loads(response.get("body").read())

    # Return the content from the response
    return response_body.get("content")

# Example usage: a one-off smoke test that sends a trivial question through
# invoke_claude_model_with_prompt and prints the raw value it returns.
# NOTE: this makes a live model invocation every time the cell is run, and it
# binds the notebook-level names `prompt` and `result`.
prompt = "Hello, world. What is the capital of India? Give the result in JSON format."
result = invoke_claude_model_with_prompt(prompt)
print(result)

[{'type': 'text', 'text': '{\n  "capital": "New Delhi"\n}'}]


In [5]:
# Read the tag CSV file into a Spark DataFrame (first line is the header).
tag_df = spark.read.csv("/home/jovyan/work/utility/tags.csv", header=True, inferSchema=True)

# Show the DataFrame schema, the first rows and the total number of rows.
# The count is printed explicitly: a bare `tag_df.count()` in the middle of a
# cell runs a Spark job but displays nothing.
tag_df.printSchema()
tag_df.show()
print(f"tag rows -> {tag_df.count()}")

# Bring the tag names to the driver as a plain list of strings.
# The column is nullable, and a single None would make ", ".join(...) raise
# TypeError, so null entries are dropped here.
tag_list = [
    row["full_ancestral_name"]
    for row in tag_df.select("full_ancestral_name").collect()
    if row["full_ancestral_name"] is not None
]

# Join the tags into the comma-separated string that is embedded in the prompt.
tag_str = ", ".join(tag_list)
print(len(tag_str))

# Rough size check (whitespace-separated words) of what will be sent to the model.
word_count = len(tag_str.split())
print(f"tag_str word len -> {word_count}")


root
 |-- full_ancestral_name: string (nullable = true)

+--------------------+
| full_ancestral_name|
+--------------------+
|Software - Busine...|
|  Software - Ed Tech|
|  Software - FinTech|
|   Healthcare - HCIT|
|Software - IT Inf...|
|Software - Intern...|
| Software - MegaTech|
|   Software - Mobile|
| Software - Payments|
|Software - SaaS/C...|
| Software - Security|
|Internet - Advert...|
|CEO/BOD - General...|
|Internet - E-Comm...|
|  Internet - Ed Tech|
|        10/10 Gender|
|Internet - Market...|
| Internet - MegaTech|
|   Internet - Mobile|
|       CT - Biofuels|
+--------------------+
only showing top 20 rows

16572
tag_str word len -> 2238


In [6]:
# Read CSV file, for people data
import pandas as pd
import os

# Report where the kernel is running, as a sanity check before loading data.
cwd = os.getcwd()
print("Current working directory:", cwd)

# Load the people export into a pandas DataFrame.
all_rows_df = pd.read_csv('/home/jovyan/work/scripts/input_people_data.csv')

# Preview the flattened profile text of the first person only.
print(all_rows_df['plain_text'].iloc[0])

Current working directory: /home/jovyan/work/scripts
person_id:949256266 name:Vivian Weng address:NaN NaN NaN NaN SINGAPORE headline:Product Design Strategy Managment description:NaN created_at:20231211 06:18:0813904 updated_at:NaN customer_id:7538cb111c234c7687c75111a0f166dc education:NaN University of California Berkeley Walter A Haas School of Business NaN MBA Finance 20070101 20090101 NaN National Taiwan University NaN BA International Relations 19980101 20020101 NaN Yale University NaN MA International Developmental Economics 20020101 20030101 employments:79621720 Frog Director Associate Strategy Director Innovation Strategy Group 20120201 20160601 158610 DBS Bank VP Vice President Innovation Group 20160701 20190401 16600 Apple NaN Product Design Producer International Product Design 20190401 NaN 31949031445E10 224704 AAPL Private Equity 60840 Interscope Records 590 Goldman Sachs NaN Investment Banking Division Summer Associate 20080601 20080801 8417546311E9 75859 GS 40070 McKinse

In [None]:
# Prompt preparation for the model; saves the results to tagged_people.csv
import json
import csv

def _strip_code_fence(text):
    """Return `text` without a surrounding Markdown code fence, if it has one.

    Handles both a bare fence and one with a language tag on the opening line
    (for example "```json"). `str.strip('```')` is not enough for the latter:
    it only removes backtick characters, so the tag would remain in front of
    the JSON and break parsing.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        newline_at = cleaned.find("\n")
        # Drop the whole opening fence line so a language tag goes with it.
        cleaned = cleaned[newline_at + 1:] if newline_at != -1 else cleaned[3:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _true_tags_from_result(result):
    """Turn the model result into a list of tag names, or None if unusable.

    `result` is expected to be a non-empty list whose first element is a dict
    with a string under 'text' that holds JSON. A JSON object yields the keys
    with truthy values; a JSON array is taken as the list of true tags.
    Anything else is reported and yields None.
    """
    well_formed = (
        isinstance(result, list)
        and len(result) > 0
        and isinstance(result[0], dict)
        and isinstance(result[0].get('text'), str)
    )
    if not well_formed:
        print("The result is not in the expected format.")
        return None

    try:
        parsed = json.loads(_strip_code_fence(result[0]['text']))
    except json.JSONDecodeError:
        print("The extracted text is not in a valid JSON format.")
        return None

    if isinstance(parsed, dict):
        return [key for key, value in parsed.items() if value]
    if isinstance(parsed, list):
        # The prompt asks for "only true cases", so a bare array is a valid answer.
        return [str(item) for item in parsed]

    print("The result is not in the expected format.")
    return None


# Open the CSV file in append mode to store the tagged information.
# NOTE: because of append mode, re-running this cell adds the same people again.
with open('/home/jovyan/work/output_data/tagged_people.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Iterate over every row in the people DataFrame (one model call per row)
    for index, row in all_rows_df.iterrows():
        plain_text = row['plain_text']

        # pandas yields NaN (a float) for an empty cell; there is nothing to tag then.
        if not isinstance(plain_text, str):
            print(f"Skipping row {index}: plain_text is missing.")
            continue

        try:
            # The text starts with "person_id:<id> name:<name> ..."
            person_id = plain_text.split(' ')[0].split(':')[1]
            # NOTE(review): this keeps only the first word after "name:" - confirm
            # whether the full name is wanted in the CSV.
            name = plain_text.split('name:')[1].split(' ')[0]
        except IndexError:
            print(f"Skipping row {index}: could not find person_id/name.")
            continue

        # Construct the prompt string for each row
        prompt = f"""
        From this person profile: {row['plain_text']}
        Find if he has experience in any of the following areas or domains and return the result strictly always as JSON,no summary, only true cases.
        Areas belong to: {tag_str}
        """

        print("**************Model output*************\n")
        result = invoke_claude_model_with_prompt(prompt)
        print(result)

        true_cases = _true_tags_from_result(result)
        if true_cases is None:
            # Do not write an empty CSV row for a reply that could not be parsed.
            continue

        person_tagged = [person_id, name] + true_cases
        print(f"Model Tagged as-> {person_tagged}")
        writer.writerow(person_tagged)

**************Model output*************

[{'type': 'text', 'text': '{\n  "Software - Business Application": false,\n  "Software - Ed Tech": false,\n  "Software - FinTech": false,\n  "Healthcare - HCIT": false,\n  "Software - IT Infrastructure": false,\n  "Software - Internet of Things": false,\n  "Software - MegaTech": false,\n  "Software - Mobile": true,\n  "Software - Payments": false,\n  "Software - SaaS/Cloud": false,\n  "Software - Security": false,\n  "Internet - Advertising Tech": false,\n  "CEO/BOD - General Manager": false,\n  "Internet - E-Commerce": true,\n  "Internet - Ed Tech": false,\n  "10/10 Gender": false,\n  "Internet - Marketplace": false,\n  "Internet - MegaTech": false,\n  "Internet - Mobile": true,\n  "CT - Biofuels": false,\n  "Internet - Travel": false,\n  "Investor - Buy-out": false,\n  "Investor - Growth": false,\n  "Investor - Venture": false,\n  "Healthcare - Devices/ Diagnostics": false,\n  "Healthcare - Biotech/ Pharma": false,\n  "Services - MSO": false,\