In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, concat_ws, row_number, desc
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

In [2]:
# Create the Spark session for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName('aws_connection')
    .getOrCreate()
)

In [3]:
# AWS credentials used by invoke_claude_model_with_prompt(...).
# They are read from the standard AWS environment variables so that real keys
# never have to be pasted into (and saved with) this notebook. When a variable
# is not set, the " " placeholder is used, so this cell always runs; valid keys
# are only required by cells that actually call the model.
import os

AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", " ")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", " ")
AWS_SESSION_TOKEN = os.environ.get("AWS_SESSION_TOKEN", " ")

In [4]:
# Claude model helper. Tested and working: the request logic is intentionally kept as-is.
import boto3
import json

def invoke_claude_model_with_prompt(prompt):
    """Send one user prompt to Claude 3 Sonnet on Amazon Bedrock and return the reply content.

    Args:
        prompt: Text sent as the single user message of the request.

    Returns:
        The value stored under the "content" key of the decoded response body,
        or None when that key is absent. In the recorded run this is a list
        such as [{'type': 'text', 'text': '...'}].

    Notes:
        Reads the module-level AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and
        AWS_SESSION_TOKEN values. Errors raised by boto3 (bad credentials,
        network problems, ...) are not caught here and propagate to the caller.
    """
    # Construct the payload with the provided prompt, Never Change
    payload = {
        "max_tokens": 5000,
        "messages": [{"role": "user", "content": prompt}],
        "anthropic_version": "bedrock-2023-05-31"
    }

    # Create a Boto3 session
    session = boto3.Session(
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_session_token=AWS_SESSION_TOKEN
    )

    # Create a Bedrock Runtime client
    client = session.client('bedrock-runtime', region_name='us-east-1')

    # Invoke the model with the payload
    response = client.invoke_model(body=json.dumps(payload), modelId="anthropic.claude-3-sonnet-20240229-v1:0")

    # Parse the response body (a stream that must be read before decoding)
    response_body = json.loads(response.get("body").read())

    # Return the content from the response
    return response_body.get("content")

# Example usage / smoke test: performs one real Bedrock call, so valid AWS keys are required.
prompt = "Hello, world. What is the capital of India? Give the result in JSON format."
result = invoke_claude_model_with_prompt(prompt)
# Show the raw content returned by the helper.
print(result)

[{'type': 'text', 'text': '{\n  "capital": "New Delhi"\n}'}]


In [5]:
# Read the tag CSV file into a Spark DataFrame
tag_df = spark.read.csv("/home/jovyan/work/utility/tags.csv", header=True, inferSchema=True)
# Show the DataFrame schema and first few rows
tag_df.printSchema()
tag_df.show()

# Bring the tag names to the driver as plain Python strings. The column is
# nullable, and a None entry would make str.join below raise TypeError, so
# empty cells are skipped.
tag_list = [
    tag_row["full_ancestral_name"]
    for tag_row in tag_df.select("full_ancestral_name").collect()
    if tag_row["full_ancestral_name"] is not None
]

# Join the tag names into the single comma-separated string used in the model prompt
tag_str = ", ".join(tag_list)
print(len(tag_str))

# Rough size check of the tag list in whitespace-separated words
word_count = len(tag_str.split())
print(f"tag_str word len -> {word_count}")


root
 |-- full_ancestral_name: string (nullable = true)

+--------------------+
| full_ancestral_name|
+--------------------+
|Software - Busine...|
|  Software - Ed Tech|
|  Software - FinTech|
|   Healthcare - HCIT|
|Software - IT Inf...|
|Software - Intern...|
| Software - MegaTech|
|   Software - Mobile|
| Software - Payments|
|Software - SaaS/C...|
| Software - Security|
|Internet - Advert...|
|CEO/BOD - General...|
|Internet - E-Comm...|
|  Internet - Ed Tech|
|        10/10 Gender|
|Internet - Market...|
| Internet - MegaTech|
|   Internet - Mobile|
|       CT - Biofuels|
+--------------------+
only showing top 20 rows

16572
tag_str word len -> 2238


In [6]:
# Load the people data with pandas
import pandas as pd
import os

# Report where the notebook is running from
cwd = os.getcwd()
print("Current working directory:", cwd)

# Parse the input CSV into a pandas DataFrame
all_rows_df = pd.read_csv('/home/jovyan/work/scripts/input_people_data.csv')

# Preview the 'plain_text' value of the first row
print(all_rows_df['plain_text'].iloc[0])

Current working directory: /home/jovyan/work/scripts
person_id:23678899 name:Bill Murphy address:Culver City California US description:Versatile management professional with 15 years of project management experience managing highly visible complex projects with aggressive timelines managing multidiscipline high performance teams and providing technical experience and leadership in project operations and logistics managementnnProven record of success analyzing all aspects of a project including investigating and determining if monies are being spent appropriately reducing expenses improving the efficiency of operations and monitoring key program deliverables created_at:20231211 06:18:0813904 updated_at:20231026 15:57:2235 education: Santa Fe University Of Art And Design BachelorsBachelor Of Arts 19920101 19960101 employments: VFX Producer 20040101 20070101 2289 Netflix Sr Manager VFX Infrastructure Technology Operations 20170701 20191101 877281030000 15635 NFLX PostIPO Debt Saul Bisht 2

In [7]:
# Prompt preparation for the model; appends one row per successfully tagged person to tagged_people2.csv
import json
import csv


def _strip_code_fence(text):
    """Return `text` without a surrounding Markdown code fence.

    Handles both a bare fence and one with a "json" language tag.
    str.strip('`') only removes backtick characters, so the language tag of an
    opening "```json" fence would otherwise be left in front of the JSON and
    make json.loads fail.
    """
    cleaned = text.strip().strip('`').strip()
    # Valid JSON never starts with the letters "json", so dropping them is safe.
    if cleaned[:4].lower() == 'json':
        cleaned = cleaned[4:].lstrip()
    return cleaned


# Open a CSV file in append mode to store the tagged information
with open('tagged_people2.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Iterate over every row in the DataFrame
    for index, row in all_rows_df.iterrows():
        plain_text = row['plain_text']

        # The profile text starts with "person_id:<id> name:<first name> ..."
        person_id = plain_text.split(' ')[0].split(':')[1]
        name = plain_text.split('name:')[1].split(' ')[0]
        extracted_values = [person_id, name]

        person_tagged = []
        # Set only when the model reply was parsed into a dict of tags.
        tagged_ok = False

        # Construct the prompt string for each row
        prompt = f"""
        From this person profile: {row['plain_text']}
        Find if he has experience in any of the following areas or domains and return the result strictly always as JSON,no summary, only true cases.
        Areas belong to: {tag_str}
        """

        print("**************Model output*************\n")
        result = invoke_claude_model_with_prompt(prompt)
        print(result)

        # Ensure that the result is a list containing a dictionary with a 'text' entry
        if isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict) and 'text' in result[0]:
            # Extract the JSON-formatted text and remove any Markdown fence around it
            json_text = _strip_code_fence(result[0]['text'])

            try:
                text_dict = json.loads(json_text)
            except json.JSONDecodeError:
                print("The extracted text is not in a valid JSON format.")
            else:
                if isinstance(text_dict, dict):
                    # Keep only the tags the model marked as true
                    true_cases = [key for key, value in text_dict.items() if value]
                    person_tagged = extracted_values + true_cases
                    tagged_ok = True
                    print(f"Tagged as-> {person_tagged}")
                else:
                    # Valid JSON, but not an object mapping tag -> bool (e.g. a list)
                    print("The result is not in the expected format.")
        else:
            print("The result is not in the expected format.")

        print(f"Model Tagged as-> {person_tagged}")
        # Write only successful taggings; a failed one would add a blank row to the CSV
        if tagged_ok:
            writer.writerow(person_tagged)

**************Model output*************

[{'type': 'text', 'text': '{\n  "Software - IT Infrastructure": true,\n  "Internet - Video": true,\n  "Software - Storage": true,\n  "Software - Networking": true\n}'}]
Tagged as-> ['23678899', 'Bill', 'Software - IT Infrastructure', 'Internet - Video', 'Software - Storage', 'Software - Networking']
Model Tagged as-> ['23678899', 'Bill', 'Software - IT Infrastructure', 'Internet - Video', 'Software - Storage', 'Software - Networking']
**************Model output*************

[{'type': 'text', 'text': '{\n  "Software - Artificial Intelligence": true,\n  "Software - Analytics": true,\n  "Marketing": true,\n  "Marketing Automation": true\n}'}]
Tagged as-> ['22269279', 'Maggie', 'Software - Artificial Intelligence', 'Software - Analytics', 'Marketing', 'Marketing Automation']
Model Tagged as-> ['22269279', 'Maggie', 'Software - Artificial Intelligence', 'Software - Analytics', 'Marketing', 'Marketing Automation']
**************Model output**********