In [13]:
import os
import json
from openai import OpenAI
import pandas as pd

# Load your OpenAI API key
# openai.api_key = 'YOUR_OPENAI_API_KEY'

client = OpenAI()


class PDFUserProfileProcessor:
    def __init__(self, pdf_name, base_extraction_dir='pdf_extractions'):
        self.pdf_name = pdf_name
        self.base_dir = os.path.join(base_extraction_dir, self.pdf_name)
        self.text_dir = os.path.join(self.base_dir, "text")
        self.image_dir = os.path.join(self.base_dir, "images")
        self.table_dir = os.path.join(self.base_dir, "tables")
        self.text_file = os.path.join(self.text_dir, f"{self.pdf_name}.txt")
        
        self.profile = {
            "name": "",
            "email": "",
            "phone": "",
            "location": "",
            "professional_skills": [],
            "organizations": [],
            "hobbies": [],
            "favorite_activities": [],
            "personal_interests": [],
            "daily_routines": {
                "morning": "",
                "evening": "",
                "exercise_and_health": "",
            },
            "social_media_profiles": [],
            "friends_and_family": [],
            "favorite_books_articles_authors": [],
            "favorite_movies_shows_genres": [],
            "music_preferences": [],
            "preferred_communication_methods": [],
            "writing_style_and_tone": "",
            "goals_and_aspirations": {
                "short_term": [],
                "long_term": [],
            },
            "personal_projects": [],
        }

    def query_gpt(self, prompt):    


        """Query OpenAI GPT with the given prompt and return the response."""
        response = client.chat.completions.create(
            
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": """You are an assistant who is helping me extract all the profile information from the text passed. Extract the following profile information from the text:
                        1. Name
                        2. Email
                        3. Phone number
                        4. Location
                        5. Professional skills
                        6. Organizations
                        7. Hobbies
                        8. Favorite activities
                        9. Personal interests
                        10. Daily routines
                        11. Social media profiles
                        12. Friends and family
                        13. Favorite books, articles, and authors
                        14. Favorite movies, shows, and genres
                        15. Music preferences
                        16. Preferred communication methods
                        17. Writing style and tone
                        18. Short-term goals
                        19. Long-term goals
                        20. Personal projects.
                        21. Other miscellaneous information
                 
                 
                 Make sure you give the final output which can be parsed into a json with these keys as their attribute as I'm storing user profile as a dictionary."""},
                {"role": "user", "content": prompt},
            ],
            response_format={ "type": "json_object" },
        )
        print("*"*100)

        print("Checking what is being returned  -------- response.choices[0].message.content")
        print(response.choices[0].message.content)
        print("*"*100)

        return response.choices[0].message.content

    def extract_personal_info_with_gpt(self, text):
        """Extract personal information using GPT."""
        prompt = f"""

        Text: {text}
        Information:
        """
        response = self.query_gpt(prompt)
        return json.loads(response)

    def process_text(self):
        with open(self.text_file, 'r', encoding='utf-8') as file:
            text_content = file.read()
        extracted_info = self.extract_personal_info_with_gpt(text_content)
        self.update_profile(extracted_info)

    def process_images(self):
        image_paths = [os.path.join(self.image_dir, f) for f in os.listdir(self.image_dir) if os.path.isfile(os.path.join(self.image_dir, f))]
        for image_path in image_paths:
            # Here we assume that the LLM can process image descriptions
            response = self.query_gpt(f"Extract profile information from this image: {image_path}")
            extracted_info = json.loads(response)
            self.update_profile(extracted_info)

    def process_tables(self):
        table_paths = [os.path.join(self.table_dir, f) for f in os.listdir(self.table_dir) if os.path.isfile(os.path.join(self.table_dir, f))]
        for table_path in table_paths:
            with open(table_path, 'r') as file:
                table_content = file.read()
            response = self.query_gpt(f"Extract profile information from this table:\n{table_content}")
            extracted_info = json.loads(response)
            self.update_profile(extracted_info)
    

    def update_profile(self, extracted_info):
        for key, value in extracted_info.items():
            if key.lower() in self.profile:
                profile_key = key.lower()
            else:
                profile_key = key.replace(" ", "_").lower()

            if profile_key in self.profile:
                if isinstance(self.profile[profile_key], list):
                    self.profile[profile_key].extend(value if isinstance(value, list) else [value])
                elif isinstance(self.profile[profile_key], dict):
                    self.profile[profile_key].update(value)
                else:
                    self.profile[profile_key] = value
            else:
                print(f"Warning: Key '{profile_key}' not found in profile structure. Skipping.")


    # def update_profile(self, extracted_info):
    #     for key, value in extracted_info.items():
    #         if isinstance(self.profile[key], list):
    #             self.profile[key].extend(value)
    #         elif isinstance(self.profile[key], dict):
    #             for sub_key, sub_value in value.items():
    #                 self.profile[key][sub_key] = sub_value
    #         else:
    #             self.profile[key] = value

    def build_user_profile(self):
        self.process_text()
        # self.process_images()
        # self.process_tables()
        # Remove duplicates in lists
        for key in self.profile.keys():
            if isinstance(self.profile[key], list):
                self.profile[key] = list(set(self.profile[key]))
        # Save profile to JSON file
        profile_file = os.path.join(self.base_dir, "user_profile.json")
        with open(profile_file, 'w', encoding='utf-8') as file:
            json.dump(self.profile, file, indent=4)
        print(f"User profile saved to '{profile_file}'.")


In [14]:
# Usage example
pdf_name = "pranay"
pdf_user_profile_processor = PDFUserProfileProcessor(pdf_name)
pdf_user_profile_processor.build_user_profile()

****************************************************************************************************
Checking what is being returned  -------- response.choices[0].message.content
{
    "Name": "Pranay",
    "Email": "pranayr@umass.edu",
    "Phone number": "Not found",
    "Location": "University of Massachusetts Amherst",
    "Professional skills": "Deep Learning, Computer Vision",
    "Organizations": ["Meta Reality Labs", "AirLab", "Robotics Institute at Carnegie Mellon University", "Indian School of Business Hyderabad"],
    "Hobbies": "Not found",
    "Favorite activities": "Research in Computer Science",
    "Personal interests": "Exploring the intersection of Deep Learning and Computer Vision",
    "Daily routines": "Not found",
    "Social media profiles": "Not found",
    "Friends and family": "Not found",
    "Favorite books, articles, and authors": "Not found",
    "Favorite movies, shows, and genres": "Not found",
    "Music preferences": "Not found",
    "Preferred communi

ValueError: dictionary update sequence element #0 has length 1; 2 is required

In [37]:


import openai
import json

# Load your OpenAI API key
# openai.api_key = 'YOUR_OPENAI_API_KEY'

def query_gpt(prompt):
    """Query OpenAI GPT with the given prompt and return the response."""
    response = client.chat.completions.create(
            
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": """You are an assistant who is helping me extract all the profile information from the text passed. Extract the following profile information from the text:
                        1. Name
                        2. Email
                        3. Phone
                        4. Location
                        5. Professional skills
                        6. Organizations
                        7. Hobbies
                        8. Favorite activities
                        9. Personal interests
                        10. Daily routines
                        11. Social media profiles
                        12. Friends and family
                        13. Favorite books, articles, and authors
                        14. Favorite movies, shows, and genres
                        15. Music preferences
                        16. Preferred communication methods
                        17. Writing style and tone
                        18. Short-term goals
                        19. Long-term goals
                        20. Personal projects.
                        21. Other miscellaneous information
                 
                 
                 Make sure you give the final output which can be parsed into a json with these keys as their attribute as I'm storing user profile as a dictionary."""},
                {"role": "user", "content": prompt},
            ],
            response_format={ "type": "json_object" },
        )
    print("*"*100)

    print("Checking what is being returned  -------- response.choices[0].message.content")
    print(response.choices[0].message.content)
    print("*"*100)

    return response.choices[0].message.content

def extract_personal_info_with_gpt(text):
    """Extract personal information using GPT."""
    prompt = f"""
    Text: {text}
    Information:
    """
    response = query_gpt(prompt)
    print("GPT Response:", response)
    return response, json.loads(response)


In [38]:



def process_text(text_file, profile):
    with open(text_file, 'r', encoding='utf-8') as file:
        text_content = file.read()
    extracted_info, json_info = extract_personal_info_with_gpt(text_content)
    return extracted_info, json_info

In [39]:
import os

profile = {
            "name": "",
            "email": "",
            "phone": "",
            "location": "",
            "professional_skills": [],
            "organizations": [],
            "hobbies": [],
            "favorite_activities": [],
            "personal_interests": [],
            "daily_routines": {
                "morning": "",
                "evening": "",
                "exercise_and_health": "",
            },
            "social_media_profiles": [],
            "friends_and_family": [],
            "favorite_books_articles_authors": [],
            "favorite_movies_shows_genres": [],
            "music_preferences": [],
            "preferred_communication_methods": [],
            "writing_style_and_tone": "",
            "goals_and_aspirations": {
                "short_term": [],
                "long_term": [],
            },
            "personal_projects": [],
        }
text_file = 'pdf_extractions/pranay/text/pranay.txt'
extracted_info, json_info = process_text(text_file, profile)

****************************************************************************************************
Checking what is being returned  -------- response.choices[0].message.content
{
    "Name": "Pranay Reddy",
    "Email": "pranayr@umass.edu",
    "Phone": "Not available",
    "Location": "University of Massachusetts Amherst",
    "Professional skills": ["Deep Learning", "Computer Vision", "Few Shot Learning"],
    "Organizations": ["Meta Reality Labs", "AirLab", "Robotics Institute at Carnegie Mellon University", "Indian School of Business Hyderabad"],
    "Hobbies": [],
    "Favorite activities": [],
    "Personal interests": [],
    "Daily routines": [],
    "Social media profiles": [],
    "Friends and family": [],
    "Favorite books, articles, and authors": [],
    "Favorite movies, shows, and genres": [],
    "Music preferences": [],
    "Preferred communication methods": ["Email"],
    "Writing style and tone": "Professional",
    "Short-term goals": ["Engage in research project

In [27]:
extracted_info, type(extracted_info)

('{\n    "Name": "Pranay",\n    "Email": "pranayr@umass.edu",\n    "Phone number": "Not found",\n    "Location": "University of Massachusetts Amherst",\n    "Professional skills": "Deep Learning, Computer Vision, Few Shot Learning",\n    "Organizations": ["Meta Reality Labs", "AirLab", "Robotics Institute at Carnegie Mellon University", "Indian School of Business Hyderabad"],\n    "Hobbies": "Not found",\n    "Favorite activities": "Researching, Collaborating on projects",\n    "Personal interests": "Exploring the crossroads of Deep Learning and Computer Vision",\n    "Daily routines": "Not found",\n    "Social media profiles": "Not shared",\n    "Friends and family": "Not found",\n    "Favorite books, articles, and authors": "Not found",\n    "Favorite movies, shows, and genres": "Not found",\n    "Music preferences": "Not found",\n    "Preferred communication methods": "Not found",\n    "Writing style and tone": "Academic and professional",\n    "Short-term goals": "Participating in 

In [26]:
json_info, type(json_info)

({'Name': 'Pranay',
  'Email': 'pranayr@umass.edu',
  'Phone number': 'Not found',
  'Location': 'University of Massachusetts Amherst',
  'Professional skills': 'Deep Learning, Computer Vision, Few Shot Learning',
  'Organizations': ['Meta Reality Labs',
   'AirLab',
   'Robotics Institute at Carnegie Mellon University',
   'Indian School of Business Hyderabad'],
  'Hobbies': 'Not found',
  'Favorite activities': 'Researching, Collaborating on projects',
  'Personal interests': 'Exploring the crossroads of Deep Learning and Computer Vision',
  'Daily routines': 'Not found',
  'Social media profiles': 'Not shared',
  'Friends and family': 'Not found',
  'Favorite books, articles, and authors': 'Not found',
  'Favorite movies, shows, and genres': 'Not found',
  'Music preferences': 'Not found',
  'Preferred communication methods': 'Not found',
  'Writing style and tone': 'Academic and professional',
  'Short-term goals': 'Participating in conferences and continuing research projects',
  

In [42]:
profile = {
    "Name": "",
    "Email": "",
    "Phone": "",
    "Location": "",
    "Professional skills": [],
    "Organizations": [],
    "Hobbies": [],
    "Favorite activities": [],
    "Personal interests": [],
    "Daily routines": {
        "Morning": "",
        "Evening": "",
        "Exercise and health": "",
    },
    "Social media profiles": [],
    "Friends and family": [],
    "Favorite books, articles, and authors": [],
    "Favorite movies, shows, and genres": [],
    "Music preferences": [],
    "Preferred communication methods": [],
    "Writing style and tone": "",
    "Short-term goals": [],
    "Long-term goals": [],
    # "Goals and aspirations": {
    #     "Short-term": [],
    #     "Long-term": [],
    # },
    "Personal projects": [],
    "Other miscellaneous information": []
}


def update_profile(profile, info):
    print(info.keys())
    print(profile.keys())
    for key, value in info.items():
        
        if key in profile:
            if isinstance(profile[key], list):
                profile[key].extend(value if isinstance(value, list) else [value])
            elif isinstance(profile[key], dict):
                profile[key].update(value)
            else:
                profile[key] = value
        else:
            print(f"Warning: Key '{key}' not found in profile structure. Skipping.")

# print("Original Profile,", profile)
update_profile(profile, json_info)

print("Updated Profile,", profile)


def build_user_profile(text_file):
    profile = {
    "Name": "",
    "Email": "",
    "Phone": "",
    "Location": "",
    "Professional skills": [],
    "Organizations": [],
    "Hobbies": [],
    "Favorite activities": [],
    "Personal interests": [],
    "Daily routines": {
        "Morning": "",
        "Evening": "",
        "Exercise and health": "",
    },
    "Social media profiles": [],
    "Friends and family": [],
    "Favorite books, articles, and authors": [],
    "Favorite movies, shows, and genres": [],
    "Music preferences": [],
    "Preferred communication methods": [],
    "Writing style and tone": "",
    "Goals and aspirations": {
        "Short-term": [],
        "Long-term": [],
    },
    "Personal projects": [],
}

    process_text(text_file, profile)
    
    # Remove duplicates in lists
    for key in profile.keys():
        if isinstance(profile[key], list):
            profile[key] = list(set(profile[key]))
    
    return profile


dict_keys(['Name', 'Email', 'Phone', 'Location', 'Professional skills', 'Organizations', 'Hobbies', 'Favorite activities', 'Personal interests', 'Daily routines', 'Social media profiles', 'Friends and family', 'Favorite books, articles, and authors', 'Favorite movies, shows, and genres', 'Music preferences', 'Preferred communication methods', 'Writing style and tone', 'Short-term goals', 'Long-term goals', 'Personal projects', 'Other miscellaneous information'])
dict_keys(['Name', 'Email', 'Phone', 'Location', 'Professional skills', 'Organizations', 'Hobbies', 'Favorite activities', 'Personal interests', 'Daily routines', 'Social media profiles', 'Friends and family', 'Favorite books, articles, and authors', 'Favorite movies, shows, and genres', 'Music preferences', 'Preferred communication methods', 'Writing style and tone', 'Short-term goals', 'Long-term goals', 'Personal projects', 'Other miscellaneous information'])
Updated Profile, {'Name': 'Pranay Reddy', 'Email': 'pranayr@umass.