In [138]:
import pandas as pd
from datetime import datetime

def parse_date(date_str):
    """Convert a raw date value into a ``datetime``, or ``None`` when it cannot be used.

    Strings are tried against the ``YYYY-MM-DD`` format first and the
    ``MM/DD/YYYY`` format second. Values that are already ``datetime``
    instances are returned unchanged instead of being passed to ``strptime``
    (which only accepts strings and would raise ``TypeError``).

    Args:
        date_str: The raw value to parse. Usually a string, but may also be
            ``None``, a float NaN, the literal string ``'NaN'``, or an
            already-parsed ``datetime``.

    Returns:
        A ``datetime`` on success; ``None`` for missing values, unrecognised
        string formats, and unsupported input types.
    """
    if isinstance(date_str, str):
        if date_str == 'NaN':
            return None  # Textual placeholder for a missing value
        for date_format in ('%Y-%m-%d', '%m/%d/%Y'):
            try:
                return datetime.strptime(date_str, date_format)
            except ValueError:
                continue  # Try the next known format
        return None  # No known format matched

    # Already-parsed dates pass through; pd.isna filters out NaT-like markers.
    if isinstance(date_str, datetime) and not pd.isna(date_str):
        return date_str

    # None, float NaN, and any other non-string type carry no usable date.
    return None

In [139]:
# TOTAL YEARS OF EXPERIENCE
# Function to calculate total years of experience
def calculate_years_of_experience(employments):
    """Sum the length, in years, of every employment record.

    Each record is read through ``parse_date``. Records whose start date is
    missing or unparseable are skipped, because no duration can be computed
    for them. A missing or unparseable end date is treated as "still
    employed" and measured up to the current time. Records whose end date is
    not after the start date contribute nothing, so inconsistent data cannot
    reduce the total.

    Args:
        employments: Iterable of mapping-like employment records that may
            carry ``"started_on"`` and ``"ended_on"`` values.

    Returns:
        Total years of experience, using 365.25 days per year. ``0`` when no
        record contributes.
    """
    total_years = 0
    now = datetime.now()  # Single reference time for all open-ended records
    for employment in employments:
        start_date = parse_date(employment.get("started_on"))
        if start_date is None:
            continue  # Cannot measure a job without a usable start date

        end_date = parse_date(employment.get("ended_on"))
        if end_date is None:
            end_date = now  # Missing/unparseable end date: assume ongoing

        if end_date > start_date:
            total_years += (end_date - start_date).days / 365.25
    return total_years

In [140]:
# AGGREGATE EXPERIENCE
from datetime import datetime, timedelta

def _revenue_or_zero(raw_revenue):
    """Return ``raw_revenue`` as a float, or ``0.0`` when it is missing or not numeric."""
    try:
        amount = float(raw_revenue)
    except (TypeError, ValueError):
        return 0.0  # None, "unknown", and other non-numeric values
    # NaN is the only float that is not equal to itself.
    return 0.0 if amount != amount else amount


def create_aggregate_experience(employments):
    """Compute total years of experience and average company revenue.

    Records without a usable start date are skipped entirely (neither their
    duration nor their revenue is accumulated). A missing or unparseable end
    date is replaced by the current time. Only positive durations are added.

    Revenue values that are missing, NaN, or not numeric count as 0 so that a
    single bad value cannot turn the average into NaN or raise a
    ``TypeError``. The average is taken over *all* records in
    ``employments``, including skipped ones.

    Args:
        employments: Sequence of dict-like employment records that may carry
            ``"started_on"``, ``"ended_on"`` and
            ``"company_revenue_amount_usd"`` values.

    Returns:
        A ``(total_years, average_revenue)`` tuple. ``average_revenue`` is
        ``0`` when ``employments`` is empty.
    """
    total_years = 0
    total_revenue = 0
    now = datetime.now()  # Single reference time for all open-ended records

    for employment in employments:
        start_date = parse_date(employment.get("started_on"))
        if start_date is None:
            continue  # Skip this employment record if start_date is None

        # parse_date yields None for missing, "NaN", and unparseable values;
        # all of those are treated as a job that is still ongoing.
        end_date = parse_date(employment.get("ended_on"))
        if end_date is None:
            end_date = now

        # Calculate years of experience
        if end_date > start_date:
            total_years += (end_date - start_date).days / 365.25

        # Accumulate revenue
        total_revenue += _revenue_or_zero(employment.get("company_revenue_amount_usd", 0))

    # Calculate average revenue
    average_revenue = total_revenue / len(employments) if employments else 0

    return total_years, average_revenue

In [141]:
# EDUCATIONAL SUMMARY 
def generate_education_summary(educations, name):
    """Build one sentence per complete education record for ``name``.

    A record is only described when its institution, subject, degree and end
    date are all present; a field counts as missing when it is absent, null
    (as judged by ``pd.isna``), or the literal string ``"NaN"``. The degree
    must be present but is not included in the sentence.

    Args:
        educations: List of dict-like education records. Anything that is not
            a non-empty list is treated as "no education history".
        name: The person's display name used in the sentences.

    Returns:
        The sentences joined by single spaces. When ``educations`` is not a
        non-empty list, a fixed "no recorded education history" sentence.
        When every record is incomplete, an empty string.
    """
    if not isinstance(educations, list) or not educations:
        return f"{name} has no recorded education history."

    education_summaries = []
    for edu in educations:
        institution = edu.get("institution_name")
        subject = edu.get("subject")
        degree = edu.get("degree")
        end_date = edu.get("ended_on")

        # Skip records with any missing field so "nan"/"None" never leaks into the text.
        required_fields = (institution, subject, degree, end_date)
        if any(pd.isna(value) or value == "NaN" for value in required_fields):
            continue

        education_summaries.append(
            f"{name} studied {subject} at {institution} and graduated in {end_date}."
        )

    return " ".join(education_summaries)


In [142]:
# EMPLOYMENT SUMMARY 
def generate_employment_summary(employments, name):
    """Build one sentence per complete employment record for ``name``.

    A record is only described when its seniority level, title, company name,
    headcount and revenue are all present; a field counts as missing when it
    is absent, null (as judged by ``pd.isna``), or the literal string
    ``"NaN"``. A job whose ``"ended_on"`` value is missing in any of those
    ways is phrased as current ("is currently working"); otherwise it is
    phrased as past ("has worked").

    Args:
        employments: List (or tuple) of dict-like employment records.
            Anything else, including a NaN placeholder, is treated as "no
            work experience".
        name: The person's display name used in the sentences.

    Returns:
        The sentences joined by single spaces. When ``employments`` is not a
        non-empty list/tuple, a fixed "no recorded work experience" sentence.
        When every record is incomplete, an empty string.
    """
    if not isinstance(employments, (list, tuple)) or not employments:
        return f"{name} has no recorded work experience."

    employment_summaries = []
    for emp in employments:
        seniority = emp.get("seniority_level")
        title = emp.get("title")
        company_name = emp.get("company_name")
        headcount = emp.get("company_headcount")
        revenue = emp.get("company_revenue_amount_usd")

        # Skip records with any missing field so "nan"/"None" never leaks into the text.
        required_fields = (seniority, title, company_name, headcount, revenue)
        if any(pd.isna(value) or value == "NaN" for value in required_fields):
            continue

        # A missing end date (absent, null, or "NaN") means the job is ongoing.
        ended_on = emp.get("ended_on")
        is_current = pd.isna(ended_on) or ended_on == "NaN"
        started = "is currently working" if is_current else "has worked"

        # Handle revenue formatting: numeric values and numeric strings get
        # thousands separators; anything else is reported as unknown.
        try:
            revenue = f"{float(revenue):,.2f}"
        except (TypeError, ValueError):
            revenue = "unknown"

        summary = (f"{name} {started} as a {seniority} {title} at {company_name}, "
                   f" with {headcount} employees and "
                   f"a revenue of ${revenue}. "
        )
        employment_summaries.append(summary)

    return " ".join(employment_summaries)


In [143]:
def generate_aggregate_summary(employments, name):
    """Describe a person's overall career in a single sentence.

    Totals come from ``create_aggregate_experience``. Only records that have
    both a title and a company name (neither null nor the string ``"NaN"``)
    are listed in the sentence.

    Args:
        employments: List of dict-like employment records.
        name: The person's display name used in the sentence.

    Returns:
        The career sentence; a "no recorded employment history" sentence when
        ``employments`` is not a non-empty list; or a "no valid recorded
        employment history" sentence when no record has both a title and a
        company name.
    """
    has_history = isinstance(employments, list) and bool(employments)
    if not has_history:
        return f"{name} has no recorded employment history."

    years_total, revenue_mean = create_aggregate_experience(employments)

    # Collect (title, company) pairs so both lists always stay in step.
    role_pairs = []
    for job in employments:
        role = job.get("title")
        employer = job.get("company_name")
        role_is_missing = pd.isna(role) or role == "NaN"
        if role_is_missing or pd.isna(employer) or employer == "NaN":
            continue
        role_pairs.append((role, employer))

    if not role_pairs:
        return f"{name} has no valid recorded employment history."

    titles_text = ", ".join(role for role, _ in role_pairs)
    companies_text = ", ".join(employer for _, employer in role_pairs)

    return (
        f"{name} has over {years_total:.1f} years of experience "
        f"across the following job titles: {titles_text} at companies: {companies_text} "
        f"with an average revenue size of ${revenue_mean:,.2f}."
    )


In [144]:
def generate_summary_content(record):
    """Assemble the three summary texts for one person record.

    Args:
        record: Dict-like person record. ``"name"`` is required;
            ``"employments"`` and ``"education"`` default to empty lists.

    Returns:
        A dict with the keys ``"employment_summary"``, ``"education_summary"``
        and ``"aggregate_summary"``. A value is ``None`` when the data it is
        built from is falsy.
    """
    person = record["name"]
    jobs = record.get("employments", [])
    schooling = record.get("education", [])

    # Every section starts as None and is filled in only when its data is truthy.
    content = {
        "employment_summary": None,
        "education_summary": None,
        "aggregate_summary": None,
    }

    if jobs:
        content["employment_summary"] = generate_employment_summary(jobs, person)
    if schooling:
        content["education_summary"] = generate_education_summary(schooling, person)
    if jobs:
        content["aggregate_summary"] = generate_aggregate_summary(jobs, person)

    return content

In [145]:
import re

In [146]:
# Sanitize file name by removing or replacing invalid characters
def sanitize_filename(filename):
    """Replace characters that are invalid in file names with underscores.

    The replaced characters are ``\\ / : * ? " < > |``. The backslash is
    written as ``\\\\`` inside the character class so that the backslash
    itself is matched; a single backslash before ``/`` would only escape the
    slash and leave backslashes in the name, where they act as path
    separators on Windows.

    Args:
        filename: The proposed file name (not a full path).

    Returns:
        ``filename`` with each invalid character replaced by ``_``.
    """
    return re.sub(r'[\\/:*?"<>|]', '_', filename)

In [147]:
import os  # Import the os module for file operations

# Generate each person's aggregate, education, and employment summary paragraphs.
def main(input_path='input_people_data_02.json', output_directory='output_files'):
    """Write one text summary file per person found in a JSON-lines file.

    For every row, the employment, education and aggregate summaries are
    produced by ``generate_summary_content`` and written to
    ``output_summary_<person_id>_<name>.txt`` inside ``output_directory``.
    Sections whose summary is empty or ``None`` are omitted; every file ends
    with a 200-character dashed separator line.

    Args:
        input_path: Path to the line-delimited JSON input file.
        output_directory: Directory that receives the generated files; it is
            created when it does not exist.
    """
    # Load JSON data into a pandas DataFrame
    df = pd.read_json(input_path, lines=True)

    # Create a directory to store output files (if it doesn't exist)
    os.makedirs(output_directory, exist_ok=True)

    # Section headings paired with the summary_content key that feeds them,
    # in the order they appear in the output file.
    sections = (
        ("Employment Summary", "employment_summary"),
        ("Education Summary", "education_summary"),
        ("Aggregate Summary", "aggregate_summary"),
    )

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        record = row.to_dict()
        summary_content = generate_summary_content(record)

        # Generate the file name based on the person's name and ID
        person_id = record["person_id"]
        name = record["name"].replace(" ", "_")
        sanitized_name = sanitize_filename(name)
        output_txt_file = os.path.join(output_directory, f'output_summary_{person_id}_{sanitized_name}.txt')

        # Format each section as a full paragraph, skipping if None or empty
        combined_summary = ""
        for heading, key in sections:
            section_text = summary_content.get(key)
            if section_text:
                combined_summary += heading + ":\n" + section_text + "\n"
        combined_summary += "-"*200 + "\n\n"

        # Save to individual TXT files
        with open(output_txt_file, 'w', encoding='utf-8') as file:
            file.write(combined_summary)

        print(f"Generated text summary saved to {output_txt_file}")

# Run the summary generation only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


Generated text summary saved to output_files\output_summary_949256266_Vivian_Weng.txt
Generated text summary saved to output_files\output_summary_902184389_Hendrik_Bourgeois.txt
Generated text summary saved to output_files\output_summary_48171849_Courtney_Turner.txt
Generated text summary saved to output_files\output_summary_74707094_Katie_Richter.txt
Generated text summary saved to output_files\output_summary_1042153985_Victor_Merced-Felix.txt
Generated text summary saved to output_files\output_summary_787025923_Lynda_Zhang.txt
Generated text summary saved to output_files\output_summary_715859477_Tamara_Vranjes.txt
Generated text summary saved to output_files\output_summary_51432999_Juan_Sanchez.txt
Generated text summary saved to output_files\output_summary_71726687_Katherine_Hicks.txt
Generated text summary saved to output_files\output_summary_909645082_Lucas_Turrado.txt
Generated text summary saved to output_files\output_summary_20891932_Cynthia_Collins.txt
Generated text summary s