<a href="https://colab.research.google.com/github/rahulamatapu/Community-Perspectives/blob/master/biased_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install langchain langchain_experimental openai
!pip install -U langchain-openai

from langchain_openai import ChatOpenAI
# set environment variables
# https://platform.openai.com/account/api-keys
import os
os.environ["OPENAI_API_KEY"] = ""


from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX

In [None]:
import random
import itertools
from pydantic import BaseModel, Field
from pydantic import ConfigDict
from typing import Optional
from typing import List
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_openai import ChatOpenAI

In [None]:
from typing import List, Dict, Tuple
from pydantic.v1 import BaseModel
# Scenario Information


# Persona Information
class PersonaInformation(BaseModel):
    # Schema for one synthetic persona; used as the output schema of the
    # persona generator and later flattened into each response row.
    persona_id: str  # overwritten with "P001"-style IDs in generate_personas
    community: str  # a key of community_contexts, e.g. "Community 1"
    age: int
    occupation: str
    household_composition: str
    disaster_experience: str
    socioeconomic_status: str
    # Text built by generate_community_description, with an
    # " Additional trait: <trait>." suffix appended by generate_personas.
    community_description: str


    # Pydantic model configuration for PersonaInformation.
    class Config:
        arbitrary_types_allowed = True

class ResponseData(BaseModel):
    # Schema for one persona's answer to one pairwise repair question.
    response_id: str  # "R001"-style sequential ID
    scenario_id: str  # the visible pipeline always uses "S001"
    persona_id: str
    question_id: str
    option1: str  # first option, in the order the question presented it
    option2: str  # second option, in the order the question presented it
    priority_choice: str  # must equal option1 or option2 (see validate_response)
    reasoning: str
    ordered_pair: List[str]  # [chosen option, other option]

# Scenario Context
'''
class ScenarioContext(BaseModel):
    scenario_id: str
    description: str
    model_config = ConfigDict(arbitrary_types_allowed=True)
'''
# Community Context: static, community-level attributes that are rendered into
# each persona's community description.
class CommunityContext(BaseModel):
    community_id: str
    social_vulnerability_score: int  # rendered as "<score>/10" in descriptions
    access_to_resources: str  # "Limited", "Moderate" or "Good" in the data below
    population: int

# Question: a pairwise "which should be repaired first" prompt. Questions are
# generated separately below by generate_all_possible_questions().
class Question(BaseModel):
    question_id: str  # "Q001"-style sequential ID
    option1: str  # "Repair <infrastructure> in <community>"
    option2: str  # "Repair <infrastructure> in <community>"
    text: str  # full question sentence shown to the LLM

In [None]:
# Static context for each community, keyed by the community's display name.
# The same keys are used in infrastructure_status.
community_contexts = {
    "Community 1": CommunityContext(
        community_id="C001",
        social_vulnerability_score=7,
        access_to_resources="Limited",
        population=5000
    ),
    "Community 2": CommunityContext(
        community_id="C002",
        social_vulnerability_score=4,
        access_to_resources="Moderate",
        population=12000
    ),
    "Community 3": CommunityContext(
        community_id="C003",
        social_vulnerability_score=2,
        access_to_resources="Good",
        population=20000
    )
}

# Per-community infrastructure state: 1 = working, 0 = damaged.
# Only the damaged (0) items become repair options in
# generate_all_possible_questions().
infrastructure_status = {
    "Community 1": {
        "Power School": 1, "Water School": 0, "Power House": 1, "Water House": 1,
        "Power Residential": 0, "Water Residential": 0, "Power Commercial": 1, "Water Commercial": 1
    },
    "Community 2": {
        "Power School": 1, "Water School": 1, "Power House": 1, "Water House": 1,
        "Power Residential": 0, "Water Residential": 0, "Power Commercial": 1, "Water Commercial": 0
    },
    "Community 3": {
        "Power School": 1, "Water School": 0, "Power House": 1, "Water House": 1,
        "Power Residential": 0, "Water Residential": 0, "Power Commercial": 1, "Water Commercial": 1
    }
}

In [None]:
def generate_community_description(community, status, context):
    """Render a community's infrastructure state and context as one line of text.

    Args:
        community: Display name of the community, e.g. "Community 1".
        status: Mapping of infrastructure name to a flag (1 = working,
            0 = damaged). Any other flag value is left out of both lists.
        context: Object exposing ``population``,
            ``social_vulnerability_score`` and ``access_to_resources``.

    Returns:
        A single string of "; "-separated "- Label: value" segments prefixed
        with "In <community>: " and terminated by a period.
    """
    working_names = []
    damaged_names = []
    for infra_name, flag in status.items():
        if flag == 1:
            working_names.append(infra_name)
        elif flag == 0:
            damaged_names.append(infra_name)

    segments = [
        f"- Working infrastructure: {', '.join(working_names)}",
        f"- Damaged infrastructure: {', '.join(damaged_names)}",
        f"- Population: {context.population}",
        f"- Social vulnerability score: {context.social_vulnerability_score}/10",
        f"- Access to resources: {context.access_to_resources}.",
    ]
    return f"In {community}: " + "; ".join(segments)


def generate_all_possible_questions():
    """Build one pairwise repair question for every pair of damaged items.

    Collects every (community, infrastructure) entry whose status flag in the
    module-level ``infrastructure_status`` is 0, then creates a ``Question``
    for each unordered pair of those entries, in the order produced by
    ``itertools.combinations``.

    Returns:
        A list of ``Question`` objects with IDs "Q001", "Q002", ...
    """
    damaged_items = [
        (community_name, infra_name)
        for community_name, community_status in infrastructure_status.items()
        for infra_name, is_working in community_status.items()
        if is_working == 0
    ]

    question_list = []
    numbered_pairs = enumerate(itertools.combinations(damaged_items, 2), start=1)
    for number, (first, second) in numbered_pairs:
        first_community, first_infra = first
        second_community, second_infra = second
        question_list.append(
            Question(
                question_id=f"Q{number:03d}",
                option1=f"Repair {first_infra} in {first_community}",
                option2=f"Repair {second_infra} in {second_community}",
                text=f"Which should be repaired first: {first_infra} in {first_community} or {second_infra} in {second_community}?",
            )
        )

    return question_list

def validate_response(response: ResponseData, persona: PersonaInformation, question: Question) -> Tuple[bool, str]:
    """Check that a response is consistent with its question and persona.

    The checks run in order and the first failure is reported:
      1. the priority choice is one of the question's two options;
      2. the ordered pair contains exactly the two options;
      3. the persona's community appears in the reasoning (case-sensitive);
      4. the persona's occupation appears in the reasoning (case-insensitive).

    Returns:
        ``(True, "Valid response")`` on success, otherwise ``(False, message)``
        where ``message`` names the failed check.
    """
    options = (question.option1, question.option2)

    if response.priority_choice not in options:
        verdict = (False, f"Invalid priority choice: {response.priority_choice}")
    elif set(response.ordered_pair) != set(options):
        verdict = (False, f"Invalid ordered pair: {response.ordered_pair}")
    elif persona.community not in response.reasoning:
        verdict = (False, f"Community {persona.community} not mentioned in reasoning")
    elif persona.occupation.lower() not in response.reasoning.lower():
        verdict = (False, f"Occupation {persona.occupation} not mentioned in reasoning")
    else:
        verdict = (True, "Valid response")

    return verdict

In [None]:
# Rebinds OPENAI_TEMPLATE (also imported from langchain_experimental at the top
# of the notebook) to a pass-through template that renders each few-shot
# example verbatim.
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

# Few-shot examples for persona generation: one per community, written as a
# comma-separated "Field: value" listing.
persona_examples = [
    {
        "example": """Persona ID: P001, Community: Community 1, Age: 35, Occupation: Teacher,
        Household Composition: Married with children, Disaster Experience: Experienced a flood 5 years ago,
        Socioeconomic Status: Middle class, Community Description: Population 5000, Social vulnerability score 7/10,
        Limited access to resources"""
    },
    {
        "example": """Persona ID: P002, Community: Community 2, Age: 52, Occupation: Nurse,
        Household Composition: Single, Disaster Experience: No major disaster experience,
        Socioeconomic Status: Middle class, Community Description: Population 12000, Social vulnerability score 4/10,
        Moderate access to resources"""
    },
    {
        "example": """Persona ID: P003, Community: Community 3, Age: 68, Occupation: Retired,
        Household Composition: Married, Disaster Experience: Survived a hurricane last year,
        Socioeconomic Status: Upper middle class, Community Description: Population 20000, Social vulnerability score 2/10,
        Good access to resources"""
    },
]

# Few-shot examples for response generation. Each example lists the two
# options, the chosen option, first-person reasoning that names the persona's
# occupation and community, and the ordered pair with the chosen option first.
# NOTE(review): some options here (e.g. "Power School in Community 3") are
# marked as working in infrastructure_status, so they would not be produced by
# generate_all_possible_questions() — confirm whether that is intended.
response_examples = [
    {
        "example": """Response ID: R001, Scenario ID: S001, Persona ID: P001, Question ID: Q001,
        Option 1: Repair Water School in Community 1, Option 2: Repair Power Residential in Community 2,
        Priority Choice: Repair Water School in Community 1,
        Reasoning: As a teacher in Community 1, I believe access to clean water in schools is crucial for maintaining hygiene and preventing the spread of diseases, especially after a disaster. Our community has limited resources and a high social vulnerability score, so ensuring our schools are operational is a top priority for education continuity and community resilience.
        Ordered Pair: ["Repair Water School in Community 1", "Repair Power Residential in Community 2"]"""
    },
    {
        "example": """Response ID: R002, Scenario ID: S001, Persona ID: P002, Question ID: Q002,
        Option 1: Repair Power Residential in Community 2, Option 2: Repair Water Residential in Community 3,
        Priority Choice: Repair Power Residential in Community 2,
        Reasoning: As a nurse in Community 2, I understand the critical importance of electrical power for medical equipment and maintaining communication during emergencies. Our community has moderate access to resources, and restoring power to residential areas will help more people in our larger population of 12,000 to manage their health needs at home, potentially reducing strain on medical facilities.
        Ordered Pair: ["Repair Power Residential in Community 2", "Repair Water Residential in Community 3"]"""
    },
    {
        "example": """Response ID: R003, Scenario ID: S001, Persona ID: P003, Question ID: Q003,
        Option 1: Repair Water Residential in Community 1, Option 2: Repair Power School in Community 3,
        Priority Choice: Repair Water Residential in Community 1,
        Reasoning: As a retired person living in Community 3, I recognize the dire need for water in residential areas of Community 1. Although I'm from a community with good access to resources, I believe we should prioritize the basic needs of the most vulnerable. Community 1 has a higher social vulnerability score and limited access to resources, making clean water for their homes a critical priority for health and sanitation.
        Ordered Pair: ["Repair Water Residential in Community 1", "Repair Power School in Community 3"]"""
    },
    {
        "example": """Response ID: R004, Scenario ID: S001, Persona ID: P004, Question ID: Q004,
        Option 1: Repair Power Commercial in Community 2, Option 2: Repair Water School in Community 3,
        Priority Choice: Repair Power Commercial in Community 2,
        Reasoning: As a small business owner in Community 2, I believe restoring power to commercial areas is crucial for economic recovery after a disaster. Our community has a moderate population of 12,000 and restoring commercial power will help businesses reopen, provide essential services, and support the local economy. This can indirectly benefit the entire region by maintaining supply chains and employment.
        Ordered Pair: ["Repair Power Commercial in Community 2", "Repair Water School in Community 3"]"""
    },
    {
        "example": """Response ID: R005, Scenario ID: S001, Persona ID: P005, Question ID: Q005,
        Option 1: Repair Water Commercial in Community 1, Option 2: Repair Power Residential in Community 3,
        Priority Choice: Repair Water Commercial in Community 1,
        Reasoning: As a public health official in Community 1, I prioritize repairing the water supply to commercial areas in our community. Given our high social vulnerability score of 7/10 and limited access to resources, restoring water to commercial areas will allow essential businesses like pharmacies and food stores to operate, supporting the basic needs of our 5,000 residents. This is critical for preventing secondary health crises in our vulnerable population.
        Ordered Pair: ["Repair Water Commercial in Community 1", "Repair Power Residential in Community 3"]"""
    }
]

# Create prompt templates
# Pass-through template used to render each few-shot example verbatim
# (same definition as the assignment earlier in this cell).
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

# Persona prompt: the library's few-shot prefix plus diversity instructions,
# followed by the persona examples and the library's suffix. The caller
# supplies "subject" and "extra".
persona_prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX + """
    Generate diverse personas for a disaster scenario study. Ensure variety in:
    - Age: Include a range from teenagers to elderly (16-80+)
    - Occupations: Vary widely, including unemployed and students
    - Household compositions: Include single parents, multigenerational households, etc.
    - Disaster experiences: Some with no experience, others with various past experiences
    - Socioeconomic statuses: Ensure representation across all economic levels
    - Physical abilities: Include some personas with disabilities
    - Cultural backgrounds: Represent diverse ethnicities and cultural practices
    - Length of residency in the community: Mix of newcomers and long-time residents
    - Special circumstances: e.g., caretakers, pregnant, etc.

    Each persona should be unique and realistic.
    """,
    examples=persona_examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

# Response prompt: the library's prefix and suffix around the response
# examples, with extra "persona" and "question" variables appended to the suffix.
response_prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=response_examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX + "\n\nPersona: {persona}\nQuestion: {question}\nGenerate a response:",
    input_variables=["subject", "extra", "persona", "question"],
    example_prompt=OPENAI_TEMPLATE,
)

# Create generators
# Persona generator: higher temperature (0.9) than the response generator.
persona_generator = create_openai_data_generator(
    output_schema=PersonaInformation,
    llm=ChatOpenAI(temperature=0.9),
    prompt=persona_prompt_template,
)

# NOTE(review): response_generator is not called in the visible cells;
# responses are produced by invoking the chat model directly in
# generate_responses_for_personas — confirm whether this is still needed.
response_generator = create_openai_data_generator(
    output_schema=ResponseData,
    llm=ChatOpenAI(temperature=0.7),
    prompt=response_prompt_template,
)

## Persona Generation

In [None]:
import random

def generate_personas(num_personas: int) -> List[PersonaInformation]:
    """Generate synthetic personas with the LLM, then normalise them.

    The LLM is asked (via the module-level ``persona_generator``) for
    ``num_personas`` personas. Each returned persona is then post-processed:
    its ID becomes "P001", "P002", ...; its community is assigned round-robin
    over ``community_contexts``; its community description is regenerated from
    ``infrastructure_status`` / ``community_contexts`` with a randomly chosen
    " Additional trait: <trait>." suffix; and a random occupation is filled in
    if the LLM left it empty.

    Args:
        num_personas: Number of personas to request and keep.

    Returns:
        The post-processed personas (at most ``num_personas`` of them).
    """
    community_names = list(community_contexts.keys())
    # The prompt asks for at least one persona from this community.
    anchor_community = community_names[0]

    def describe(name):
        return generate_community_description(name, infrastructure_status[name], community_contexts[name])

    # Descriptions depend only on static module data, so build each one once.
    descriptions = {anchor_community: describe(anchor_community)}
    anchor_description = descriptions[anchor_community]

    occupation_weights = dict.fromkeys(
        [
            "Healthcare Worker", "Teacher", "First Responder",
            "Local Government Employee", "Small Business Owner",
            "Factory Worker", "Farmer", "Retired",
            "Unemployed", "Student", "Software Developer", "Lawyer", "Nurse",
            "Construction Worker", "Social Worker", "Accountant",
        ],
        2,
    )

    household_options = [
        "Single", "Married without children", "Married with young children",
        "Single parent", "Empty nester", "Multigenerational household",
        "Cohabiting partners", "Living with roommates",
    ]

    experience_options = [
        "No major disaster experience",
        "Survived a flood 3 years ago",
        "Experienced a severe hurricane last year",
        "Lived through an earthquake 5 years ago",
        "Dealt with a major wildfire 2 years ago",
        "Multiple experiences with various natural disasters",
        "Volunteered in disaster relief efforts",
        "Lost home in a tornado 4 years ago",
    ]

    status_options = ["Lower class", "Lower middle class", "Middle class", "Upper middle class", "Upper class"]

    trait_options = [
        "Recent immigrant", "Long-time community leader", "Person with a physical disability",
        "Caregiver for elderly parent", "LGBTQ+ community member", "Veteran",
        "Climate change activist", "Technology enthusiast", "Local sports coach",
    ]

    # Fallback pool for personas that come back without an occupation: each
    # occupation is repeated according to its weight.
    fallback_occupations = []
    for occupation_name, weight in occupation_weights.items():
        fallback_occupations.extend([occupation_name] * weight)

    occupation_text = ', '.join(occupation_weights)
    household_text = ', '.join(household_options)
    experience_text = ', '.join(experience_options)
    status_text = ', '.join(status_options)
    trait_text = ', '.join(trait_options)

    extra_prompt = f"""Create {num_personas} unique personas with diverse characteristics including:
    - Ages ranging from 18 to 85+
    - Occupations from this list (some may appear more often): {occupation_text}
    - Household compositions from this list: {household_text}
    - Disaster experiences from this list: {experience_text}
    - Socioeconomic statuses from this list: {status_text}
    - Additional traits (include one for each persona) from this list: {trait_text}
    At least one persona should be from {anchor_community}. Community description for {anchor_community}: {anchor_description}.
    Ensure a diverse mix of characteristics across all personas, with a focus on how these characteristics might influence disaster preparedness and response."""

    generated = persona_generator.generate(
        subject="storm_scenario_persona",
        extra=extra_prompt,
        runs=num_personas,
    )
    personas = generated[:num_personas]

    print(f"Generated {len(personas)} personas.")

    # Overwrite the LLM-chosen IDs and communities so they are deterministic.
    for index, persona in enumerate(personas):
        persona.persona_id = f"P{index + 1:03d}"

        assigned = community_names[index % len(community_names)]
        if assigned not in descriptions:
            descriptions[assigned] = describe(assigned)
        persona.community = assigned
        persona.community_description = descriptions[assigned]

        # The trait is carried inside the description text; downstream cells
        # parse it back out by its "Additional trait:" label.
        if "Additional trait:" not in persona.community_description:
            chosen_trait = random.choice(trait_options)
            persona.community_description += f" Additional trait: {chosen_trait}."

        # Fill in an occupation when the LLM did not provide one.
        if not getattr(persona, 'occupation', None):
            persona.occupation = random.choice(fallback_occupations)

    return personas

# Generate personas (this calls the OpenAI API via persona_generator)
num_personas_to_generate = 10
synthetic_personas = generate_personas(num_personas_to_generate)

# Print generated personas as indented JSON, one object per persona
for persona in synthetic_personas:
    print(persona.json(indent=2))

Generated 10 personas.
{
  "persona_id": "P001",
  "community": "Community 1",
  "age": 35,
  "occupation": "Teacher",
  "household_composition": "Married with children",
  "disaster_experience": "Experienced a flood 5 years ago",
  "socioeconomic_status": "Middle class",
  "community_description": "In Community 1: - Working infrastructure: Power School, Power House, Water House, Power Commercial, Water Commercial; - Damaged infrastructure: Water School, Power Residential, Water Residential; - Population: 5000; - Social vulnerability score: 7/10; - Access to resources: Limited. Additional trait: Technology enthusiast."
}
{
  "persona_id": "P002",
  "community": "Community 2",
  "age": 22,
  "occupation": "Student",
  "household_composition": "Living with roommates",
  "disaster_experience": "No major disaster experience",
  "socioeconomic_status": "Lower class",
  "community_description": "In Community 2: - Working infrastructure: Power School, Water School, Power House, Water House, P

## Response Generation

In [None]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage
import random
from typing import List, Optional
import pandas as pd

# Initialize the language model used directly by generate_responses_for_personas
llm = ChatOpenAI(temperature=0.7)

# Default number of questions sampled (without replacement) for each persona
RESPONSES_PER_PERSONA = 3

def _parse_choice_and_reasoning(response_text: str) -> Tuple[Optional[str], Optional[str]]:
    """Extract the "Priority Choice" and "Reasoning" fields from an LLM reply.

    The reply is scanned line by line (leading/trailing whitespace ignored):
      * the first line starting with "Priority Choice:" supplies the choice;
        surrounding brackets or quotes are removed because the prompt's
        format hint is written as "[Your chosen option]";
      * the first line starting with "Reasoning:" starts the reasoning, and
        every following line is appended, so multi-paragraph reasoning is
        kept in full. Paragraphs are joined with single spaces so the value
        stays on one line in the CSV output.

    Returns:
        ``(priority_choice, reasoning)``; either element is ``None`` when the
        field is missing or empty.
    """
    choice_label = "Priority Choice:"
    reasoning_label = "Reasoning:"

    priority_choice = None
    reasoning_parts = None  # stays None until the "Reasoning:" label is seen
    collecting_reasoning = False

    for raw_line in response_text.splitlines():
        line = raw_line.strip()
        if priority_choice is None and line.startswith(choice_label):
            value = line[len(choice_label):].strip().strip("[]\"'").strip()
            priority_choice = value or None
            # A choice line that follows the reasoning ends the reasoning.
            collecting_reasoning = False
        elif reasoning_parts is None and line.startswith(reasoning_label):
            reasoning_parts = [line[len(reasoning_label):].strip()]
            collecting_reasoning = True
        elif collecting_reasoning:
            reasoning_parts.append(line)

    reasoning = None
    if reasoning_parts is not None:
        reasoning = " ".join(part for part in reasoning_parts if part) or None

    return priority_choice, reasoning


def generate_responses_for_personas(
    personas: List[PersonaInformation],
    questions: List[Question],
    responses_per_persona: int = RESPONSES_PER_PERSONA,
    max_attempts: int = 5
) -> List[dict]:
    """Ask the LLM to answer sampled questions in the voice of each persona.

    For every persona, ``responses_per_persona`` questions are sampled without
    replacement (fewer if there are not enough questions). Each question is
    sent to the module-level ``llm`` up to ``max_attempts`` times until the
    reply parses into a valid priority choice and reasoning and passes
    ``validate_response``. Questions that never yield a valid reply are
    skipped after a message is printed.

    Args:
        personas: Personas to answer as.
        questions: Pool of pairwise questions to sample from.
        responses_per_persona: Number of questions to sample per persona.
        max_attempts: Maximum LLM calls per (persona, question) pair.

    Returns:
        One dict per valid response containing every ``ResponseData`` field
        plus the persona's community, age, occupation, household composition,
        disaster experience, socioeconomic status and community description.
    """
    # Built once: the template does not depend on the persona or question.
    # The indentation inside the literal is part of the prompt text sent to
    # the model and is kept exactly as it was.
    prompt = PromptTemplate(
        input_variables=["persona", "question", "option1", "option2", "community_description"],
        template="""You are a {persona}. Please answer the following question: '{question}'
                        You must choose between '{option1}' and '{option2}'.
                        Provide a detailed reasoning specific to these options and your background.
                        Your response MUST include both a priority choice and a reasoning.

                        Community description: {community_description}

                        Respond in the following format:
                        Priority Choice: [Your chosen option]
                        Reasoning: [Your detailed reasoning]"""
    )

    responses = []
    response_counter = 1

    for persona in personas:
        print(f"Generating responses for Persona ID: {persona.persona_id}, Occupation: {persona.occupation}")
        selected_questions = random.sample(questions, min(responses_per_persona, len(questions)))

        for question in selected_questions:
            print(f"\nProcessing question: {question.text}")
            valid_response = None

            # Every pass through this loop counts as one attempt, whichever
            # way it ends (invalid choice, missing reasoning, failed
            # validation, or an error from the API call).
            for _ in range(max_attempts):
                try:
                    formatted_prompt = prompt.format(
                        persona=f"{persona.occupation} from {persona.community}",
                        question=question.text,
                        option1=question.option1,
                        option2=question.option2,
                        community_description=persona.community_description
                    )

                    response = llm.invoke([HumanMessage(content=formatted_prompt)])

                    # Parse the response
                    response_text = response.content
                    print(f"Response: {response_text}")
                    priority_choice, reasoning = _parse_choice_and_reasoning(response_text)

                    if priority_choice not in [question.option1, question.option2]:
                        print(f"Invalid priority choice: {priority_choice}. Retrying.")
                        continue

                    if not reasoning:
                        print("No reasoning provided. Retrying.")
                        continue

                    # Chosen option first, the rejected option second.
                    ordered_pair = [
                        priority_choice,
                        question.option2 if priority_choice == question.option1 else question.option1
                    ]

                    response_data = ResponseData(
                        response_id=f"R{response_counter:03d}",
                        scenario_id="S001",
                        persona_id=persona.persona_id,
                        question_id=question.question_id,
                        option1=question.option1,
                        option2=question.option2,
                        priority_choice=priority_choice,
                        reasoning=reasoning,
                        ordered_pair=ordered_pair
                    )

                    validation_result, validation_message = validate_response(response_data, persona, question)
                    if not validation_result:
                        print(f"Response validation failed: {validation_message}. Retrying...")
                        continue

                    valid_response = {
                        **response_data.dict(),
                        'community': persona.community,
                        'age': persona.age,
                        'occupation': persona.occupation,
                        'household_composition': persona.household_composition,
                        'disaster_experience': persona.disaster_experience,
                        'socioeconomic_status': persona.socioeconomic_status,
                        'community_description': persona.community_description
                    }
                except Exception as e:
                    # Best-effort boundary: an API or parsing failure costs
                    # one attempt instead of aborting the whole run.
                    print(f"Error generating response: {str(e)}")
                    continue

                responses.append(valid_response)
                response_counter += 1
                print("Valid response generated.")
                break

            if valid_response is None:
                print(f"Failed to generate valid response after {max_attempts} attempts")

    return responses

# Generate personas
# This replaces the personas generated in the earlier cell with a fresh,
# smaller batch.
num_personas_to_generate = 3
print("\n--- Generating Personas ---")
synthetic_personas = generate_personas(num_personas_to_generate)

print("\n--- Summary of Generated Personas ---")
for persona in synthetic_personas:
    print(f"Persona ID: {persona.persona_id}, Occupation: {persona.occupation}, Community: {persona.community}")

# Generate questions
questions = generate_all_possible_questions()

# Generate responses
responses = generate_responses_for_personas(synthetic_personas, questions)

# Create DataFrame for responses (which now include persona information)
responses_df = pd.DataFrame(responses)

# Save to a single CSV file
responses_df.to_csv('personas_and_responses.csv', index=False)
print("\nPersona and Response information has been written to personas_and_responses.csv")

# Print results
# NOTE(review): if no valid responses were generated, responses_df has no
# columns and the column selection below will raise — confirm this is acceptable.
print(f"\nGenerated {len(responses)} total responses.")
print(responses_df[['response_id', 'persona_id', 'community', 'occupation', 'priority_choice']])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Processing question: Which should be repaired first: Power Residential in Community 1 or Water Commercial in Community 2?
Response: Priority Choice: Repair Water Commercial in Community 2

Reasoning: As a Healthcare Worker from Community 2 with a background in understanding the importance of access to clean water for public health, I believe that repairing the Water Commercial infrastructure in Community 2 should be the priority. Water is essential for sanitation, hygiene, and overall well-being, especially in a community with a population of 12000 people.

The Water Commercial infrastructure serves a larger number of people compared to the Power Residential infrastructure in Community 1. Additionally, given the recent immigrant status of some residents in Community 2, access to clean water is crucial for their settlement and integration into the community. Furthermore, a functioning Water Commercial infrastructure is es

## Run this!!

In [None]:
import pandas as pd
import ast

# Read the CSV file written by the response-generation cell
df = pd.read_csv('personas_and_responses.csv')

# Print column names (useful for checking the schema before processing)
print(df.columns)

# Function to extract community details
def extract_community_details(description):
    """Parse a community description string back into its labelled fields.

    The description is expected to look like::

        In <community>: - Working infrastructure: a, b; - Damaged
        infrastructure: c; - Population: 5000; - Social vulnerability
        score: 7/10; - Access to resources: Limited. Additional trait: X.

    Each value is taken as the text *after its own label*. Splitting on the
    first colon instead would return the label itself for the first segment
    (which starts with "In <community>:") and would leak
    ". Additional trait" into the access-to-resources value.

    Args:
        description: The description text. Non-string values (e.g. NaN for a
            missing CSV cell) yield an empty result.

    Returns:
        A dict with any of the keys ``working_infrastructure``,
        ``damaged_infrastructure``, ``population``,
        ``social_vulnerability_score``, ``access_to_resources`` and
        ``additional_trait`` that could be found. Values are stripped
        strings without the sentence-ending period.
    """
    details = {}
    if not isinstance(description, str):
        return details

    # Separate the optional trailing "Additional trait: X." sentence first so
    # it cannot be mistaken for part of the last ";"-separated field.
    body, trait_label, trait_text = description.partition('Additional trait:')

    # Checked in this order; the first label found in a segment wins.
    labelled_fields = (
        ('Working infrastructure:', 'working_infrastructure'),
        ('Damaged infrastructure:', 'damaged_infrastructure'),
        ('Population:', 'population'),
        ('Social vulnerability score:', 'social_vulnerability_score'),
        ('Access to resources:', 'access_to_resources'),
    )
    for part in body.split(';'):
        for label, key in labelled_fields:
            if label in part:
                value = part.split(label, 1)[1].strip()
                # The final field carries the sentence's closing period.
                details[key] = value.rstrip('.').strip()
                break

    if trait_label:
        details['additional_trait'] = trait_text.strip().rstrip('.').strip()
    return details

# Process the dataframe
def process_dataframe(df):
    """Renumber responses, parse community details, and reorder the columns.

    The input frame is modified in place: ``response_id`` and ``scenario_id``
    are overwritten, the parsed community detail columns are added, and list
    values in ``ordered_pair`` are converted to their string form.

    NOTE(review): the parsed detail columns (``working_infrastructure`` ...
    ``additional_trait``) and ``scenario_id`` are not part of the returned
    column selection, so they only remain on the input frame — confirm that
    is intended.

    Args:
        df: DataFrame with the columns written to
            'personas_and_responses.csv'.

    Returns:
        A DataFrame restricted to the persona, question and response columns,
        in a fixed order.
    """
    # Sequential IDs "R001", "R002", ... in row order.
    df['response_id'] = [f"R{position:03d}" for position in range(1, len(df) + 1)]

    # The scenario is constant for all rows.
    df['scenario_id'] = 'S001'

    # One dict of parsed fields per row; missing fields become ''.
    parsed = df['community_description'].apply(extract_community_details)
    detail_names = (
        'working_infrastructure',
        'damaged_infrastructure',
        'population',
        'social_vulnerability_score',
        'access_to_resources',
        'additional_trait',
    )
    for detail_name in detail_names:
        df[detail_name] = parsed.apply(lambda details, name=detail_name: details.get(name, ''))

    # Store 'ordered_pair' as the string form of a list.
    def as_text(pair):
        if isinstance(pair, list):
            return str(pair)
        return pair

    df['ordered_pair'] = df['ordered_pair'].apply(as_text)

    persona_columns = [
        'persona_id', 'community', 'age', 'occupation', 'household_composition',
        'disaster_experience', 'socioeconomic_status', 'community_description',
    ]
    response_columns = [
        'question_id', 'option1', 'option2', 'response_id', 'priority_choice', 'ordered_pair', 'reasoning',
    ]
    return df[persona_columns + response_columns]

# Process the dataframe (process_dataframe also modifies df in place)
df_processed = process_dataframe(df)

# Save the processed dataframe to a new CSV file
df_processed.to_csv('processed_data.csv', index=False)

print("Data has been processed and saved to 'processed_data.csv'")

Index(['response_id', 'scenario_id', 'persona_id', 'question_id', 'option1',
       'option2', 'priority_choice', 'reasoning', 'ordered_pair', 'community',
       'age', 'occupation', 'household_composition', 'disaster_experience',
       'socioeconomic_status', 'community_description'],
      dtype='object')
Data has been processed and saved to 'processed_data.csv'


In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

## Making data fit for model

In [None]:
import pandas as pd
import ast

# Read the CSV file written by the response-generation cell
df = pd.read_csv('personas_and_responses.csv')

# Function to extract community details
def extract_community_details(description):
    """Parse the numeric/context fields out of a community description string.

    The description is expected to contain ";"-separated segments such as
    "- Population: 5000", "- Social vulnerability score: 7/10" and
    "- Access to resources: Limited.", optionally followed by an
    " Additional trait: X." sentence.

    Each value is taken as the text *after its own label*, with the trailing
    "Additional trait" sentence and the closing period removed. Splitting on
    the first colon instead yields "Limited. Additional trait" (or
    "Limited.") for access to resources, which never matches the
    "Limited"/"Moderate"/"Good" keys used when this value is encoded later.

    Args:
        description: The description text. Non-string values (e.g. NaN for a
            missing CSV cell) yield an empty result.

    Returns:
        A dict with any of the keys ``population``,
        ``social_vulnerability_score`` and ``access_to_resources`` that could
        be found, as stripped strings.
    """
    details = {}
    if not isinstance(description, str):
        return details

    # Drop the optional trailing trait sentence so it cannot leak into the
    # last ";"-separated field.
    body = description.partition('Additional trait:')[0]

    # Checked in this order; the first label found in a segment wins.
    labelled_fields = (
        ('Population:', 'population'),
        ('Social vulnerability score:', 'social_vulnerability_score'),
        ('Access to resources:', 'access_to_resources'),
    )
    for part in body.split(';'):
        for label, key in labelled_fields:
            if label in part:
                value = part.split(label, 1)[1].strip()
                # The final field carries the sentence's closing period.
                details[key] = value.rstrip('.').strip()
                break
    return details

# Process the dataframe
def process_dataframe(df):
    """Turn raw response rows into numeric, community-focused model input.

    The input frame is modified in place (columns are added or overwritten)
    and a column-restricted view of it is returned.

    Steps:
      * parse population, social vulnerability score and access to resources
        out of ``community_description``;
      * convert population to a number and the "X/10" score to a 0-1 float,
        with unparseable or missing values becoming NaN rather than raising;
      * encode access to resources as Limited=0, Moderate=1, Good=2
        (anything else becomes NaN);
      * parse ``ordered_pair`` from its string form into a list and copy its
        two entries into ``option1`` / ``option2``.

    NOTE(review): the last step overwrites the question's original
    ``option1`` / ``option2`` with the ordered pair, whose first entry is the
    chosen option in the data produced by this notebook — so ``option1``
    always equals ``priority_choice``. Confirm this is intended.

    Args:
        df: DataFrame with at least ``community_description``,
            ``ordered_pair``, ``persona_id``, ``community``, ``question_id``
            and ``priority_choice`` columns.

    Returns:
        A DataFrame with the community-focused columns in a fixed order.
    """
    # Extract community details: one dict per row, missing fields become ''.
    community_details = df['community_description'].apply(extract_community_details)
    for column in ('population', 'social_vulnerability_score', 'access_to_resources'):
        df[column] = community_details.apply(lambda details, name=column: details.get(name, ''))

    # Convert population to numeric
    df['population'] = pd.to_numeric(df['population'], errors='coerce')

    # Convert "X/10" to X / 10. Coerce instead of calling float() directly so
    # a row without a score becomes NaN, matching how population is handled.
    score_numerators = df['social_vulnerability_score'].astype(str).str.split('/').str[0]
    df['social_vulnerability_score'] = pd.to_numeric(score_numerators, errors='coerce') / 10

    # Encode access to resources
    resource_mapping = {'Limited': 0, 'Moderate': 1, 'Good': 2}
    df['access_to_resources'] = df['access_to_resources'].map(resource_mapping)

    # Ensure 'ordered_pair' is a list; values that are already lists (frame
    # not round-tripped through CSV) are left untouched.
    df['ordered_pair'] = df['ordered_pair'].apply(
        lambda pair: ast.literal_eval(pair) if isinstance(pair, str) else pair
    )

    # Create separate columns for the two entries of the ordered pair
    df['option1'] = df['ordered_pair'].apply(lambda pair: pair[0])
    df['option2'] = df['ordered_pair'].apply(lambda pair: pair[1])

    # Reorder columns
    column_order = [
        'persona_id', 'community', 'population', 'social_vulnerability_score', 'access_to_resources',
        'question_id', 'option1', 'option2', 'priority_choice', 'ordered_pair',
    ]

    return df[column_order]

# Process the dataframe (process_dataframe also modifies df in place)
df_processed = process_dataframe(df)

# Save the processed dataframe to a new CSV file; the pairwise cells below
# read this file back.
df_processed.to_csv('community_focused_data.csv', index=False)

print("Data has been processed and saved to 'community_focused_data.csv'")

Data has been processed and saved to 'community_focused_data.csv'


In [None]:
import pandas as pd

# Read the CSV file produced by the "Making data fit for model" cell
df = pd.read_csv('community_focused_data.csv')

# Save as Excel file (same rows and columns, no index column)
df.to_excel('output_communities.xlsx', index=False)

print("Data has been converted to Excel format: output_communities.xlsx")

Data has been converted to Excel format: output_communities.xlsx


## Processed pairwise data

In [None]:
import pandas as pd
import ast

# Define column names (these match the column order written by the
# "Making data fit for model" cell)
columns = ['persona_id', 'community', 'population', 'social_vulnerability_score', 'access_to_resources',
           'question_id', 'option1', 'option2', 'priority_choice', 'ordered_pair']

# Read the CSV file, supplying the column names explicitly and skipping the
# file's own header row
df = pd.read_csv('community_focused_data.csv', names=columns, skiprows=1)

# Function to create pairwise comparisons
def create_pairwise_data(row):
    """Expand one response row into two labelled option rows.

    Args:
        row: Mapping-like row with ``persona_id``, ``community``,
            ``population``, ``social_vulnerability_score``,
            ``access_to_resources`` and ``ordered_pair`` (the string form of
            a two-element list, first element first in the ordering).

    Returns:
        A two-row DataFrame that repeats the persona/community columns and
        adds ``option`` (one entry of the ordered pair per row) and ``label``
        (1 for the first entry, 0 for the second).
    """
    ranked_options = ast.literal_eval(row['ordered_pair'])

    shared_columns = (
        'persona_id',
        'community',
        'population',
        'social_vulnerability_score',
        'access_to_resources',
    )
    # Persona/community values are identical on both output rows.
    frame_data = {column: [row[column]] * 2 for column in shared_columns}
    frame_data['option'] = ranked_options
    frame_data['label'] = [1, 0]  # 1 = first in ordered pair (chosen), 0 = not chosen
    return pd.DataFrame(frame_data)

# Create pairwise dataset: two rows (chosen / not chosen) per response
pairwise_df = pd.concat(df.apply(create_pairwise_data, axis=1).tolist(), ignore_index=True)

# Extract the community number from the option text (kept as a string)
pairwise_df['target_community'] = pairwise_df['option'].str.extract(r'Community (\d+)')

# Normalize numerical features (min-max scaling for population)
# NOTE(review): if every row has the same population, max - min is 0 and the
# scaled column becomes NaN — confirm the data always spans several communities.
pairwise_df['population'] = (pairwise_df['population'] - pairwise_df['population'].min()) / (pairwise_df['population'].max() - pairwise_df['population'].min())
pairwise_df['social_vulnerability_score'] = pd.to_numeric(pairwise_df['social_vulnerability_score'])

# Create feature vectors
# NOTE(review): feature_columns is not used anywhere in the visible cells.
feature_columns = ['population', 'social_vulnerability_score', 'access_to_resources', 'target_community']

# Save the processed dataframe
pairwise_df.to_csv('processed_pairwise_data.csv', index=False)

print("Data has been processed and saved to 'processed_pairwise_data.csv'")

Data has been processed and saved to 'processed_pairwise_data.csv'


In [None]:
import pandas as pd
import ast

# Define column names (these match the column order written by the
# "Making data fit for model" cell)
columns = ['persona_id', 'community', 'population', 'social_vulnerability_score', 'access_to_resources',
           'question_id', 'option1', 'option2', 'priority_choice', 'ordered_pair']

# Read the CSV file, supplying the column names explicitly and skipping the
# file's own header row
df = pd.read_csv('community_focused_data.csv', names=columns, skiprows=1)

# Function to create pairwise comparisons
def create_pairwise_data(row):
    """Convert one response row into a single A-versus-B comparison record.

    Args:
        row: Mapping-like row with ``persona_id``, ``community``,
            ``population``, ``social_vulnerability_score``,
            ``access_to_resources``, ``question_id``, ``priority_choice`` and
            ``ordered_pair`` (the string form of a list with at least two
            entries).

    Returns:
        A one-row DataFrame (index 0) with the persona/community columns,
        ``option_a`` / ``option_b`` taken from the ordered pair, and
        ``chosen`` set to 1 when ``priority_choice`` equals ``option_a`` and
        -1 otherwise.
    """
    ranked_options = ast.literal_eval(row['ordered_pair'])

    carried_columns = (
        'persona_id',
        'community',
        'population',
        'social_vulnerability_score',
        'access_to_resources',
        'question_id',
    )
    record = {column: row[column] for column in carried_columns}
    record['option_a'] = ranked_options[0]
    record['option_b'] = ranked_options[1]

    if row['priority_choice'] == ranked_options[0]:
        record['chosen'] = 1
    else:
        record['chosen'] = -1

    # All values are scalars, so an explicit single-row index is required.
    return pd.DataFrame(record, index=[0])

# Create pairwise dataset: one comparison row per response
pairwise_df = pd.concat(df.apply(create_pairwise_data, axis=1).tolist(), ignore_index=True)

# Extract features from options
# NOTE(review): .str.split().str[1] keeps only the second word of the option,
# e.g. "Water" from "Repair Water School in Community 1", so the facility type
# (School/Residential/...) is dropped — confirm this is intended.
pairwise_df['infrastructure_a'] = pairwise_df['option_a'].str.split().str[1]
pairwise_df['community_a'] = pairwise_df['option_a'].str.extract(r'Community (\d+)').astype(int)
pairwise_df['infrastructure_b'] = pairwise_df['option_b'].str.split().str[1]
pairwise_df['community_b'] = pairwise_df['option_b'].str.extract(r'Community (\d+)').astype(int)

# Save the processed dataframe (overwrites the file written by the previous cell)
pairwise_df.to_csv('processed_pairwise_data.csv', index=False)

print("Data has been processed and saved to 'processed_pairwise_data.csv'")

Data has been processed and saved to 'processed_pairwise_data.csv'
