In [None]:
import pandas as pd
import pulp
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# =============================================================================
# STEP 0: LOAD AND PREPROCESS DATA
# =============================================================================

# Input CSV locations (edit these to point at your own files).
judges_file = 'judges_with_expertise.csv'
posters_file = 'abstracts_8.csv'

# Read both tables into DataFrames.
judges_df = pd.read_csv(judges_file)
posters_df = pd.read_csv(posters_file)

# ----- Preprocess Judges Data -----
# Coerce the name parts and the judge identifier to text and trim surrounding
# whitespace, one column at a time.
judges_df[['Judge FirstName', 'Judge LastName', 'Judge']] = (
    judges_df[['Judge FirstName', 'Judge LastName', 'Judge']]
    .apply(lambda column: column.astype(str).str.strip())
)
# "First Last" form of each judge's name; compared against advisor names later
# when forbidding judges from scoring their own students.
judges_df['JudgeFullName'] = judges_df['Judge FirstName'].str.cat(
    judges_df['Judge LastName'], sep=" "
)
# Missing expertise becomes an empty string so the column is text throughout.
judges_df['Research_Expertise'] = (
    judges_df['Research_Expertise']
    .fillna("")
    .astype(str)
    .str.strip()
)

# Map the "Hour available" into a list of available time slots.
def map_hour_available(val):
    """Translate one raw "Hour available" cell into a list of time slots.

    Args:
        val: The raw cell value. May be a string ('both', '1', '2'), an int,
            or a float such as 1.0 (pandas reads a numeric column that
            contains blanks as float).

    Returns:
        [1] or [2] when the value names a single hour, otherwise [1, 2]
        ('both', missing, or anything unrecognised). A new list is returned
        on every call.
    """
    text = str(val).strip().lower()
    if text == 'both':
        return [1, 2]
    # Compare numerically so that '1', '1.0' and 1.0 are all treated as hour 1;
    # an exact string comparison would send '1.0' to the "both" default.
    try:
        hour = float(text)
    except ValueError:
        return [1, 2]  # Default if unexpected
    if hour == 1:
        return [1]
    if hour == 2:
        return [2]
    return [1, 2]  # Default if unexpected (includes NaN)
# One list of permitted time slots per judge, derived element by element.
judges_df['AvailableHours'] = judges_df['Hour available'].map(map_hour_available)

# ----- Preprocess Posters Data -----
# Abstract text: missing values become "", everything is text, edges trimmed.
posters_df['Abstract'] = (
    posters_df['Abstract'].fillna("").astype(str).str.strip()
)
# Advisor name parts get the same treatment so the concatenation below never
# sees a missing value.
posters_df['Advisor FirstName'] = (
    posters_df['Advisor FirstName'].fillna("").astype(str).str.strip()
)
posters_df['Advisor LastName'] = (
    posters_df['Advisor LastName'].fillna("").astype(str).str.strip()
)
posters_df['AdvisorFullName'] = posters_df['Advisor FirstName'].str.cat(
    posters_df['Advisor LastName'], sep=" "
)

# Poster time slot from the poster number's parity: odd → 1, even → 2.
posters_df['TimeSlot'] = (posters_df['Poster #'] % 2 == 1).map({True: 1, False: 2})

# ----- Map Program to Department using a Manual Keyword Dictionary -----
# Department code -> lowercase keywords searched for (as substrings) in the
# poster's program text. Dictionary order is the tie-break order.
department_keywords = {
    "MAE": ["mechanical", "aerospace", "manufacturing", "robotics", "automation"],
    "BMCE": ["biomedical", "medicine", "health", "biomechanical", "bioengineering", "chemical"],
    "CEE": ["civil", "environmental", "structural", "geotechnical", "infrastructure"],
    "EECS": ["electrical", "computer", "electronics", "computing", "information", "cyber"]
}

def map_program_to_department(program_str):
    """Map a free-text program name to a department code.

    Args:
        program_str: Program description; any value is accepted and converted
            with ``str()`` before matching (so a missing value becomes 'nan').

    Returns:
        The department code of the first keyword (in ``department_keywords``
        order) found in the lower-cased text, ignoring keywords that are
        merely a fragment of a longer keyword that also matched; "Unknown"
        when no keyword is found.
    """
    text = str(program_str).lower()
    # Every keyword present in the text, in dictionary order.
    hits = [
        (dept, kw)
        for dept, keywords in department_keywords.items()
        for kw in keywords
        if kw in text
    ]
    for dept, kw in hits:
        # "mechanical" is a substring of "biomechanical"; without this check
        # the MAE keyword would always win and the BMCE keyword could never
        # be selected. The longest hit is never shadowed, so a department is
        # always returned when `hits` is non-empty.
        shadowed = any(kw != other and kw in other for _, other in hits)
        if not shadowed:
            return dept
    return "Unknown"

# Derive each poster's department code from its free-text program name.
posters_df['MappedDepartment'] = posters_df['Program'].map(map_program_to_department)

# Plain Python lists of identifiers, in DataFrame row order.
poster_numbers = posters_df['Poster #'].to_list()
judge_ids = judges_df['Judge'].to_list()



In [None]:

# =============================================================================
# STEP 1: Compute Compatibility Using AllenAI SPECTER
# =============================================================================
# Use SPECTER—a SciBERT-based model fine-tuned for scientific documents.
# (Make sure you have installed the sentence-transformers package.)
# Load the "allenai-specter" checkpoint through sentence-transformers.
model = SentenceTransformer("allenai-specter")

# Get the texts for embedding (assume these columns have been preprocessed in Step 0)
# For judges, we use their Research_Expertise text.
# NOTE: Step 0 replaces missing values with "", so a judge or poster with no
# text is still embedded (as the empty string) rather than skipped.
judge_texts = judges_df['Research_Expertise'].tolist()
# For posters, we use the Abstract.
poster_texts = posters_df['Abstract'].tolist()

# Compute embeddings for judges and posters (as NumPy arrays)
# Row order follows judges_df / posters_df row order because the text lists
# above are taken straight from those columns.
judge_embeddings = model.encode(judge_texts, convert_to_tensor=False)
poster_embeddings = model.encode(poster_texts, convert_to_tensor=False)

# Compute pairwise cosine similarity between each judge's expertise and each poster's abstract.
# The result is indexed [judge_row, poster_row]; later code reads it positionally.
semantic_sim_matrix = cosine_similarity(judge_embeddings, poster_embeddings)
print("Semantic Similarity Matrix Shape:", semantic_sim_matrix.shape)

# Define weights for combining department matching and semantic similarity.
# dept_weight scales the binary department match (0 or 1);
# sim_weight scales the cosine similarity taken from semantic_sim_matrix.
dept_weight = 0.5
sim_weight = 0.5

# Normalised (stripped, upper-cased) department labels, one per DataFrame row.
# They are built once, in row order, so index i / j below refers to the same
# row as judge_ids[i] / poster_numbers[j] and as semantic_sim_matrix[i, j].
# A missing Department becomes "" instead of raising on `.strip()`.
judge_depts = (
    judges_df['Department'].fillna("").astype(str).str.strip().str.upper().tolist()
)
poster_depts = (
    posters_df['MappedDepartment'].fillna("").astype(str).str.strip().str.upper().tolist()
)

# Build a compatibility dictionary keyed by (judge, poster).
# Requires posters_df['MappedDepartment'] (created in Step 0) and a
# 'Department' column in judges_df.
compatibility = {}
for i, judge in enumerate(judge_ids):
    dept_judge = judge_depts[i]
    for j, poster in enumerate(poster_numbers):
        # Binary department match: 1 if equal, 0 otherwise. A blank judge
        # department never counts as a match.
        dept_match = 1 if dept_judge and dept_judge == poster_depts[j] else 0
        # Cast to a built-in float so the objective coefficients handed to
        # PuLP are plain Python numbers rather than NumPy scalars.
        sim_score = float(semantic_sim_matrix[i, j])
        # Weighted sum of the two signals.
        compatibility[(judge, poster)] = dept_weight * dept_match + sim_weight * sim_score

# =============================================================================
# STEP 2: Build the PuLP Optimization Model
# =============================================================================

# The assignment is posed as a maximisation problem.
prob = pulp.LpProblem("Poster_Judge_Assignment", pulp.LpMaximize)

# One binary variable per (judge, poster) pair; a value of 1 means that judge
# is assigned to that poster.
assign_vars = pulp.LpVariable.dicts(
    "assign",
    [
        (judge_id, poster_no)
        for judge_id in judge_ids
        for poster_no in poster_numbers
    ],
    cat=pulp.LpBinary,
)

# Constraint 1: every poster receives exactly 2 judges.
for p in poster_numbers:
    judges_on_poster = pulp.lpSum(assign_vars[(j, p)] for j in judge_ids)
    prob += judges_on_poster == 2, f"Poster_{p}_assignment"

# Constraint 2: no judge is given more than 6 posters.
for j in judge_ids:
    posters_for_judge = pulp.lpSum(assign_vars[(j, p)] for p in poster_numbers)
    prob += posters_for_judge <= 6, f"Judge_{j}_limit"

# Constraint 3: Time Slot Availability.
# Any (judge, poster) pair whose poster time slot is not among the judge's
# AvailableHours is forced to 0.
for j, available_hours in zip(judges_df['Judge'], judges_df['AvailableHours']):
    for p, poster_time in zip(posters_df['Poster #'], posters_df['TimeSlot']):
        if poster_time in available_hours:
            continue
        prob += assign_vars[(j, p)] == 0, f"TimeSlot_j{j}_p{p}"

# Constraint 4: Advisor Conflict.
# A judge must not be assigned to a poster whose advisor is that same person.
# Names are compared case-insensitively with all whitespace runs collapsed, so
# "Jane  Doe" and "jane doe" are the same person.
# Poster advisor names are normalised once, outside the judge loop.
advisor_names = [
    " ".join(str(name).lower().split()) for name in posters_df['AdvisorFullName']
]
for j, judge_name in zip(judges_df['Judge'], judges_df['JudgeFullName']):
    judge_full = " ".join(str(judge_name).lower().split())
    if not judge_full:
        # A blank judge name would otherwise "match" every poster whose
        # advisor is blank and wrongly forbid those assignments.
        continue
    for p, advisor_full in zip(posters_df['Poster #'], advisor_names):
        if judge_full == advisor_full:
            prob += assign_vars[(j, p)] == 0, f"AdvisorConflict_j{j}_p{p}"

# Objective: maximise the summed compatibility of all chosen (judge, poster) pairs.
prob += pulp.lpSum(
    compatibility[pair] * assign_vars[pair]
    for pair in ((j, p) for j in judge_ids for p in poster_numbers)
), "Total_Compatibility"

# =============================================================================
# STEP 3: Solve the Optimization Model
# =============================================================================

# Use the CBC solver. Preference order:
#   1. the explicitly configured binary (cbc_path),
#   2. a `cbc` executable found by PuLP's default lookup,
#   3. the CBC binary bundled with PuLP.
# This keeps the notebook runnable on machines without the Homebrew install.
cbc_path = '/opt/homebrew/bin/cbc'  # Adjust this path as needed.
solver = None
for candidate in (pulp.COIN_CMD(path=cbc_path), pulp.COIN_CMD(), pulp.PULP_CBC_CMD()):
    if candidate.available():
        solver = candidate
        break
if solver is None:
    raise RuntimeError(
        f"No usable CBC solver found (tried '{cbc_path}', 'cbc' on PATH, and PuLP's bundled CBC)."
    )

result_status = prob.solve(solver)
print("Solver Status:", pulp.LpStatus[prob.status])

# Stop here unless an optimal assignment was found; the export step would
# otherwise write files built from missing or meaningless variable values.
if prob.status != pulp.LpStatusOptimal:
    raise RuntimeError(
        f"Assignment model was not solved to optimality (status: {pulp.LpStatus[prob.status]})."
    )

# =============================================================================
# STEP 4: Generate and Save Output Files
# =============================================================================

# (A) Extended Poster File: Add columns 'judge-1' and 'judge-2'.
# Works on a copy so posters_df itself is left unchanged.
poster_assignments = posters_df.copy()
poster_assignments['judge-1'] = None
poster_assignments['judge-2'] = None

for idx, row in poster_assignments.iterrows():
    p = row['Poster #']
    # Solvers report binaries as floats (e.g. 0.9999999), so test against 0.5
    # rather than equality with 1. A variable with no value (None) counts as
    # not assigned.
    assigned_judges = [
        j for j in judge_ids if (pulp.value(assign_vars[(j, p)]) or 0) > 0.5
    ]
    if len(assigned_judges) >= 1:
        poster_assignments.at[idx, 'judge-1'] = assigned_judges[0]
    if len(assigned_judges) >= 2:
        poster_assignments.at[idx, 'judge-2'] = assigned_judges[1]

poster_assignments.to_csv("posters_with_judges_specter.csv", index=False)
print("Extended Poster File saved as 'posters_with_judges_specter.csv'.")

# (B) Extended Judge File: List up to 6 poster assignments for each judge.
# Works on a copy so judges_df itself is left unchanged.
judge_assignments = judges_df.copy()
for i in range(1, 7):
    judge_assignments[f'poster-{i}'] = None

for idx, row in judges_df.iterrows():
    j = row['Judge']
    # Solvers report binaries as floats (e.g. 0.9999999), so test against 0.5
    # rather than equality with 1. A variable with no value (None) counts as
    # not assigned.
    assigned_posters = [
        p for p in poster_numbers if (pulp.value(assign_vars[(j, p)]) or 0) > 0.5
    ]
    # Fill poster-1, poster-2, ... in poster_numbers order.
    for k, p in enumerate(assigned_posters):
        judge_assignments.at[idx, f'poster-{k+1}'] = p

judge_assignments.to_csv("judges_with_posters_specter.csv", index=False)
# The message names the file that was actually written.
print("Extended Judge File saved as 'judges_with_posters_specter.csv'.")

# (C) Assignment Matrix File: Rows = posters; Columns = judges; cell = 1 if assigned, else 0.
assignment_matrix = pd.DataFrame(0, index=poster_numbers, columns=judge_ids)
for p in poster_numbers:
    for j in judge_ids:
        # Round before converting: int() alone truncates a solver value such
        # as 0.9999999 to 0. A variable with no value (None) is written as 0.
        assignment_matrix.at[p, j] = int(round(pulp.value(assign_vars[(j, p)]) or 0))
assignment_matrix.to_csv("assignment_matrix.csv")
print("Assignment Matrix File saved as 'assignment_matrix.csv'.")
