In [50]:
import pandas as pd
import ast
import numpy as np
import pulp
import itertools
import re

In [3]:
sets = pd.read_csv("sets.csv")
students = pd.read_csv("students.csv")

In [5]:
def create_projects_list(sets):
    rows = []
    for i, row in sets.iterrows():
        for i in range(row.available):
            rows.append(
                {
                    "username": row.username,
                    "keywords": row.keywords,
                    "types": row.type,
                    "prerequisite": row.prerequisite
                }
            )
    return pd.DataFrame(rows)

In [6]:
projects = create_projects_list(sets)

In [60]:
def student_project_scores(label, projects, students):
    s_list = []

    for i, s in students.iterrows():
        p_list = []

        list1 = getattr(s, label).split(",")
        
        for j, p in projects.iterrows():
            
            list2 = getattr(p, label).split(",")

            score = score_preference(list1, list2)

            p_list.append(score)

        s_list.append(p_list)

    return np.array(s_list)

def score_preference(pref, lst):
    score = 0
    pref_reverse = pref[::-1]

    for i, item in enumerate(pref_reverse):
        if item in lst:
            score += i + 1

    return score 

def normalize_2d_array(arr):
    min_val = np.min(arr)
    max_val = np.max(arr)
    normalzed_array = (arr - min_val) / (max_val - min_val)
    return normalzed_array

def rank_student_projects(combined_scores):
    student_project = []
    for item in combined_scores:
        student_project.append(item.argsort()[::-1].argsort()+1)
    return student_project

def allocate(A):
    A = np.matrix(A)
    n_students, n_projects = A.shape
    prob = pulp.LpProblem("Project Allocation", pulp.LpMinimize)
    x = pulp.LpVariable.dicts("x", itertools.product(range(n_students), range(n_projects)), cat=pulp.LpBinary)

    objective_function = 0
    for student in range(n_students):

        for project in range(n_projects):

            if A[(student, project)] > 0:
                objective_function += x[(student, project)] * A[(student, project)]

    prob += objective_function

    # 1 project per student
    for student in range(n_students):
        prob += sum(x[(student, project)] for project in range(n_projects)) == 1

    # Each project is assigned to at most 1 student
    for project in range(n_projects):
        prob += sum(x[(student, project)] for student in range(n_students)) <= 1

    prob.solve()

    return prob

def get_allocation_coords(prob):

    allocation = []

    for v in prob.variables():
        if v.varValue == 1:
            stringy_coords = v.name.split(',')
            allocation.append([int(re.findall(r"[\d]+|\d+", x)[0]) for x in stringy_coords])


    return(allocation)

def prerequisite_filter(scores, students, projects):
    new_scores = []

    for i, s in students.iterrows():

        try:
            student_modules = s.prerequisites.split(",")
        except:
            student_modules = []

        new_scores_row = []

        for j, p in projects.iterrows():

            prerequisite = p.prerequisite

            if isinstance(prerequisite, str):
                print(f"Prerequisite is a string: {prerequisite}")
                if any(prerequisite in word for word in student_modules):
                    print(f"{prerequisite} in {student_modules}")
                    new_scores_row.append(scores[i][j])
                else:
                    print(f"{prerequisite} not in {student_modules}")
                    new_scores_row.append(0)
            else:
                print("Prerequisite not a string")
                new_scores_row.append(scores[i][j])

        new_scores.append(new_scores_row)

    return np.array(new_scores)
        

In [52]:
keyword_scores = student_project_scores("keywords", projects, students)

In [9]:
type_scores = student_project_scores("types", projects, students)

In [56]:
scores = normalize_2d_array(keyword_scores + type_scores)

In [59]:
scores = prerequisite_filter(scores, students, projects)

AttributeError: 'Series' object has no attribute 'prerequisites'

In [54]:
student_project = rank_student_projects(scores)

In [20]:
import numpy as np
from scipy.optimize import linprog

def allocate(A):
    A = np.array(A)  # Convert to a NumPy array if it isn't already
    n_students, n_projects = A.shape

    # Create the objective function coefficients
    c = -A.flatten()  # Use negative because linprog does minimization

    # Create the equality and inequality constraints
    A_eq = []
    b_eq = []

    # Constraint: Each student is assigned to exactly one project
    for student in range(n_students):
        row = np.zeros(n_students * n_projects)
        for project in range(n_projects):
            row[student * n_projects + project] = 1
        A_eq.append(row)
        b_eq.append(1)
    
    A_eq = np.array(A_eq)
    b_eq = np.array(b_eq)

    # No need for inequality constraints for project assignments
    # because we will use bounds to ensure 0 or 1 for each variable

    # Bounds for the variables: 0 <= x <= 1 (binary constraints)
    bounds = [(0, 1) for _ in range(n_students * n_projects)]

    # Solve the linear programming problem
    result = linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs')

    # Reshape the result back into the matrix form
    allocation = np.round(result.x).reshape((n_students, n_projects))

    return allocation


In [21]:
results = allocate(student_project)

In [26]:
results[100]

array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [114]:
series = []
score = []

for [student, project] in allocation:
    
    s = students.iloc[student]
    p = projects.iloc[project]

    # rename s columns with _student suffix
    s = s.add_suffix("_student")

    # rename p columns with _project suffix
    p = p.add_suffix("_project")

    # combine p and s into one series
    series.append(pd.concat([p, s]))

    score.append(student_project[student][project])

df = pd.DataFrame(series)
#df[score] = score

In [116]:
df["score"] = score

In [117]:
df.head()

Unnamed: 0,username_project,keywords_project,types_project,prerequisite_project,student_id_student,programme_student,types_student,keywords_student,prerequisites_student,score
0,manolis,Molecular cell biology,Data analysis,,123456789,Biochemistry,"Data analysis,GIS analysis,Grant proposal,Imag...","Molecular cell biology,Multi-omics,One health,...","LIFE203,LIFE206,LIFE210,LIFE213,LIFE216",6
1,tmehta12,"Bioinformatics,Multi-omics,Development,Evoluti...",Bioinformatics,,201155246,Biological Sciences,"Bioinformatics,Health Research,Systematic Revi...","Systems pharmacology and information science,B...","LIFE201,LIFE202,LIFE208,LIFE210,LIFE213,LIFE22...",1
2,acsharp,"Anatomy,Biomechanics,Musculoskeletal biology,M...",Computational project,LIFE220,201409237,Biological Sciences,"Grant proposal,Meta-analysis,Health Research,I...","Musculoskeletal biology,Musculoskeletal ageing...","LIFE202,LIFE204,LIFE206,LIFE207,LIFE221,LIFE22...",1
3,njeffery,"Anatomy,Biomechanics,Neurobiology and Neuropat...",Data analysis,LIFE218,201590227,Anatomy and Human Biology,"Health Research,Grant proposal,Laboratory,Fiel...","Neurobiology and Neuropathology,Biomechanics,M...","LIFE204,LIFE205,LIFE218,LIFE219,LIFE220,LIFE22...",3
4,eayates,"Gastrointestinal infections,Drug safety,Host-m...",Laboratory,,201590418,Biological and Medical Sciences,"Laboratory,Data analysis,Health Research,Syste...","Drug safety,Clinical drug development,Gastroin...","LIFE202,LIFE204,LIFE206,LIFE207,LIFE221,LIFE22...",1


In [118]:
df.to_excel("allocation.xlsx", index=False)

In [83]:
student_project[1][337]

1

In [122]:
import numpy as np
from scipy.optimize import linprog

In [132]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def minimize_priority(student_preferences):
    n_students = len(student_preferences)
    max_project_index = max(max(prefs) for prefs in student_preferences) + 1
    
    # Create a cost matrix with a large enough size
    cost_matrix = np.full((n_students, max_project_index), np.inf)
    
    for student in range(n_students):
        for rank, project in enumerate(student_preferences[student]):
            cost_matrix[student, project] = rank
    
    # Solve the assignment problem
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    
    # Create the assignment result
    assignment = [-1] * n_students
    for student, project in zip(row_ind, col_ind):
        if project < max_project_index:
            assignment[student] = project
    
    return assignment

allocation = minimize_priority(student_project)
print(allocation)

[211, 13, 218, 271, 170, 262, 100, 234, 362, 366, 89, 390, 296, 213, 173, 365, 49, 72, 333, 121, 159, 178, 348, 288, 139, 53, 230, 48, 172, 336, 37, 197, 102, 245, 60, 326, 148, 309, 428, 78, 239, 314, 58, 7, 199, 280, 219, 378, 17, 334, 308, 190, 181, 342, 335, 61, 35, 373, 3, 141, 240, 75, 235, 154, 220, 297, 214, 81, 107, 174, 318, 14, 354, 87, 152, 281, 120, 180, 338, 224, 176, 76, 155, 103, 376, 304, 9, 222, 193, 69, 80, 16, 275, 215, 311, 344, 319, 202, 253, 268, 287, 321, 339, 278, 150, 196, 223, 331, 51, 194, 171, 56, 10, 71, 192, 322, 79, 295, 303, 343, 111, 353, 158, 146, 20, 329, 43, 64, 177, 2, 221, 208, 145, 250, 147, 290, 337, 358, 205, 351, 68, 8, 371, 203, 27, 320, 243, 323, 34, 267, 349, 19, 375, 18, 63, 153, 66, 52, 99, 299, 289, 29, 285, 143, 161, 328, 298, 191, 357, 330, 247, 332, 274, 183, 254, 346, 315, 149, 266, 25, 361, 244, 395, 300, 352, 33, 24, 168, 324, 382, 84, 325, 169, 225, 291, 302, 62, 340, 258, 212, 307, 246, 30, 95, 305, 355, 313, 67, 367, 185, 59, 18

In [133]:
students.iloc[0]

student_id                                               123456789
programme                                             Biochemistry
types            Data analysis,GIS analysis,Grant proposal,Imag...
keywords         Molecular cell biology,Multi-omics,One health,...
prerequisites              LIFE203,LIFE206,LIFE210,LIFE213,LIFE216
Name: 0, dtype: object

In [134]:
projects.iloc[211]

username                                 kellyro
keywords        Evolution and adaptation,Ecology
types                                 Laboratory
prerequisite                             LIFE212
Name: 211, dtype: object

In [135]:
len(allocation)

335

In [42]:
import scipy
print(scipy.__version__)

1.11.4


In [46]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def create_problem_and_solve(A):
    A = np.array(A)  # Ensure A is a NumPy array
    # Use the Hungarian algorithm (Kuhn-Munkres algorithm) to find the optimal assignment
    row_ind, col_ind = linear_sum_assignment(A)
    
    # Create a matrix to represent the assignment
    allocation = np.zeros_like(A)
    allocation[row_ind, col_ind] = 1

    result = []

    for i, row in enumerate(allocation):
        # find the index of the 1 in the row
        idx = np.where(row == 1)[0][0]
        result.append([i, idx])

    return result

result = create_problem_and_solve(student_project)
print(result)

[[0, 374], [1, 337], [2, 233], [3, 194], [4, 73], [5, 252], [6, 4], [7, 297], [8, 191], [9, 390], [10, 409], [11, 307], [12, 41], [13, 188], [14, 399], [15, 29], [16, 232], [17, 308], [18, 408], [19, 299], [20, 44], [21, 121], [22, 220], [23, 238], [24, 172], [25, 240], [26, 282], [27, 101], [28, 203], [29, 30], [30, 198], [31, 309], [32, 248], [33, 199], [34, 281], [35, 195], [36, 93], [37, 124], [38, 368], [39, 50], [40, 353], [41, 346], [42, 415], [43, 133], [44, 24], [45, 403], [46, 289], [47, 18], [48, 280], [49, 286], [50, 347], [51, 300], [52, 94], [53, 367], [54, 407], [55, 206], [56, 249], [57, 138], [58, 272], [59, 58], [60, 366], [61, 391], [62, 157], [63, 214], [64, 303], [65, 55], [66, 143], [67, 112], [68, 357], [69, 139], [70, 318], [71, 231], [72, 342], [73, 323], [74, 372], [75, 236], [76, 292], [77, 137], [78, 217], [79, 13], [80, 316], [81, 247], [82, 242], [83, 201], [84, 171], [85, 28], [86, 2], [87, 360], [88, 120], [89, 301], [90, 284], [91, 47], [92, 196], [93, 

In [47]:
students.iloc[9]

student_id                                               201393620
programme                                   Bioveterinary Sciences
types            Laboratory,Field,Systematic Review,Image analy...
keywords         Ruminant health and welfare,Parasitology and v...
prerequisites      LIFE212,LIFE216,LIFE223,LIFE230,LIFE239,LIFE240
Name: 9, dtype: object

In [49]:
projects.iloc[390]

username                                               venera
keywords        Parasitology and vector borne disease,Ecology
types                                              Laboratory
prerequisite                                              NaN
Name: 390, dtype: object

In [32]:
def find_project_index(result):
    # return index of list where 1 is
    return [i for i, x in enumerate(result) if x == 1][0]

In [37]:
find_project_index(result[100])

241

In [40]:
projects.iloc[241]

username                                                 njeffery
keywords        Anatomy,Biomechanics,Neurobiology and Neuropat...
types                                               Data analysis
prerequisite                                              LIFE218
Name: 241, dtype: object

In [39]:
students.iloc[100]

student_id                                               201590227
programme                                Anatomy and Human Biology
types            Health Research,Grant proposal,Laboratory,Fiel...
keywords         Neurobiology and Neuropathology,Biomechanics,M...
prerequisites    LIFE204,LIFE205,LIFE218,LIFE219,LIFE220,LIFE22...
Name: 100, dtype: object