In [15]:
import pandas as pd
import numpy as np
from pulp import *

### (old) the p-median model with k nearest facilities

For this first stage, only k nearest facilities will be imported in the model.    
1. After importing original data, I create the new table only containing k nearest facilities.   
2. To test the method, I use the test IOE data as I used for capacitated p-median case.   
So in this model, the capacity constraint is still live.   
3. In a broader case, it should be that, the sum of demand value of the facility can serve is no more than its capacity.

In [16]:
# import data
time = pd.read_csv('data/example_subject_student_school_journeys.csv')
time_table = (
    time.pivot_table(
        columns="school",
        fill_value=10000,
        index="student",
        sort=False,
        values="time",
    )
    .astype(int)
    .values
)
students_df = pd.read_csv('data/example_subject_students.csv')
schools_df = pd.read_csv('data/example_subject_schools.csv')

In [17]:
# find the k nearest facility from distance matrix

# Define the value of k
k = 5

# Create an empty list to hold the rows
rows = []

# Iterate over each client point and add the k nearest facility indices to the new pivot table
for client_idx in range(time_table.shape[0]):
    distances = time_table[client_idx]
    nearest_index = np.argsort(distances)[:k]
    # Append rows to the list
    for i in nearest_index:
        row = {'client_id': client_idx, 'facility_id': i, 'distance': distances[i]}
        rows.append(row)

# Create a new_distance_df dataframe by concatenating the rows
new_distance_df = pd.DataFrame(rows, columns=['client_id', 'facility_id', 'distance'])

In [18]:
new_distance_df.head(10)

Unnamed: 0,client_id,facility_id,distance
0,0,68,53
1,0,49,55
2,0,67,56
3,0,32,59
4,0,66,60
5,1,52,18
6,1,42,46
7,1,41,48
8,1,43,49
9,1,50,55


In [19]:
# transform the new df to the pivot table
new_time_table = (
    new_distance_df.pivot_table(
        columns="facility_id",
        fill_value=10000,
        index="client_id",
        sort=False,
        values="distance",
    )
    .astype(int)
    .values
)

In [20]:
new_time_table[:4]

array([[10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000,
        10000, 10000, 10000, 10000, 10000,    59, 10000, 10000, 10000,
        10000, 10000, 10000, 10000, 10000,    55, 10000, 10000, 10000,
        10000,    60,    56,    53, 10000],
       [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000,
        10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000,
        10000, 10000,    48,    46,    49, 10000,    55,    18, 10000,
        10000, 10000, 10000, 10000, 10000],
       [10000, 10000,     8, 10000, 10000, 10000,    15,    15,    13,
            8, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000,
        10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000,
        10000, 10000, 10000, 10000, 10000],
       [  130, 10000, 10000, 10000, 10000,   147, 10000, 10000, 10000,
        10000, 10000, 10000, 10000,   153, 10000, 10000,   152,   151,
        10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000,
        10000, 1

In [21]:
# create capacity df
new_school_index =  new_distance_df['facility_id'].sort_values().unique()
new_school_df = pd.DataFrame({'facility_id': new_school_index})

new_school_df = new_school_df.merge(schools_df, left_on='facility_id',right_on='Unnamed: 0', how='left')
new_school_df.head(5)

Unnamed: 0.1,facility_id,Unnamed: 0,SE2 PP: Code,SE2 PP: PC,PL: Subject,Count,priority
0,1,1,IOE00044,NW11RX,Mathematics,1,3.0
1,2,2,IOE00045,NW23RT,Mathematics,1,3.0
2,5,5,IOE00128,E28LS,Mathematics,1,2.0
3,8,8,IOE00172,SW64UN,Mathematics,1,3.0
4,12,12,IOE00348,SE264RD,Mathematics,1,2.0


In [22]:
# set the parameter
school_indices = range(len(new_time_table[0]))
student_indices = range(len(new_time_table))

In [23]:
# p-median model considering k nearest facility

from pulp import *
import gurobipy

prob = pulp.LpProblem("k-nearest", LpMinimize)

# create decision variable: whether student i is assigned to school j
decision = LpVariable.dicts("x", ((i, j) for i in student_indices for j in school_indices), 0, 1, LpBinary)

# set the objective function to minimize the total distance travelled
objective = pulp.lpSum(
    pulp.lpSum(decision[i,j] * new_time_table[i,j] for j in school_indices) 
    for i in student_indices)
prob += objective

# add all the constraints

# 1. Each client is assigned to a facility
for i in student_indices:
    prob +=  pulp.lpSum(decision[i,j] for j in school_indices) == 1

# 2. Demand value the facility can serve is no more than its capacity.
for j in school_indices:
    prob +=  pulp.lpSum(decision[i,j] for i in student_indices) <= new_school_df['Count'][j]
    
# solve the problem
prob.solve(pulp.PULP_CBC_CMD(msg=False))

1

In [24]:
for i in student_indices:
    for j in school_indices:
        if decision[i,j].value() == 1:
            print(i,j)

0 30
1 25
2 9
3 0
4 3
5 29
6 23
7 17
8 27
9 31


### (old) second stage: find the infeasible K and implement the extra facility

In [25]:
# define the function to get the new distance matrix only with k nearest facilities
def get_k_facilities(k, distance_array):

    # Create an empty list to hold the rows
    rows = []

    # Iterate over each client point and add the k nearest facility indices to the new pivot table
    for client_idx in range(distance_array.shape[0]):
        distances = distance_array[client_idx]
        nearest_index = np.argsort(distances)[:k]
        # Append rows to the list
        for i in nearest_index:
            row = {'client_id': client_idx, 'facility_id': i, 'distance': distances[i]}
            rows.append(row)

    # Create a new_distance_df dataframe by concatenating the rows
    new_distance_df = pd.DataFrame(rows, columns=['client_id', 'facility_id', 'distance'])

    # create new distance matrix
    new_distance_array = (
    new_distance_df.pivot_table(
        columns="facility_id",
        fill_value=10000,
        index="client_id",
        sort=False,
        values="distance",
    )
    .astype(int)
    .values
    )

    return new_distance_df, new_distance_array

In [26]:
# define the function to get the capacity df
def get_capacity(new_distance_df,schools_df):

    new_school_index = new_distance_df['facility_id'].sort_values().unique()
    new_school_df = pd.DataFrame({'facility_id': new_school_index})

    new_school_df = new_school_df.merge(schools_df, left_on='facility_id',right_on='Unnamed: 0', how='left')
    return new_school_df

#### (old) try k = 2

In [27]:
# try k = 2
distance_2_df, distance_2_arr = get_k_facilities(2, time_table)
school_2_df = get_capacity(distance_2_df, schools_df)

In [28]:
# p-median model

prob_2 = pulp.LpProblem("k-nearest-2", LpMinimize)

# set the parameter
school_2_indices = range(len(distance_2_arr[0]))

# create decision variable: whether student i is assigned to school j
decision_2 = LpVariable.dicts("x", ((i, j) for i in student_indices for j in school_2_indices), 0, 1, LpBinary)

# set the objective function to minimize the total distance travelled
objective_2 = pulp.lpSum(
    pulp.lpSum(decision_2[i,j] * distance_2_arr[i,j] for j in school_2_indices) 
    for i in student_indices)
prob_2 += objective_2

# add all the constraints
# 1. Each client is assigned to a facility
for i in student_indices:
    prob_2 +=  pulp.lpSum(decision_2[i,j] for j in school_2_indices) == 1

# 2. Demand value the facility can serve is no more than its capacity.
for j in school_2_indices:
    prob_2 +=  pulp.lpSum(decision_2[i,j] for i in student_indices) <= school_2_df['Count'][j]
    
# solve the problem
prob_2.solve(pulp.PULP_CBC_CMD(msg=False))

1

In [29]:
distance_2_df

Unnamed: 0,client_id,facility_id,distance
0,0,68,53
1,0,49,55
2,1,52,18
3,1,42,46
4,2,5,8
5,2,19,8
6,3,1,130
7,3,13,147
8,4,21,83
9,4,8,83


In [30]:
for i in student_indices:
    for j in school_2_indices:
        if decision_2[i,j].value() == 1:
            print(i,j)

0 16
1 13
2 2
3 0
4 6
5 15
6 11
7 7
8 14
9 17


So, k = 2 has the optimal solution. k = 1 should have the infeasible result.

#### (old) try k = 1

In [31]:
distance_1_df, distance_1_arr = get_k_facilities(1, time_table)
school_1_df = get_capacity(distance_1_df, schools_df)

# p-median
prob_1 = pulp.LpProblem("k-nearest-1", LpMinimize)
school_1_indices = range(len(distance_1_arr[0]))

decision_1 = LpVariable.dicts("x", ((i, j) for i in student_indices for j in school_1_indices), 0, 1, LpBinary)

objective_1 = pulp.lpSum(
    pulp.lpSum(decision_1[i,j] * distance_1_arr[i,j] for j in school_1_indices) 
    for i in student_indices)
prob_1 += objective_1

# 1. Each client is assigned to a facility
for i in student_indices:
    prob_1 +=  pulp.lpSum(decision_1[i,j] for j in school_1_indices) == 1

# 2. Demand value the facility can serve is no more than its capacity.
for j in school_1_indices:
    prob_1 +=  pulp.lpSum(decision_1[i,j] for i in student_indices) <= school_1_df['Count'][j]
    
prob_1.solve(pulp.PULP_CBC_CMD(msg=False))

1

In [32]:
for i in student_indices:
    for j in school_1_indices:
        if decision_1[i,j].value() == 1:
            print(i,j)

0 7
1 5
2 1
3 0
4 2
5 6
6 4
7 3
8 2
9 8


In [33]:
school_1_index =  distance_1_df['facility_id'].sort_values().unique()
school_1_index

array([ 1,  5, 21, 36, 49, 52, 67, 68, 70])

In [34]:
school_1_index[2]

21

In [35]:
distance_1_df

Unnamed: 0,client_id,facility_id,distance
0,0,68,53
1,1,52,18
2,2,5,8
3,3,1,130
4,4,21,83
5,5,67,39
6,6,49,78
7,7,36,102
8,8,52,79
9,9,70,96


In [36]:
distance_1_arr

array([[10000, 10000, 10000, 10000, 10000, 10000, 10000,    53, 10000],
       [10000, 10000, 10000, 10000, 10000,    18, 10000, 10000, 10000],
       [10000,     8, 10000, 10000, 10000, 10000, 10000, 10000, 10000],
       [  130, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000],
       [10000, 10000,    83, 10000, 10000, 10000, 10000, 10000, 10000],
       [10000, 10000, 10000, 10000, 10000, 10000,    39, 10000, 10000],
       [10000, 10000, 10000, 10000,    78, 10000, 10000, 10000, 10000],
       [10000, 10000, 10000,   102, 10000, 10000, 10000, 10000, 10000],
       [10000, 10000, 10000, 10000, 10000,    79, 10000, 10000, 10000],
       [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000,    96]])

In [37]:
print(round(prob_1.objective.value(), 3))

10607.0


The problem is that it still takes one 10000 as the result, which is not allowed.   

Here my solution is to add one constraint, to prevent the use of 10000 in the solution.

In [38]:
# p-median with constraint to prevent the use of 10000
prob_1_new = pulp.LpProblem("k-nearest-1-new-constraint", LpMinimize)

decision_1_prevent = LpVariable.dicts("x", ((i, j) for i in student_indices for j in school_1_indices), 0, 1, LpBinary)

objective_1_prevent = pulp.lpSum(
    pulp.lpSum(decision_1_prevent[i,j] * distance_1_arr[i,j] for j in school_1_indices) 
    for i in student_indices)
prob_1_new += objective_1_prevent

# 1. Each client is assigned to a facility
for i in student_indices:
    prob_1_new +=  pulp.lpSum(decision_1_prevent[i,j] for j in school_1_indices) == 1

# 2. Demand value the facility can serve is no more than its capacity.
for j in school_1_indices:
    prob_1_new +=  pulp.lpSum(decision_1_prevent[i,j] for i in student_indices) <= school_1_df['Count'][j]

# 3. To prevent the use of 10000
for i in student_indices:
    for j in school_1_indices:
        if distance_1_arr[i,j] == 10000:
            prob_1_new += decision_1_prevent[i,j] == 0

prob_1_new.solve(pulp.PULP_CBC_CMD(msg=False))

-1

#### (old) add decision variable for placeholder/extra facility 

Since k = 1 is infeasible, we use this case to implement the extra facility.

In [39]:
# get the new distance matrix with k nearest facilities, and the rest facilities


In [40]:
k_facility_number = len(distance_1_df['facility_id'].unique())
print(k_facility_number)

9


In [41]:
distance_1_df['facility_id']

0    68
1    52
2     5
3     1
4    21
5    67
6    49
7    36
8    52
9    70
Name: facility_id, dtype: int64

In [42]:
# Get the complement of facility_id in distance_1_df
extra_facility_ids = [i for i in range(len(time_table[0])) if i not in distance_1_df['facility_id'].values]
extra_facility_ids = np.array(extra_facility_ids)
extra_facility_ids

array([ 0,  2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37,
       38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 53, 54, 55, 56,
       57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 69, 71, 72, 73])

In [43]:
len(extra_facility_ids)

65

In [44]:
len(time_table[0])

74

In [45]:
extra_distance_matrix = time_table[:, extra_facility_ids]
extra_distance_matrix[0]

array([ 83,  78,  70, 117,  84,  86,  78,  75,  83, 107, 122,  97,  95,
       112,  88,  91,  86,  88,  88,  75,  71, 109, 125, 115,  79,  93,
        97, 136, 133,  59,  77,  63, 138, 124, 113, 110, 114, 100, 100,
        78,  92,  86,  84,  89,  82,  65, 119, 116, 107, 116, 117, 111,
        98, 100,  99, 109, 117, 124, 116, 181,  60,  95,  84,  89, 115])

In [46]:
# get the capacity of extra facilities
school_extra_df = pd.DataFrame({'facility_id': extra_facility_ids})
school_extra_df = school_extra_df.merge(schools_df, left_on='facility_id',right_on='Unnamed: 0', how='left')

In [47]:
# create the new distance matrix of k nearest facilities
# in the previous one, the non-k-nearest facilities have 10,000 as the distance
# now we need to use the real distance
facility_ids_array = distance_1_df['facility_id'].sort_values().values
k_distance_matrix = time_table[:, facility_ids_array]
k_distance_matrix

array([[ 64,  85,  97, 110,  55,  90,  90,  56,  53,  79],
       [ 76,  80,  96, 100,  61,  18,  18, 119, 106,  60],
       [ 19,   8,  51,  68, 110, 129, 129, 163, 118,  36],
       [130, 160, 169, 151, 184, 208, 208, 201, 208, 161],
       [110, 129,  83,  93, 143, 156, 156, 153, 163, 107],
       [ 52,  70,  93,  84,  73,  86,  86,  39,  59,  61],
       [101, 114, 128, 131,  78,  97,  97, 127, 120,  90],
       [125, 131, 119, 102, 150, 173, 173, 187, 183, 128],
       [113, 120, 133, 144, 108,  79,  79, 157, 180,  93],
       [118, 130, 146, 147, 133, 112, 112, 205, 182,  96]])

In [48]:
facility_ids_array

array([ 1,  5, 21, 36, 49, 52, 52, 67, 68, 70])

In [49]:
distance_1_arr

array([[10000, 10000, 10000, 10000, 10000, 10000, 10000,    53, 10000],
       [10000, 10000, 10000, 10000, 10000,    18, 10000, 10000, 10000],
       [10000,     8, 10000, 10000, 10000, 10000, 10000, 10000, 10000],
       [  130, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000],
       [10000, 10000,    83, 10000, 10000, 10000, 10000, 10000, 10000],
       [10000, 10000, 10000, 10000, 10000, 10000,    39, 10000, 10000],
       [10000, 10000, 10000, 10000,    78, 10000, 10000, 10000, 10000],
       [10000, 10000, 10000,   102, 10000, 10000, 10000, 10000, 10000],
       [10000, 10000, 10000, 10000, 10000,    79, 10000, 10000, 10000],
       [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000,    96]])

In [50]:
# p-median with all the facilities
prob_all = pulp.LpProblem("k-nearest-all-facilities", LpMinimize)

# set parameter
school_extra_indices = range(len(extra_distance_matrix[0]))
decision_extra = LpVariable.dicts("x_extra", ((i, j) for i in student_indices for j in school_extra_indices), 0, 1, LpBinary)
decision_k_in_all = LpVariable.dicts("x_k", ((i, j) for i in student_indices for j in school_1_indices), 0, 1, LpBinary)

# set objective
objective_all = (
    pulp.lpSum(
        pulp.lpSum(decision_k_in_all[i, j] * k_distance_matrix[i, j] for j in school_1_indices)
        for i in student_indices
    ) +
    pulp.lpSum(
        pulp.lpSum(decision_extra[i, j] * extra_distance_matrix[i, j] for j in school_extra_indices)
        for i in student_indices
    )
)
prob_all += objective_all

# 1. Each client is assigned to a facility
for i in student_indices:
    prob_all += (
        pulp.lpSum(decision_k_in_all[i, j] for j in school_1_indices)
        + pulp.lpSum(decision_extra[i, j] for j in school_extra_indices)
        == 1
    )

# 2. Demand value the facility can serve is no more than its capacity.
for j in school_1_indices:
    prob_all +=  pulp.lpSum(decision_k_in_all[i,j] for i in student_indices) <= school_1_df['Count'][j]
for j in school_extra_indices:
    prob_all +=  pulp.lpSum(decision_extra[i,j] for i in student_indices) <= school_extra_df['Count'][j]

#prob_all.solve(pulp.PULP_CBC_CMD(msg=False))
prob_all.solve(GLPK(msg=False))


1

In [51]:
prob_all.status

1

status = 1: The problem was solved to optimality, and an optimal solution was found.   
status = 0: The problem is feasible, but the solver was not able to prove that the solution is optimal.   
status = -1: The problem is infeasible; no feasible solution exists.   
status = -2: The problem is unbounded; the objective function can be improved infinitely.   
status = -3: The solver encountered an error or was unable to solve the problem.    

In [52]:
for (i, j) in decision_extra.keys():
    value = decision_extra[(i, j)].varValue
    if value == 1:
        print("decision_extra[{}, {}] = {}".format(i, j, value))

    # Access the optimal values of decision_k_in_all
for (i, j) in decision_k_in_all.keys():
    value = decision_k_in_all[(i, j)].varValue
    if value == 1:
        print("decision_k_in_all[{}, {}] = {}".format(i, j, value))

decision_extra[2, 17] = 1
decision_extra[4, 6] = 1
decision_extra[9, 37] = 1
decision_k_in_all[0, 8] = 1
decision_k_in_all[1, 6] = 1
decision_k_in_all[3, 0] = 1
decision_k_in_all[5, 7] = 1
decision_k_in_all[6, 4] = 1
decision_k_in_all[7, 3] = 1
decision_k_in_all[8, 5] = 1


Another condition is when do extra facility is selected, but the k-nearest facility of other client is selected.   

For example, when k = 2, client `a` has facility `1` and `2` as the nearest facility, client `b` has facility `3` and `4`.   

It's possible that in the optimal solution client `b` is assigned to facility `1`, which is not its nearest facility.

### (please check this part) p-median with k facilities 

In [76]:
# to get the nearest facility of client, k = 1
from scipy.spatial import KDTree
k_pair = []
for i in range(len(time_table)):
    tree = KDTree(time_table[i].reshape(-1, 1))
    distance, indices = tree.query(0, k=1)
    k_pair.append([i, indices])

In [77]:
k_pair

[[0, 68],
 [1, 52],
 [2, 19],
 [3, 1],
 [4, 8],
 [5, 67],
 [6, 49],
 [7, 36],
 [8, 52],
 [9, 70]]

In [82]:
time_table[0]

array([ 83,  64,  78,  70, 117,  85,  84,  86,  78,  75,  83, 107, 122,
        97,  95, 112,  88,  91,  86,  88,  88,  97,  75,  71, 109, 125,
       115,  79,  93,  97, 136, 133,  59,  77,  63, 138, 110, 124, 113,
       110, 114, 100, 100,  78,  92,  86,  84,  89,  82,  55,  65, 119,
        90, 116, 107, 116, 117, 111,  98, 100,  99, 109, 117, 124, 116,
       181,  60,  56,  53,  95,  79,  84,  89, 115])

In [80]:
# p-median with k facilities
problem_k = pulp.LpProblem("new-k-nearest-facilities", LpMinimize)

decision_of_k = LpVariable.dicts("x_of_k", ((i, j) for i, j in k_pair), 0, 1, LpBinary)

objective_of_k = pulp.lpSum(
    pulp.lpSum(decision_of_k.get((i,j), 0) * time_table[i,j] for j in [row[1] for row in k_pair]) 
    for i in student_indices)
problem_k += objective_of_k

# 1. Each client is assigned to a facility
for i in student_indices:
    problem_k +=  pulp.lpSum(decision_of_k.get((i,j), 0) for j in [row[1] for row in k_pair]) == 1

# 2. Demand value the facility can serve is no more than its capacity.
for j in [row[1] for row in k_pair]:
    problem_k +=  pulp.lpSum(decision_of_k.get((i,j), 0) for i in student_indices) <= schools_df['Count'][j]
    
problem_k.solve(pulp.PULP_CBC_CMD(msg=False))

-1

In [81]:
decision_of_k

{(0, 68): x_of_k_(0,_68),
 (1, 52): x_of_k_(1,_52),
 (2, 19): x_of_k_(2,_19),
 (3, 1): x_of_k_(3,_1),
 (4, 8): x_of_k_(4,_8),
 (5, 67): x_of_k_(5,_67),
 (6, 49): x_of_k_(6,_49),
 (7, 36): x_of_k_(7,_36),
 (8, 52): x_of_k_(8,_52),
 (9, 70): x_of_k_(9,_70)}