In [1]:
import sys
# Add the parent directory to the module search path (at index 1) —
# presumably where util, optimal_sizes, membership_matrix and test live; confirm.
sys.path.insert(1, '../') 

In [2]:
! pip install mip



In [3]:
import os

# Output directory for the JSON results and the figures saved further down.
# makedirs(..., exist_ok=True) is idempotent, so re-running the cell is safe and
# there is no gap between an existence check and the creation. It also raises
# early if './results' exists but is a regular file rather than a directory.
os.makedirs('./results', exist_ok=True)

In [4]:
from util import simulate_x
import numpy as np
import matplotlib.pyplot as plt
from optimal_sizes import optimal_column_weight, entropy
from membership_matrix import generate_doubly_regular_col
from test import test_M
import json 

In [5]:
# Number of trials; passed to both simulate_x and test_M in the sweep below.
num_trials = 1000

# Column Weights Analysis
In this notebook, we will investigate the effect of column weights on the performance of pooling matrices. 

In [6]:
# Dimensions of the pooling matrices: they are generated below with shape (T, n).
T, n = 48, 384

# Error rates forwarded to test_M and optimal_column_weight
# (presumably false-negative / false-positive rates — confirm in those modules).
fnr = fpr = 0

# Largest column weight included in the sweep (weights 1..max_column_weight).
max_column_weight = 10

# Values of k to evaluate; each run hands k/n to simulate_x and test_M.
ks = list(range(1, 10))

In [None]:
# Sweep every (k, column weight) combination and record the mean accuracy.
# `results` maps k -> list of mean accuracies, ordered like `column_weights`.
column_weights = list(range(1, max_column_weight + 1))
results = {}

for k in ks:
    # simulate data for this k (return value is not used here)
    simulate_x(n, k/n, num_trials)

    # one mean-accuracy entry per candidate column weight
    average_accuracy = []
    for column_weight in column_weights:
        print("Starting column weight = %s ..." % column_weight)
        M = generate_doubly_regular_col((T, n), column_weight)
        # NOTE(review): test_M is given (fpr, fnr) while optimal_column_weight in the
        # plotting cell is given (fnr, fpr) — harmless while both are 0, but confirm
        # the argument order against both signatures.
        info = test_M(M, k/n, n, fpr, fnr, num_trials=num_trials)
        average_accuracy.append(np.average(info["accuracy"]))

    results[k] = average_accuracy

# save results (JSON content; the integer keys of `results` are written as strings)
results_path = "./results/column-weights-n%s-T%s.txt" % (n, T)
with open(results_path, 'w') as outfile:
    json.dump(results, outfile)

On average, 1.04 positives in each trail.
Starting column weight = 1 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 2 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 3 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 4 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 5 ...
Finished trial 100
Finished trial 200
F

Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 8 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 9 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 10 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
On average, 4.91 positives in each trail.
Starting column weight = 1 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished tri

Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 4 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 5 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 6 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 800
Finished trial 900
Finished trial 1000
Starting column weight = 7 ...
Finished trial 100
Finished trial 200
Finished trial 300
Finished trial 400
Finished trial 500
Finished trial 600
Finished trial 700
Finished trial 80

In [None]:
# Plot mean accuracy against column weight: one curve per k, with the empirical
# optimum (circle) and the theoretical optimum (cross) marked on each curve.
fig, ax = plt.subplots()
for k in ks:
    average_accuracy = results[k]

    # Column weight with the highest measured accuracy (first one on ties).
    empirical_index = int(np.argmax(average_accuracy))
    empirical_m = column_weights[empirical_index]

    # Column weight suggested by optimal_column_weight, clamped on BOTH sides to
    # the swept range so it can always be located on the measured curve
    # (clamping only the upper end let a too-small value raise ValueError below).
    theoretical_m = optimal_column_weight(k/n, fnr, fpr, T, n)
    theoretical_m = min(max(theoretical_m, column_weights[0]), column_weights[-1])
    theoretical_index = column_weights.index(theoretical_m)

    print("theoretical optimal column weight for k = %s is %s." % (k, theoretical_m))
    print("empirical optimal column weight for k = %s is %s." % (k, empirical_m))
    print("=" * 20)

    ax.plot(column_weights, average_accuracy, label="k=%s" % k)

    # Label the markers once only, so the legend explains them without
    # repeating the same entry for every k.
    first_curve = (k == ks[0])
    ax.scatter(empirical_m, average_accuracy[empirical_index],
               marker='o', color='black',
               label="empirical optimum" if first_curve else None)
    ax.scatter(theoretical_m, average_accuracy[theoretical_index],
               marker='x', color='black',
               label="theoretical optimum" if first_curve else None)

ax.legend()
ax.set_xlabel("Column Weights")
ax.set_ylabel("Average Accuracy")
ax.set_title("Effect of Column Weights")

fig.savefig("./results/column-weights-n%s-T%s.png" % (n, T))
plt.show()

In [None]:
# Accuracy as a function of the entropy value computed for each column weight,
# one curve per k, plus the correlation between the two.
fig, ax = plt.subplots()
for k in ks:
    average_accuracy = results[k]

    # Row weight implied by each swept column weight, then its entropy at rate k/n.
    row_weights = [m * n / T for m in column_weights]
    entropies = [entropy(w, k/n) for w in row_weights]

    # Order the (entropy, accuracy) pairs by entropy so the line is drawn left to right.
    by_entropy = sorted(zip(entropies, average_accuracy))
    x, y = zip(*by_entropy)

    ax.scatter(x, y)
    ax.plot(x, y, label="k=%s" % k)

    correlation = np.corrcoef(x, y)[0, 1]
    print("Correlation for k=%s: %.2f" % (k, correlation))

ax.legend()
ax.set_xlabel("Entropy of Each Pool")
ax.set_ylabel("Average Accuracy")
ax.set_title("Effect of Entropy")

fig.savefig("./results/entropy-n%s-T%s.png" % (n, T))
plt.show()