Note: this notebook has been modified specifically for use on the oxford data set!

In [1]:
import csv
import pandas as pd
import numpy as np
import random
import csv

## 1. Preprocessing
### Create data

In [4]:
k_sev=1376
k_non_sev=4324
sev_p=[0.884,0.711,0.603,0.442,0.376,0.357,0.26,0.26,0.161,0.113,0.078,
       0.059,0.057,0.028]
non_sev_p=[0.814,0.657,0.442,0.057,0.28,0.128,0.131,0.109,0.121,0.135,0.097,
           0.057,0.058,0.051]
sev_n=[round(k_sev*i) for i in sev_p]
non_sev_n=[round(k_non_sev*i) for i in non_sev_p]
naive_r = [(sev_n[i]+non_sev_n[i])/5700 for i in range(len(non_sev_p))]
print(naive_r)
data = [sev_n, non_sev_n]
symptoms = ["Fever", "Cough", "Fatigue", "Dyspnea", "Sputum", "shortness", "Myalgia", "Chill", "Dizziness", "Headache", "sore", "Nausea", "Diarhea", "Congestion"]
covid_df = pd.DataFrame(data, columns = symptoms)

print(covid_df)

[0.8308771929824561, 0.67, 0.4808771929824561, 0.14982456140350878, 0.3031578947368421, 0.1831578947368421, 0.16210526315789472, 0.14543859649122806, 0.1307017543859649, 0.12964912280701754, 0.09228070175438596, 0.057368421052631575, 0.05771929824561404, 0.0456140350877193]
   Fever  Cough  Fatigue  Dyspnea  Sputum  shortness  Myalgia  Chill  \
0   1216    978      830      608     517        491      358    358   
1   3520   2841     1911      246    1211        553      566    471   

   Dizziness  Headache  sore  Nausea  Diarhea  Congestion  
0        222       155   107      81       78          39  
1        523       584   419     246      251         221  


Assume severe patients have a rating of around 2 

In [5]:
# multiply all values by scalar, with sd, around normal dist
def norm(center, sd):
    rating = np.abs(np.random.normal(center, sd)) 
    if rating > 10: rating = 10
    return rating

In [6]:
big_sev_list = []
big_non_sev_list = []

for symptom in symptoms:
    ones = [norm(8, 2) for i in range(covid_df[symptom][0])]
    zeroes = [0 for i in range(k_sev - covid_df[symptom][0])]
    new_col = ones + zeroes
    random.shuffle(new_col)
    big_sev_list.append(new_col)
    
for symptom in symptoms:
    ones = [norm(3,1) for i in range(covid_df[symptom][1])]
    zeroes = [0 for i in range(k_non_sev - covid_df[symptom][1])]
    new_col = ones + zeroes
    random.shuffle(new_col)
    big_non_sev_list.append(new_col)
    
big_sev_array = np.array(big_sev_list)
big_non_sev_array = np.array(big_non_sev_list)

big_sev_array = np.transpose(big_sev_array)
big_non_sev_array = np.transpose(big_non_sev_array)

print(big_sev_array.shape)
print(big_non_sev_array.shape)
big_data = np.concatenate((big_sev_array, big_non_sev_array), axis=0)
print(big_data.shape)
covid_df = pd.DataFrame(big_data, columns = symptoms)
print(covid_df.head)

#save final version

(1376, 14)
(4324, 14)
(5700, 14)
<bound method NDFrame.head of          Fever      Cough   Fatigue   Dyspnea    Sputum  shortness   Myalgia  \
0     7.669867  10.000000  0.000000  5.325725  0.000000   8.707343  0.000000   
1     6.233120   7.627197  0.000000  0.000000  0.000000   0.000000  0.000000   
2     7.415830   8.961084  6.514873  0.000000  0.000000   0.000000  2.724147   
3     8.527744   0.000000  5.720490  6.041349  0.000000   0.000000  4.646532   
4     7.028445   0.000000  0.000000  4.889083  9.483943   0.000000  5.718689   
...        ...        ...       ...       ...       ...        ...       ...   
5695  2.332027   2.839166  0.000000  0.000000  2.533438   0.000000  0.000000   
5696  4.250155   1.370701  0.000000  0.000000  0.000000   0.000000  0.000000   
5697  2.288778   3.342014  3.443112  0.000000  0.000000   0.000000  0.000000   
5698  3.387541   2.691077  0.000000  0.000000  0.000000   0.000000  0.000000   
5699  3.412796   0.000000  2.265005  4.014707  4.460321  

In [7]:
covid_df.to_csv('data/generated-data.csv')

## 2. Graph Building
14 nodes

In [8]:
edges = []
for i in range(len(symptoms)):
        for j in range(i +1, len(symptoms)):
            edges.append((i,j))
triangles = []
for i in range(len(symptoms)):
        for j in range(i +1, len(symptoms)):
            for h in range(j + 1, len(symptoms)):
                triangles.append((i,j, h))
                
adj_matrix = np.zeros((len(symptoms), len(symptoms)))
curl = np.zeros((len(edges), len(triangles)))
neg_divergence = np.zeros((len(edges), len(symptoms)))
f = np.zeros((len(edges)))
W = np.zeros((len(edges), len(edges)))

In [9]:
# f, w
for index, row in covid_df.iterrows():
    for i, edge in enumerate(edges):
        W[i, i] += 1
        f[i] += (row[symptoms[edge[1]]] - row[symptoms[edge[0]]])
for i in range(len(edges)):
    f[i] = f[i]*1/W[i, i]

In [10]:
print(f)

print(W)

[-7.03135259e-01 -1.38927418e+00 -2.56656686e+00 -2.18691875e+00
 -2.56487733e+00 -2.74347801e+00 -2.78319800e+00 -2.96542716e+00
 -3.00407802e+00 -3.17319180e+00 -3.29994596e+00 -3.28901735e+00
 -3.36565468e+00 -6.86138924e-01 -1.86343160e+00 -1.48378349e+00
 -1.86174207e+00 -2.04034275e+00 -2.08006274e+00 -2.26229190e+00
 -2.30094276e+00 -2.47005654e+00 -2.59681070e+00 -2.58588209e+00
 -2.66251942e+00 -1.17729268e+00 -7.97644563e-01 -1.17560315e+00
 -1.35420383e+00 -1.39392382e+00 -1.57615297e+00 -1.61480384e+00
 -1.78391761e+00 -1.91067178e+00 -1.89974317e+00 -1.97638049e+00
  3.79648112e-01  1.68952884e-03 -1.76911152e-01 -2.16631143e-01
 -3.98860298e-01 -4.37511161e-01 -6.06624937e-01 -7.33379104e-01
 -7.22450495e-01 -7.99087819e-01 -3.77958584e-01 -5.56559265e-01
 -5.96279256e-01 -7.78508410e-01 -8.17159273e-01 -9.86273049e-01
 -1.11302722e+00 -1.10209861e+00 -1.17873593e+00 -1.78600681e-01
 -2.18320672e-01 -4.00549827e-01 -4.39200690e-01 -6.08314466e-01
 -7.35068633e-01 -7.24140

In [11]:
# adj_matrix
for index, row in covid_df.iterrows():
    for i in range(len(symptoms)):
        for j in range(i +1, len(symptoms)):
            adj_matrix[i, j] += (row[symptoms[i]] - row[symptoms[j]])/len(covid_df) #divide by #ppl with both symptoms
print(adj_matrix)

[[ 0.00000000e+00  7.03135259e-01  1.38927418e+00  2.56656686e+00
   2.18691875e+00  2.56487733e+00  2.74347801e+00  2.78319800e+00
   2.96542716e+00  3.00407802e+00  3.17319180e+00  3.29994596e+00
   3.28901735e+00  3.36565468e+00]
 [ 0.00000000e+00  0.00000000e+00  6.86138924e-01  1.86343160e+00
   1.48378349e+00  1.86174207e+00  2.04034275e+00  2.08006274e+00
   2.26229190e+00  2.30094276e+00  2.47005654e+00  2.59681070e+00
   2.58588209e+00  2.66251942e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  1.17729268e+00
   7.97644563e-01  1.17560315e+00  1.35420383e+00  1.39392382e+00
   1.57615297e+00  1.61480384e+00  1.78391761e+00  1.91067178e+00
   1.89974317e+00  1.97638049e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -3.79648112e-01 -1.68952884e-03  1.76911152e-01  2.16631143e-01
   3.98860298e-01  4.37511161e-01  6.06624937e-01  7.33379104e-01
   7.22450495e-01  7.99087819e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  

In [12]:
# neg_divergence
for i in range(len(edges)):
    for j in range(len(symptoms)):
        if edges[i][0] == j:
            neg_divergence[i,j] = -1
        elif edges[i][1] == j:
            neg_divergence[i,j] = 1
print(neg_divergence)

[[-1.  1.  0. ...  0.  0.  0.]
 [-1.  0.  1. ...  0.  0.  0.]
 [-1.  0.  0. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ... -1.  1.  0.]
 [ 0.  0.  0. ... -1.  0.  1.]
 [ 0.  0.  0. ...  0. -1.  1.]]


In [13]:
# curl
for j, tri in enumerate(triangles):
    for i, edge in enumerate(edges):
        if edge[0] in tri and edge[1] in tri:
            first_edge = tri.index(edge[0])
            if (first_edge + 1) % 3 == tri.index(edge[1]):
                curl[i, j] = 1
            else:
                curl[i,j] = -1
print(curl)

[[ 1.  1.  1. ...  0.  0.  0.]
 [-1.  0.  0. ...  0.  0.  0.]
 [ 0. -1.  0. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  0.  0.  1.]
 [ 0.  0.  0. ...  1.  0. -1.]
 [ 0.  0.  0. ...  0.  1.  1.]]


### Solving for r

In [14]:
right_side = np.matmul(np.transpose(neg_divergence), np.matmul(W, f))
left_side = np.matmul(np.matmul(np.transpose(neg_divergence), W), neg_divergence)
r = np.matmul(np.linalg.pinv(left_side), right_side)
print(r)

[ 2.43105453  1.72791927  1.04178034 -0.13551233  0.24413578 -0.1338228
 -0.31242349 -0.35214348 -0.53437263 -0.57302349 -0.74213727 -0.86889144
 -0.85796283 -0.93460015]


In [15]:
rank_df = pd.DataFrame({
    'symptoms': symptoms,
    'r': r
})
print(rank_df)

rank_df = rank_df.sort_values(by =['r'],  ascending = False)
rank_df = rank_df.reset_index(drop = True)
print(rank_df)

rank_df.to_csv('data/hodge_ranking.csv')


sum = 0
for i in range(len(edges)):
    to_add = W[i, i]*(f[i] + (r[edges[i][0]] - r[edges[i][1]]))**2
    sum += to_add
    print("error at ", i,": ", to_add)
print( sum)

      symptoms         r
0        Fever  2.431055
1        Cough  1.727919
2      Fatigue  1.041780
3      Dyspnea -0.135512
4       Sputum  0.244136
5    shortness -0.133823
6      Myalgia -0.312423
7        Chill -0.352143
8    Dizziness -0.534373
9     Headache -0.573023
10        sore -0.742137
11      Nausea -0.868891
12     Diarhea -0.857963
13  Congestion -0.934600
      symptoms         r
0        Fever  2.431055
1        Cough  1.727919
2      Fatigue  1.041780
3       Sputum  0.244136
4    shortness -0.133823
5      Dyspnea -0.135512
6      Myalgia -0.312423
7        Chill -0.352143
8    Dizziness -0.534373
9     Headache -0.573023
10        sore -0.742137
11     Diarhea -0.857963
12      Nausea -0.868891
13  Congestion -0.934600
error at  0 :  1.8997742749985017e-25
error at  1 :  2.203288508282286e-25
error at  2 :  3.6421707994054115e-25
error at  3 :  1.1241267899399418e-25
error at  4 :  2.529285277364869e-25
error at  5 :  2.203288508282286e-25
error at  6 :  3.64217079

In [17]:

sum = 0
for i in range(len(edges)):
    to_add = W[i, i]*(f[i] + (naive_r[edges[i][0]] - naive_r[edges[i][1]]))**2
    sum += to_add
    print("error at ", i,": ", to_add)
print( sum)

error at  0 :  1676.0497195923822
error at  1 :  6156.517715126091
error at  2 :  20264.434234606495
error at  3 :  15691.774007315475
error at  4 :  20950.32103439639
error at  5 :  24535.11033709575
error at  6 :  25083.388785495277
error at  7 :  29248.78246824502
error at  8 :  30227.7719680684
error at  9 :  33785.349486748244
error at  10 :  36382.4438192666
error at  11 :  36078.428272977486
error at  12 :  37952.996248635434
error at  13 :  1408.0426149041036
error at  14 :  10284.721548142496
error at  15 :  7111.08088624594
error at  16 :  10774.994514063006
error at  17 :  13385.862430409627
error at  18 :  13791.631174570393
error at  19 :  16921.630603296984
error at  20 :  17668.197538227607
error at  21 :  20411.359268374425
error at  22 :  22440.710756227327
error at  23 :  22202.084017331552
error at  24 :  23677.73144932817
error at  25 :  4081.89661139196
error at  26 :  2190.5518057784943
error at  27 :  4392.876295926791
error at  28 :  6111.079528407219
error at  

### Solving for c

In [None]:
right_side = np.matmul(np.transpose(curl), np.matmul(W, f))
left_side = np.matmul(np.matmul(np.transpose(curl), W), curl)
c = np.matmul(np.linalg.inv(left_side), right_side)
print(c)