## Imports

In [1]:
import dgl
import numpy as np
import pandas as pd
import networkx as nx
import random
import itertools

from helper_functions import build_map_1

Using backend: pytorch


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GraphConv

## Creating the graph:

In [3]:
# CONSTANTS

n_states = 34

In [4]:
G = dgl.DGLGraph()
G.add_nodes(n_states)
u, v = build_map_1()
G = dgl.graph((u, v))



In [5]:
print(G.number_of_nodes())
print(G.number_of_edges())
print(G)

34
128
Graph(num_nodes=34, num_edges=128,
      ndata_schemes={}
      edata_schemes={})


In [6]:
nx_G = G.to_networkx().to_undirected()
pos = nx.kamada_kawai_layout(nx_G)

nx.draw(nx_G, pos, with_labels=True, node_color=[[.9, .3, .3]])

In [7]:
import json
cases = json.load(open("./data/alldata.json", "r"))

In [8]:
states_ = open('./data/state_code_mapping.txt', 'r').read()

In [9]:
name_map_dict = dict([[s.rsplit(" ", 1)[0], s.rsplit(" ", 1)[1]] for s in states_.split('\n')])

## Loading and pre-processing the data:

In [10]:
cases.keys()

dict_keys(['AN', 'AP', 'AR', 'AS', 'BR', 'CH', 'CT', 'DL', 'DN', 'GA', 'GJ', 'HP', 'HR', 'JH', 'JK', 'KA', 'KL', 'LA', 'LD', 'MH', 'ML', 'MN', 'MP', 'MZ', 'NL', 'OR', 'PB', 'PY', 'RJ', 'SK', 'TG', 'TN', 'TR', 'TT', 'UN', 'UP', 'UT', 'WB'])

In [11]:
# manually found this online since data wasn't present in the file
cases['LA']['dates']['2020-06-12']['total']['confirmed'] = 239

In [12]:
with open('./data/states_list.txt', 'r') as file:
    states = file.readlines()

for ind in range(len(states) - 1):
    states[ind] = states[ind][:-1]

In [13]:
df = pd.read_csv("./data/state_data.csv")
df.head(5)

Unnamed: 0,Rank,State or union territory,Population,National Share (%),Decadal growth(2001–2012),Rural population,Percent rural,Urban population,Percent urban,Area[16],Density[a],Sex ratio,Foreign Visits,Health Index
0,1,Uttar Pradesh,199812341,16.51,20.2%,155317278,77.73,44495063,22.27,"240,928 km2 (93,023 sq mi)",828.0,912,4745181,29.16
1,2,Maharashtra,112374333,9.28,20.0%,61556074,54.78,50818259,45.22,"307,713 km2 (118,809 sq mi)",365.0,929,5528704,64.53
2,3,Bihar,104099452,8.6,25.4%,92341436,88.71,11758016,11.29,"94,163 km2 (36,357 sq mi)",1102.0,918,1093141,32.42
3,4,West Bengal,91276115,7.54,13.8%,62183113,68.13,29093002,31.87,"88,752 km2 (34,267 sq mi)",1029.0,953,1656145,58.25
4,5,Madhya Pradesh,72626809,6.0,16.3%,52557404,72.37,20069405,27.63,"308,245 km2 (119,014 sq mi)",236.0,931,327958,38.69


In [14]:
actual_data = pd.read_csv("./data/actual_covid_data_17april.csv")
actual_data = actual_data.drop([0, 37]).reset_index(drop=True)
actual_data = actual_data.drop([32, 35]).reset_index(drop=True)
actual_data.iloc[5, 0] = 'NCT of Delhi'

In [15]:
# replacing with data right before the 1st wave (12th June 2020)

case_data = actual_data[['State', 'Confirmed']]
for state in case_data['State']:
    code = name_map_dict[state]
    case_data[case_data['State'] == state]['Confirmed'] = cases[code]['dates']['2020-06-12']['total']['confirmed']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
case_data['label'] = 0

label_1 = case_data['Confirmed'].quantile(q=1/3)
label_2 = case_data['Confirmed'].quantile(q=2/3)

case_data.iloc[case_data.index[case_data['Confirmed'] >= label_1], 2] = 1
case_data.iloc[case_data.index[case_data['Confirmed'] >= label_2], 2] = 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [17]:
y_true = []
for state in states:
    y_true.append(case_data[case_data['State'] == state]['label'].values[0])
y_true = np.array(y_true)

In [18]:
train_ind = [32, 27, 16, 3, 15, 20]
train_labels = [0, 0, 1, 1, 2, 2]

test_ind = []
for i in range(n_states):
    if i not in train_ind:
        test_ind.append(i)

## Encoding the features:

In [19]:
#embed = nn.Embedding(n_states, n_states)  

#x = torch.arange(0, n_states)
#G.ndata['feat'] = torch.nn.functional.one_hot(x).double()

#for i in range(n_states):
#    G.ndata['feat'][i] = (G.ndata['feat'][i] * df[df['State or union territory'] == states[i]]['National Share (%)'].values[0])
#    print(G.ndata['feat'][i])
# [21, 12, 5, 10, 27, 32]  [2, 1, 1, 2, 0, 0]

In [404]:
for i in train_ind:
    print(y_true[i], states[i], df[df['State or union territory'] == states[i]]['Foreign Visits'].values[0])

1 Haryana 48046
0 Dadra and Nagar Haveli and Daman and Diu 7369
2 Kerala 1189771
1 Telangana 323326
2 Chhattisgarh 6817
0 Sikkim 133388


In [377]:
# original
train_ind = [32, 27, 16, 3, 15, 20]
train_labels = [0, 0, 1, 1, 2, 2]

train_ind = [18, 12, 5, 10, 30, 32]  
train_labels = [2, 1, 1, 2, 0, 0]

#train_ind = random.sample(range(34), k=6)
#train_labels = [y_true[i] for i in train_ind]

#print(train_ind, train_labels)

#train_ind = random.sample([0, 1, 2, 3, 4, 5, 32, 33, 7], k=1) + random.sample([6, 9, 14, 15, 10], k=1) + random.sample([17, 18, 19, 20, 21, 22, 31], k=2) + random.sample([8, 11, 12, 13, 16], k=1) + random.sample([23, 24, 25, 26, 27, 28, 29, 30], k=1) 
#train_labels = [y_true[i] for i in train_ind]

# .82
train_ind = [5, 14, 21, 19, 11, 23] 
train_labels = [1, 0, 2, 1, 2, 0]

#train_ind = [30, 10, 20, 12, 32, 14] 
#train_labels = [y_true[i] for i in train_ind]

print(train_ind, train_labels)
test_ind = []
for i in range(n_states):
    if i not in train_ind:
        test_ind.append(i)

n_feats = 2
feats = torch.zeros(n_states, n_feats)
for i in range(n_states):
    
    density = df[df['State or union territory'] == states[i]]['Density[a]'].values[0]
    percent_urban = df[df['State or union territory'] == states[i]]['Percent urban'].values[0]
    health_index = df[df['State or union territory'] == states[i]]['Health Index'].values[0]
    foreign_visits = df[df['State or union territory'] == states[i]]['Foreign Visits'].values[0]
    percent_rural = df[df['State or union territory'] == states[i]]['Percent rural'].values[0]
    share = df[df['State or union territory'] == states[i]]['National Share (%)'].values[0]
    
    # adding the features we want: 
    feats[i] = (torch.Tensor([foreign_visits, density]))
    
feats = (feats - feats[train_ind, :].mean(axis=0)) / feats[train_ind, :].std(axis=0)

[5, 14, 21, 19, 11, 23] [1, 0, 2, 1, 2, 0]


In [44]:
acc_dict = {
    "svc": [],
    "knn": [],
    "rf" : [],
    "gcn": []
}

In [410]:
our_data = feats[train_ind, :]

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=32)
clf.fit(our_data, train_labels)
acc = (clf.predict(feats)[test_ind] == y_true[test_ind]).mean()
print(acc)
#acc_dict['rf'].append(acc)

clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(our_data, train_labels)
acc = (clf.predict(feats)[test_ind] == y_true[test_ind]).mean()
print(acc)
#acc_dict['knn'].append(acc)     

clf = SVC()
clf.fit(our_data, train_labels)
acc = (clf.predict(feats)[test_ind] == y_true[test_ind]).mean()
print(acc)
#acc_dict['svc'].append(acc)

0.6428571428571429
0.35714285714285715
0.5714285714285714


## Creating and training the model:

In [379]:
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, hidden_size)
        self.conv2 = GraphConv(hidden_size, num_classes)

    def forward(self, g, inputs):
        h = self.conv1(g, inputs)
        h = F.leaky_relu(h)
        h = self.conv2(g, h)
        return h

In [381]:
#embed = nn.Embedding(n_states, n_feats)  
#inputs = embed.weight
#labeled_nodes = torch.tensor([32, 27, 3, 4, 15, 20])  
#labels = torch.tensor([0, 0, 1, 1, 2, 2])  

labeled_nodes = torch.tensor(train_ind)  
labels = torch.tensor(train_labels) 

epochs = 200
net = GCN(n_feats, 12, 3)
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
all_logits = []

for epoch in range(epochs + 1):
    logits = net(G, feats)
    all_logits.append(logits.detach())
    logp = F.log_softmax(logits, dim=1)
    loss = F.nll_loss(logp[labeled_nodes], labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        train_acc = (np.array(y_true)[train_ind] == all_logits[-1].argmax(axis=1).numpy()[train_ind]).mean()
        test_acc = (np.array(y_true)[test_ind] == all_logits[-1].argmax(axis=1).numpy()[test_ind]).mean()
        #print(f'Epoch: {epoch}\nLoss: {round(loss.item(), 4)} Test Accuracy: {round(acc, 4)} Train Acc: {round(acc2, 4)}')
        print("Epoch: %3d Train Loss: %.4f Train Acc: %.4f Val Acc: %.4f" % (epoch, round(loss.item(), 4), round(train_acc, 4), round(test_acc, 4)))
        if round(train_acc, 4) == 1:#0.8214:
            break

Epoch:   0 Train Loss: 1.2599 Train Acc: 0.3333 Val Acc: 0.3214
Epoch:  20 Train Loss: 0.8555 Train Acc: 0.6667 Val Acc: 0.5357
Epoch:  40 Train Loss: 0.7012 Train Acc: 0.8333 Val Acc: 0.6786
Epoch:  60 Train Loss: 0.5699 Train Acc: 0.8333 Val Acc: 0.8571
Epoch:  80 Train Loss: 0.4657 Train Acc: 0.8333 Val Acc: 0.8571
Epoch: 100 Train Loss: 0.3883 Train Acc: 1.0000 Val Acc: 0.8571


In [180]:
test_acc = (np.array(y_true)[test_ind] == all_logits[-1].argmax(axis=1).numpy()[test_ind]).mean()
print(test_acc)
acc_dict['gcn'].append(test_acc)

0.7857142857142857


In [200]:
#list(map(lambda el: acc_dict[el].pop(), acc_dict))

In [181]:
acc_dict

{'svc': [0.39285714285714285,
  0.42857142857142855,
  0.35714285714285715,
  0.32142857142857145,
  0.39285714285714285,
  0.5,
  0.2857142857142857,
  0.32142857142857145,
  0.42857142857142855,
  0.5714285714285714],
 'knn': [0.32142857142857145,
  0.39285714285714285,
  0.35714285714285715,
  0.42857142857142855,
  0.32142857142857145,
  0.32142857142857145,
  0.2857142857142857,
  0.2857142857142857,
  0.35714285714285715,
  0.35714285714285715],
 'rf': [0.35714285714285715,
  0.6428571428571429,
  0.6428571428571429,
  0.5,
  0.5714285714285714,
  0.5714285714285714,
  0.5357142857142857,
  0.6785714285714286,
  0.5,
  0.6071428571428571],
 'gcn': [0.6071428571428571,
  0.6785714285714286,
  0.75,
  0.6428571428571429,
  0.6071428571428571,
  0.7142857142857143,
  0.6071428571428571,
  0.7142857142857143,
  0.5714285714285714,
  0.7857142857142857]}

In [263]:
for mod in acc_dict:
    print(mod, max(acc_dict[mod]), sum(acc_dict[mod]) / len(acc_dict[mod]))

svc 0.5714285714285714 0.4
knn 0.42857142857142855 0.34285714285714286
rf 0.6785714285714286 0.5607142857142857
gcn 0.8214 0.6714257142857142


## Results:

In [388]:
op_df = pd.DataFrame(all_logits[-1][:].numpy())
op_list = []

for i in range(n_states):
    op_list.append((all_logits[-1][i].numpy().argmax()))

op_df['preds'] = pd.Series(op_list)
op_df['State'] = states

In [389]:
final_df = pd.merge(op_df, case_data, on='State')#.sort_values('label', ascending=False)[['State', 'preds', 'label']]
(final_df['preds'] == final_df['label']).mean()

0.8823529411764706

In [390]:
final_df

Unnamed: 0,0,1,2,preds,State,Confirmed,label
0,8.336432,2.739926,-6.44502,0,Ladakh,11709,0
1,1.359403,1.620766,-5.881773,1,Jammu and Kashmir,145166,1
2,-10.149055,-0.705744,-4.284385,1,Himachal Pradesh,75587,1
3,-4.837475,0.020606,-4.378413,1,Punjab,290707,1
4,-2.374469,0.07992,-2.926488,1,Uttarakhand,121403,1
5,-6.507901,-0.980952,-2.242189,1,Haryana,342077,1
6,-20.968472,-3.912248,0.516029,2,Rajasthan,404355,2
7,-13.620948,-2.756685,0.532975,2,Uttar Pradesh,821054,2
8,-4.334144,-0.934465,-1.348792,1,Bihar,307557,1
9,-1.96825,-0.765931,-0.761814,2,Gujarat,384688,2


In [391]:
final_df['preds'].value_counts()

2    13
1    11
0    10
Name: preds, dtype: int64

In [392]:
final_df['label'].value_counts()

2    12
1    11
0    11
Name: label, dtype: int64

In [393]:
final_df[final_df['preds'] != final_df['label']]

Unnamed: 0,0,1,2,preds,State,Confirmed,label
13,-0.201365,0.32839,-2.808434,1,West Bengal,651508,2
25,17.630684,7.942176,-14.792749,0,Assam,222940,1
31,-4.254689,-1.394898,-0.063553,2,Puducherry,47108,0
32,-23.397354,-4.2997,0.356637,2,Chandigarh,32878,0


In [394]:
#!pip install chart_studio
import plotly.express as px
import chart_studio.plotly as py

In [395]:
st = '''Andhra Pradesh	15.91°N	79.74°E
Arunachal Pradesh	28.21°N	94.72°E
Assam	26.20°N	92.93°E
Bihar	25.09°N	85.31°E
Chhattisgarh	21.27°N	81.86°E
Goa	15.29°N	74.12°E
Gujarat	22.25°N	71.19°E
Haryana	29.05°N	76.08°E
Himachal Pradesh	31.10°N	77.17°E
Jammu and Kashmir	33.77 °N	76.57°E
Jharkhand	23.61°N	85.27°E
Karnataka	15.31°N	75.71°E
Kerala	10.85°N	76.27°E
Madhya Pradesh	22.97°N	78.65°E
Maharashtra	19.75°N	75.71°E
Manipur	24.66°N	93.90°E
Meghalaya	25.46°N	91.36°E
Mizoram	23.16°N	92.93°E
Nagaland	26.15°N	94.56°E
Odisha	20.95°N	85.09°E
Punjab	31.14°N	75.34°E
Rajasthan	27.02°N	74.21°E
Sikkim	27.53°N	88.51°E
Tamil Nadu	11.12°N	78.65°E
Tripura	23.94°N	91.98°E
Uttarakhand	30.06°N	79.01°E
Uttar Pradesh	26.84°N	80.94°E
West Bengal	22.98°N	87.85°E'''

In [396]:
arr = st.split('\n')
vals = []

for i, ar in enumerate(arr):
    d = arr[i].split('\t')
    vals.append([d[0], float(d[1][:-2].strip()), float(d[2][:-2].strip())])
    
vals.append(['Dadra and Nagar Haveli and Daman and Diu', 20.26657819, 73.0166178])
vals.append(['Ladakh', 34.2268, 77.5619])
vals.append(['Puducherry', 11.9416, 79.8083])
vals.append(['NCT of Delhi',28.7041, 77.1025])
vals.append(['Chandigarh', 30.7333, 76.7794])
vals.append(['Telangana', 18.1124, 79.0193])

In [397]:
res = []
for i in states:
    f = False
    for st in vals:
        if st[0] == i:
            res.append(st)
            f = True
            break
    if not f:
        print(i)

In [398]:
final_df['lat'] = [val[1] for val in res]
final_df['long'] = [val[2] for val in res]

In [399]:
fig = px.scatter_geo(final_df, 
                     lat="lat", 
                     lon="long", 
                     color='label', 
                     size=final_df['label']+1, 
                     scope='asia', 
                     center={'lat':19,'lon':-99}, 
                     width=800, 
                     text=final_df['State'],
                     projection="natural earth")

fig.show()

'''
Chandigarh
Nagaland
Odisha
Punjab
Maharashtra
Andhra Pradesh
'''

'\nChandigarh\nNagaland\nOdisha\nPunjab\nMaharashtra\nAndhra Pradesh\n'

In [411]:
fig = px.scatter_geo(final_df, 
                     lat="lat", 
                     lon="long", 
                     color='preds', 
                     size=final_df['preds']+1, 
                     scope='asia', 
                     center={'lat':19,'lon':-99}, 
                     width=800, 
                     text=final_df['State'],
                     projection="natural earth")

fig.show()