## Imports

In [1]:
import dgl
import numpy as np
import pandas as pd
import networkx as nx
import random
import itertools

from helper_functions import build_map_1

Using backend: pytorch


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GraphConv

## Creating the graph:

In [3]:
# CONSTANTS

n_states = 34

In [4]:
G = dgl.DGLGraph()
G.add_nodes(n_states)
u, v = build_map_1()
G = dgl.graph((u, v))



In [5]:
print(G.number_of_nodes())
print(G.number_of_edges())
print(G)

34
128
Graph(num_nodes=34, num_edges=128,
      ndata_schemes={}
      edata_schemes={})


In [5]:
nx_G = G.to_networkx().to_undirected()
pos = nx.kamada_kawai_layout(nx_G)

nx.draw(nx_G, pos, with_labels=True, node_color=[[.9, .3, .3]])

## Loading the data:

In [6]:
with open('./data/states_list.txt', 'r') as file:
    states = file.readlines()

for ind in range(len(states) - 1):
    states[ind] = states[ind][:-1]

In [7]:
df = pd.read_csv("./data/state_data.csv")
df.head(5)

Unnamed: 0,Rank,State or union territory,Population,National Share (%),Decadal growth(2001–2012),Rural population,Percent rural,Urban population,Percent urban,Area[16],Density[a],Sex ratio,Foreign Visits,Health Index
0,1,Uttar Pradesh,199812341,16.51,20.2%,155317278,77.73,44495063,22.27,"240,928 km2 (93,023 sq mi)",828.0,912,4745181,29.16
1,2,Maharashtra,112374333,9.28,20.0%,61556074,54.78,50818259,45.22,"307,713 km2 (118,809 sq mi)",365.0,929,5528704,64.53
2,3,Bihar,104099452,8.6,25.4%,92341436,88.71,11758016,11.29,"94,163 km2 (36,357 sq mi)",1102.0,918,1093141,32.42
3,4,West Bengal,91276115,7.54,13.8%,62183113,68.13,29093002,31.87,"88,752 km2 (34,267 sq mi)",1029.0,953,1656145,58.25
4,5,Madhya Pradesh,72626809,6.0,16.3%,52557404,72.37,20069405,27.63,"308,245 km2 (119,014 sq mi)",236.0,931,327958,38.69


In [99]:
actual_data = pd.read_csv("./data/actual_covid_data_25may.csv")
actual_data.iloc[9, 0] = 'NCT of Delhi'
actual_data = actual_data.drop([0, 31, 19, 1]).reset_index(drop=True)

In [100]:
case_data = actual_data[['State', 'Confirmed']]
case_data['label'] = 0

label_1 = case_data['Confirmed'].quantile(q=1/3)
label_2 = case_data['Confirmed'].quantile(q=2/3)

case_data.iloc[case_data.index[case_data['Confirmed'] >= label_1], 2] = 1
case_data.iloc[case_data.index[case_data['Confirmed'] >= label_2], 2] = 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [101]:
y_true = []
for state in states:
    y_true.append(case_data[case_data['State'] == state]['label'].values[0])
y_true = np.array(y_true)

## Encoding the features:

In [102]:
#embed = nn.Embedding(n_states, n_states)  

#x = torch.arange(0, n_states)
#G.ndata['feat'] = torch.nn.functional.one_hot(x).double()

#for i in range(n_states):
#    G.ndata['feat'][i] = (G.ndata['feat'][i] * df[df['State or union territory'] == states[i]]['National Share (%)'].values[0])
#    print(G.ndata['feat'][i])
# [21, 12, 5, 10, 27, 32]  [2, 1, 1, 2, 0, 0]

In [103]:
[i for i in range(len(y_true)) if y_true[i] == 0]

[0, 14, 23, 24, 26, 27, 28, 29, 30, 31, 32]

In [104]:
# original
train_ind = [32, 27, 16, 3, 15, 20]
train_labels = [0, 0, 1, 1, 2, 2]

train_ind = [32, 27, 4, 3, 15, 20]
train_labels = [0, 0, 1, 1, 2, 2]

#best
train_ind = [18, 12, 5, 10, 27, 32]  
train_labels = [2, 1, 1, 2, 0, 2]

#train_ind = [21, 12, 3, 10, 27, 32]  
#train_labels = [2, 1, 1, 2, 0, 0]
#random.sample([i for i in range(len(y_true)) if y_true[i] == 0], k=2) \
#train_ind = [32, 27] \
#        + random.sample([i for i in range(len(y_true)) if y_true[i] == 1], k=2) \
#        + random.sample([i for i in range(len(y_true)) if y_true[i] == 2], k=2)

train_labels = [y_true[i] for i in train_ind]

print(train_ind, train_labels)

test_ind = []
for i in range(n_states):
    if i not in train_ind:
        test_ind.append(i)

n_feats = 2
feats = torch.zeros(n_states, n_feats)
for i in range(n_states):
    
    density = df[df['State or union territory'] == states[i]]['Density[a]'].values[0]
    percent_urban = df[df['State or union territory'] == states[i]]['Percent urban'].values[0]
    health_index = df[df['State or union territory'] == states[i]]['Health Index'].values[0]
    foreign_visits = df[df['State or union territory'] == states[i]]['Foreign Visits'].values[0]
    percent_rural = df[df['State or union territory'] == states[i]]['Percent rural'].values[0]
    share = df[df['State or union territory'] == states[i]]['National Share (%)'].values[0]
    
    # adding the features we want: 
    feats[i] = (torch.Tensor([foreign_visits, density]))
    
feats = (feats - feats[train_ind, :].mean(axis=0)) / feats[train_ind, :].std(axis=0)

[18, 12, 5, 10, 27, 32] [2, 1, 1, 2, 0, 0]


In [105]:
our_data = feats[train_ind, :]

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
clf = LogisticRegression(random_state=32)

clf.fit(our_data, train_labels)

(clf.predict(feats)[test_ind] == y_true[test_ind]).mean()

0.5

In [106]:
for i in train_ind:
    print(states[i])

Karnataka
Jharkhand
Haryana
Madhya Pradesh
Nagaland
Chandigarh


In [107]:
print([i for i in range(len(states)) if states[i] == 'Himachal Pradesh'])
y_true[2]

[2]


1

## Creating and training the model:

In [108]:
from dgl.nn.pytorch.conv.sageconv import SAGEConv # tried it

In [109]:
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, hidden_size)
        self.conv2 = GraphConv(hidden_size, num_classes) #aggregator_type='lstm') 

    def forward(self, g, inputs):
        h = self.conv1(g, inputs)
        h = F.leaky_relu(h) #weight=torch.Tensor([0.01]))
        h = self.conv2(g, h)
        return h

In [112]:
#embed = nn.Embedding(n_states, n_feats)  
#inputs = embed.weight
#labeled_nodes = torch.tensor([32, 27, 3, 4, 15, 20])  
#labels = torch.tensor([0, 0, 1, 1, 2, 2])  

labeled_nodes = torch.tensor(train_ind)  
labels = torch.tensor(train_labels) 

epochs = 500
net = GCN(n_feats, 12, 3)
optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
all_logits = []

for epoch in range(epochs + 1):
    logits = net(G, feats)
    all_logits.append(logits.detach())
    logp = F.log_softmax(logits, dim=1)
    loss = F.nll_loss(logp[labeled_nodes], labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        train_acc = (np.array(y_true)[train_ind] == all_logits[-1].argmax(axis=1).numpy()[train_ind]).mean()
        test_acc = (np.array(y_true)[test_ind] == all_logits[-1].argmax(axis=1).numpy()[test_ind]).mean()
        print("Epoch: %3d Train Loss: %.4f Train Acc: %.4f Val Acc: %.4f" % (epoch, round(loss.item(), 4), round(train_acc, 4), round(test_acc, 4)))
        #if round(test_acc, 4) == 0.7857:
        #    break

Epoch:   0 Train Loss: 1.0908 Train Acc: 0.3333 Val Acc: 0.2500
Epoch:  20 Train Loss: 0.9089 Train Acc: 0.5000 Val Acc: 0.6429
Epoch:  40 Train Loss: 0.8432 Train Acc: 0.5000 Val Acc: 0.6786
Epoch:  60 Train Loss: 0.7922 Train Acc: 0.5000 Val Acc: 0.6429
Epoch:  80 Train Loss: 0.7493 Train Acc: 0.6667 Val Acc: 0.6786
Epoch: 100 Train Loss: 0.7124 Train Acc: 0.6667 Val Acc: 0.7143
Epoch: 120 Train Loss: 0.6791 Train Acc: 0.8333 Val Acc: 0.7143
Epoch: 140 Train Loss: 0.6488 Train Acc: 0.8333 Val Acc: 0.7500
Epoch: 160 Train Loss: 0.6212 Train Acc: 0.8333 Val Acc: 0.7500
Epoch: 180 Train Loss: 0.5957 Train Acc: 0.8333 Val Acc: 0.7143
Epoch: 200 Train Loss: 0.5719 Train Acc: 0.8333 Val Acc: 0.7143
Epoch: 220 Train Loss: 0.5495 Train Acc: 1.0000 Val Acc: 0.7143
Epoch: 240 Train Loss: 0.5284 Train Acc: 1.0000 Val Acc: 0.7143
Epoch: 260 Train Loss: 0.5085 Train Acc: 1.0000 Val Acc: 0.6786
Epoch: 280 Train Loss: 0.4896 Train Acc: 1.0000 Val Acc: 0.6786
Epoch: 300 Train Loss: 0.4719 Train Acc:

## Results:

In [61]:
op_df = pd.DataFrame(all_logits[-1][:].numpy())
op_list = []

for i in range(n_states):
    op_list.append((all_logits[-1][i].numpy().argmax()))

op_df['preds'] = pd.Series(op_list)
op_df['State'] = states

In [62]:
final_df = pd.merge(op_df, case_data, on='State')#.sort_values('label', ascending=False)[['State', 'preds', 'label']]
(final_df['preds'] == final_df['label']).mean()

0.7352941176470589

In [65]:
#final_df

In [66]:
final_df['preds'].value_counts()

2    14
0    12
1     8
Name: preds, dtype: int64

In [67]:
final_df['label'].value_counts()

2    12
1    11
0    11
Name: label, dtype: int64

In [68]:
final_df[final_df['preds'] != final_df['label']]

Unnamed: 0,0,1,2,preds,State,Confirmed,label
1,2.632194,0.71784,-3.105646,0,Jammu and Kashmir,145166,1
14,-0.174238,0.072962,-0.30518,1,Dadra and Nagar Haveli and Daman and Diu,4568,0
17,-0.619922,-0.804471,1.034925,2,Goa,66261,1
19,-2.134142,-0.786212,2.087626,2,Telangana,346331,1
25,4.477033,2.113843,-6.191648,0,Assam,222940,1


In [69]:
#!pip install chart_studio
import plotly.express as px
import chart_studio.plotly as py

In [70]:
st = '''Andhra Pradesh	15.91°N	79.74°E
Arunachal Pradesh	28.21°N	94.72°E
Assam	26.20°N	92.93°E
Bihar	25.09°N	85.31°E
Chhattisgarh	21.27°N	81.86°E
Goa	15.29°N	74.12°E
Gujarat	22.25°N	71.19°E
Haryana	29.05°N	76.08°E
Himachal Pradesh	31.10°N	77.17°E
Jammu and Kashmir	33.77 °N	76.57°E
Jharkhand	23.61°N	85.27°E
Karnataka	15.31°N	75.71°E
Kerala	10.85°N	76.27°E
Madhya Pradesh	22.97°N	78.65°E
Maharashtra	19.75°N	75.71°E
Manipur	24.66°N	93.90°E
Meghalaya	25.46°N	91.36°E
Mizoram	23.16°N	92.93°E
Nagaland	26.15°N	94.56°E
Odisha	20.95°N	85.09°E
Punjab	31.14°N	75.34°E
Rajasthan	27.02°N	74.21°E
Sikkim	27.53°N	88.51°E
Tamil Nadu	11.12°N	78.65°E
Tripura	23.94°N	91.98°E
Uttarakhand	30.06°N	79.01°E
Uttar Pradesh	26.84°N	80.94°E
West Bengal	22.98°N	87.85°E'''

In [71]:
arr = st.split('\n')
vals = []

for i, ar in enumerate(arr):
    d = arr[i].split('\t')
    vals.append([d[0], float(d[1][:-2].strip()), float(d[2][:-2].strip())])
    
vals.append(['Dadra and Nagar Haveli and Daman and Diu', 20.26657819, 73.0166178])
vals.append(['Ladakh', 34.2268, 77.5619])
vals.append(['Puducherry', 11.9416, 79.8083])
vals.append(['NCT of Delhi',28.7041, 77.1025])
vals.append(['Chandigarh', 30.7333, 76.7794])
vals.append(['Telangana', 18.1124, 79.0193])

In [72]:
res = []
for i in states:
    f = False
    for st in vals:
        if st[0] == i:
            res.append(st)
            f = True
            break
    if not f:
        print(i)

In [73]:
final_df['lat'] = [val[1] for val in res]
final_df['long'] = [val[2] for val in res]

In [74]:
fig = px.scatter_geo(final_df, 
                     lat="lat", 
                     lon="long", 
                     color='label', 
                     size=final_df['label']+1, 
                     scope='asia', 
                     center={'lat':19,'lon':-99}, 
                     width=800, 
                     text=final_df['State'],
                     projection="natural earth")

fig.show()
'''
Andha Pradesh 2
Jharkhand  1
Haryana 1
Madhya Pradesh 2
Nagaland 0
Punjab  0
'''

'\nAndha Pradesh 2\nJharkhand  1\nHaryana 1\nMadhya Pradesh 2\nNagaland 0\nPunjab  0\n'

In [75]:
fig = px.scatter_geo(final_df, 
                     lat="lat", 
                     lon="long", 
                     color='preds', 
                     size=final_df['preds']+1, 
                     scope='asia', 
                     center={'lat':19,'lon':-99},
                     width=800, 
                     text=final_df['State'],
                     projection="natural earth")

fig.show()