In [5]:
import os

import altair as alt
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from tqdm import tqdm

In [6]:
embed_dim = 16

l = []
for i in range(0, embed_dim):
    l.append("embed_"+str(i))

In [7]:
class Neo4jConnection:
    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print('Failed to create the driver:', e)
    def close(self):
        if self.__driver is not None:
            self.__driver.close()
    def query(self, query, parameters=None, db=None):
        assert self.__driver is not None, 'Driver not initialized!'
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query, parameters))
        except Exception as e:
            print('Query failed:', e)
        finally:
            if session is not None:
                session.close()
        return response

In [10]:
data = pd.read_csv(r'2022-08-13_State.csv')
data.rename(columns={'Date': 'date'}, inplace=True)
data.rename(columns={'Deaths': 'new_deaths'}, inplace=True)
data.rename(columns={'Confirmed': 'confirmed'}, inplace=True)
data.rename(columns={'People_Tested': 'people_tested'}, inplace=True)
data.rename(columns={'People_Hospitalized': 'people_hospitalized'}, inplace=True)
data = data.loc[data['Province_State'] == "New York"]

data['date'] = pd.to_datetime(data['date'])
data['date'] = [d.date() for d in data['date']]
data.reset_index(inplace=True, drop= True)
data = data.sort_values(by="date")
data.columns
data = data[['date','new_deaths','confirmed', 'people_tested', 'people_hospitalized']]
data[['new_deaths']] = data[['new_deaths']].diff().fillna(data)
data[['confirmed']] = data[['confirmed']].diff().fillna(data)
data[['people_tested']] = data[['people_tested']].diff().fillna(data)
data[['people_hospitalized']] = data[['people_hospitalized']].diff().fillna(data)


col = len(data.columns)
observed = data[['date','new_deaths']]
data = data.iloc[1:]
data['new_deaths'] = data['new_deaths'].abs()
observed['new_deaths'] = data['new_deaths'].abs()
data.reset_index(inplace=True, drop= True)

uri = 'bolt://localhost:7687'
pwd ='Covid-19KG'
conn = Neo4jConnection(uri=uri, user='neo4j', pwd=pwd)
conn.query('MATCH (n:State) RETURN COUNT(n) AS count')

for i in tqdm(range(0, data.shape[0])):
    if(i<=10):
        query = '''Match (s:State) -[:REPORTED_IN]- (c:Reports)
            Match (s:State) - [:HAS_MOBILITY] - (m:Mobility)
            Match (s:State) - [:HAS_ECONOMICS]-(e:Economics) -[:HAS_INCOME] - (i:Income)
            Match (s:State) - [:HAS_ECONOMICS]-(e:Economics) -[:HAS_EMPLOYMENT] -(e2:Employment)
            Match (s:State) - [:HAS_ECONOMICS]-(e:Economics) -[:HAS_HEALTH_INSURANCE] - (h:HealthInsurance)
            Match (s:State) - [:HAS_ECONOMICS]-(e:Economics) - [:HAS_COMMUTING] - (c3:Commuting)
            Match (s:State) - [:HAS_SOCIAL_CHARACTERISTICS] - (s2:SocialCharacteristics) - [:HAS_EDUCATION] - (e3:Education)
            Match (s:State) - [:HAS_SOCIAL_CHARACTERISTICS]- (s2:SocialCharacteristics) - [:HAS_COMPUTERS] - (c2:Computers)
            where c.date = date('''+'\''+str(data.iloc[i,0])+'\''+''') and m.date = date('''+'\''+str(data.iloc[i,0])+'\''+''')
            return c.embedding AS Cases, s.name  as Name, s.fips as FIPS
            '''
        df = pd.DataFrame([dict(_) for _ in conn.query(query)])
        embd = df[df['Name'] == 'Georgia']['Cases'].values[0]
        for i_df in range(0,embed_dim):
            data.at[i, l[i_df]] = embd[i_df]
            
    elif(i>10):
        query = '''Match (s:State) -[:REPORTED_IN]- (c:Reports)
            Match (s:State) - [:HAS_MOBILITY] - (m:Mobility)
            Match (s:State) - [:HAS_ECONOMICS]-(e:Economics) -[:HAS_INCOME] - (i:Income)
            Match (s:State) - [:HAS_ECONOMICS]-(e:Economics) -[:HAS_EMPLOYMENT] -(e2:Employment)
            Match (s:State) - [:HAS_ECONOMICS]-(e:Economics) -[:HAS_HEALTH_INSURANCE] - (h:HealthInsurance)
            Match (s:State) - [:HAS_ECONOMICS]-(e:Economics) - [:HAS_COMMUTING] - (c3:Commuting)
            Match (s:State) - [:HAS_SOCIAL_CHARACTERISTICS] - (s2:SocialCharacteristics) - [:HAS_EDUCATION] - (e3:Education)
            Match (s:State) - [:HAS_SOCIAL_CHARACTERISTICS]- (s2:SocialCharacteristics) - [:HAS_COMPUTERS] - (c2:Computers)
            where c.date >= date('''+'\''+str(data.iloc[i-10,0])+'\''+''') and c.date <= date('''+'\''+str(data.iloc[i,0])+'\''+''') 
            and   m.date >= date('''+'\''+str(data.iloc[i-10,0])+'\''+''') and m.date <= date('''+'\''+str(data.iloc[i,0])+'\''+''')
            return c.embedding AS Cases, s.name as Name, s.fips as FIPS
            '''
        df = pd.DataFrame([dict(_) for _ in conn.query(query)])
        embd = df[df['Name'] == 'Georgia']['Cases'].values[0]
        for i_df in range(0,embed_dim):
            data.at[i, l[i_df]] = embd[i_df]
        
data.to_csv('GA_with_embeddings_v2.csv', index=False)


100%|█████████████████████████████████████████| 852/852 [15:30<00:00,  1.09s/it]


In [17]:
# Show the frame built from the most recent embedding query in the loop above
# (it holds rows for every state returned, not only the one that was selected).
df

Unnamed: 0,Cases,Name,FIPS
0,"[0.21844136500195344, 0.20341457760235743, 0.2...",Alabama,01
1,"[0.21844136500195344, 0.20341457760235743, 0.2...",Alabama,01
2,"[0.21844136500195344, 0.20341457760235743, 0.2...",Alabama,01
3,"[0.21844136500195344, 0.20341457760235743, 0.2...",Alabama,01
4,"[0.21844136500195344, 0.20341457760235743, 0.2...",Alabama,01
...,...,...,...
6166,"[0.21203908485637726, 0.19571728294464733, 0.2...",Wyoming,56
6167,"[0.21203908485637726, 0.19571728294464733, 0.2...",Wyoming,56
6168,"[0.21203908485637726, 0.19571728294464733, 0.2...",Wyoming,56
6169,"[0.21203908485637726, 0.19571728294464733, 0.2...",Wyoming,56
