<a href="https://colab.research.google.com/github/rajeevsai12/SemanticsConversionAndDataGeneration/blob/main/RDFConversionModifiedFinalCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip3 install rdflib

import csv
import datetime
from urllib.parse import quote

from rdflib import Graph, Literal, Namespace, RDF, URIRef, RDFS
from rdflib.namespace import XSD

# Namespace under which the per-row resources read from the CSV files are minted.
data_ns = Namespace("http://example.org/data/")

# Single in-memory graph that receives both the schema and the instance triples.
g = Graph()
g.bind("data", data_ns)

# The vocabulary terms sit directly under http://example.org/ (not under data_ns).
_VOCAB = "http://example.org/"

# Classes
COVID = URIRef(_VOCAB + "COVID")
COVID_Deaths = URIRef(_VOCAB + "COVID_Deaths")
Deaths_from_All_Causes = URIRef(_VOCAB + "Deaths_from_All_Causes")
State = URIRef(_VOCAB + "State")
County = URIRef(_VOCAB + "County")

# Properties
hasState = URIRef(_VOCAB + "hasState")
hasCounty = URIRef(_VOCAB + "hasCounty")
hasCOVID = URIRef(_VOCAB + "hasCOVID")
hasDeaths = URIRef(_VOCAB + "hasDeaths")
hasFootnote = URIRef(_VOCAB + "hasFootnote")
hasSex = URIRef(_VOCAB + "hasSex")
hasAgeYears = URIRef(_VOCAB + "hasAgeYears")
hasTotalDeaths = URIRef(_VOCAB + "hasTotalDeaths")
hasCOVIDDeaths = URIRef(_VOCAB + "hasCOVIDDeaths")
hasFIPSCode = URIRef(_VOCAB + "hasFIPSCode")
hasUrbanRuralCode = URIRef(_VOCAB + "hasUrbanRuralCode")
hasStartDate = URIRef(_VOCAB + "hasStartDate")
hasEndDate = URIRef(_VOCAB + "hasEndDate")

# Schema: rdfs:domain / rdfs:range declarations for the vocabulary.
#
# The declarations are attached to the same property IRIs that the CSV loaders
# below put on the instance data (hasCOVIDDeaths, hasDeaths, ...). Declaring
# them on data_ns.<name> describes different IRIs under
# http://example.org/data/ (and "hasCovidDeaths" also differs in case), which
# leaves the properties that are actually used without any schema.
#
# Each row is (property, domains, ranges):
#   * hasCOVIDDeaths links a County to a node typed COVID, and also carries an
#     integer count on COVID_Deaths nodes.
#   * hasDeaths links a County to a Deaths_from_All_Causes node, and also
#     carries the integer count on that node.
#   * hasStartDate / hasEndDate only ever take date literals, so their range is
#     xsd:date and not a class.
#
# NOTE(review): RDFS reads several rdfs:domain / rdfs:range statements on one
# property as conjunctive. hasCOVIDDeaths and hasDeaths are used both as links
# and as integer-valued properties, so both ranges are kept to mirror the
# data; splitting each into two properties would be the cleaner model.
_PROPERTY_SCHEMA = (
    (hasCOVIDDeaths, (County, COVID_Deaths), (COVID, XSD.integer)),
    (hasDeaths, (County, Deaths_from_All_Causes), (Deaths_from_All_Causes, XSD.integer)),
    (hasStartDate, (County, COVID_Deaths), (XSD.date,)),
    (hasEndDate, (County, COVID_Deaths), (XSD.date,)),
    (hasCounty, (County,), (XSD.string,)),
    (hasUrbanRuralCode, (County,), (XSD.string,)),
    (hasState, (County,), (XSD.string,)),
    (hasFIPSCode, (County,), (XSD.string,)),
    (hasSex, (COVID_Deaths,), (XSD.string,)),
    (hasAgeYears, (COVID_Deaths,), (XSD.string,)),
    (hasTotalDeaths, (COVID_Deaths,), (XSD.integer,)),
    (hasCOVID, (COVID,), (XSD.integer,)),
)

for _prop, _domains, _ranges in _PROPERTY_SCHEMA:
    for _domain in _domains:
        g.add((_prop, RDFS.domain, _domain))
    for _range in _ranges:
        g.add((_prop, RDFS.range, _range))

# Read the first CSV file: every row of Age.csv becomes one COVID_Deaths
# resource carrying its sex, age group, reporting period and death counts.
# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields are parsed correctly.
with open('Age.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Keep only the calendar date: Literal(date) is typed xsd:date, whereas
        # a datetime is emitted as xsd:dateTime with a meaningless midnight.
        data_as_of = datetime.datetime.strptime(row['Data as of'], '%m/%d/%Y').date()
        start_date = datetime.datetime.strptime(row['Start Date'], '%m/%d/%Y').date()
        end_date = datetime.datetime.strptime(row['End Date'], '%m/%d/%Y').date()
        sex = row['Sex']
        age_years = row['Age Years']

        # Create an RDF node for each row. The local name is percent-encoded
        # because field values may contain spaces or other characters that are
        # not legal inside an IRI; the "_" separators keep the parts apart.
        local_name = quote(f"{data_as_of.isoformat()}_{sex}_{age_years}", safe="")
        node = URIRef(data_ns + local_name)

        # Add triples to the graph
        g.add((node, RDF.type, COVID_Deaths))
        g.add((node, hasSex, Literal(sex)))
        g.add((node, hasAgeYears, Literal(age_years)))
        g.add((node, hasStartDate, Literal(start_date)))
        g.add((node, hasEndDate, Literal(end_date)))

        # A blank count cell is skipped instead of crashing int(''), matching
        # how the County.csv loader treats blank counts.
        if row['Total deaths']:
            g.add((node, hasTotalDeaths, Literal(int(row['Total deaths']))))
        if row['COVID-19 Deaths']:
            g.add((node, hasCOVIDDeaths, Literal(int(row['COVID-19 Deaths']))))

# Read the second CSV file: every row of County.csv becomes one County
# resource, plus optional child nodes for the COVID death count, the all-cause
# death count and the footnote when those cells are filled in.
# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields are parsed correctly.
with open('County.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Keep only the calendar date: Literal(date) is typed xsd:date, whereas
        # a datetime is emitted as xsd:dateTime with a meaningless midnight.
        data_as_of = datetime.datetime.strptime(row['Date as of'], '%m/%d/%Y').date()
        start_date = datetime.datetime.strptime(row['Start Date'], '%m/%d/%Y').date()
        end_date = datetime.datetime.strptime(row['End Date'], '%m/%d/%Y').date()
        state = row['State']
        county_name = row['County name']
        fips_county_code = row['FIPS County Code']

        # Blank cells: the code falls back to "Unknown", the counts to None
        # (None means "emit no node for this count").
        urban_rural_code = row['Urban Rural Code'] or "Unknown"
        covid_deaths = (
            int(row['Deaths involving COVID-19'])
            if row['Deaths involving COVID-19']
            else None
        )
        all_deaths = (
            int(row['Deaths from All Causes'])
            if row['Deaths from All Causes']
            else None
        )

        # Create an RDF node for each row. The local name is percent-encoded
        # because field values may contain spaces or other characters that are
        # not legal inside an IRI; the child nodes reuse the same base IRI
        # with a fixed suffix.
        base_iri = data_ns + quote(
            f"{data_as_of.isoformat()}_{state}_{county_name}", safe=""
        )
        node = URIRef(base_iri)

        # Add triples to the graph
        g.add((node, RDF.type, County))
        g.add((node, hasState, Literal(state)))
        g.add((node, hasCounty, Literal(county_name)))
        g.add((node, hasFIPSCode, Literal(fips_county_code)))
        g.add((node, hasUrbanRuralCode, Literal(urban_rural_code)))
        g.add((node, hasStartDate, Literal(start_date)))
        g.add((node, hasEndDate, Literal(end_date)))

        if covid_deaths is not None:
            covid_node = URIRef(base_iri + "_COVID")
            g.add((covid_node, RDF.type, COVID))
            g.add((covid_node, hasCOVID, Literal(covid_deaths)))
            g.add((node, hasCOVIDDeaths, covid_node))

        if all_deaths is not None:
            deaths_node = URIRef(base_iri + "_deaths")
            g.add((deaths_node, RDF.type, Deaths_from_All_Causes))
            g.add((deaths_node, hasDeaths, Literal(all_deaths)))
            g.add((node, hasDeaths, deaths_node))

        if row['Footnote']:
            footnote_node = URIRef(base_iri + "_footnote")
            # The footnote text is stored with rdf:value. rdf:type must point
            # at a class, so using the text literal as its object is invalid.
            g.add((footnote_node, RDF.value, Literal(row['Footnote'])))
            g.add((node, hasFootnote, footnote_node))

# Write the complete graph (schema plus instance data) to disk as RDF/XML.
g.serialize(
    format='xml',
    destination='FinalSemanticFile.rdf',
)
