# Qri Clean/Maintain Notebook

This notebook is structured to streamline the dataset maintenance workflow.

In [1]:
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import subprocess
from io import StringIO
import os
from datetime import timedelta,date

## Defining the Qri class object
This object contains functions for basic Qri tasks such as loading the dataset body into a pandas DataFrame, initializing a new dataset, and saving & publishing a dataset.

In [2]:
class Qri():
    """Small helper around the `qri` command-line tool for one dataset.

    The `get_*` methods run `qri get body` and parse its output with pandas.
    `save_body` and `publish` do NOT run anything: they only return the
    command string, which the notebook then executes with `!{...}`.
    """

    ## initialize the working dataset
    def __init__(self, dataset):
        """Remember the dataset reference (e.g. 'me/my_dataset') used by every command."""
        self.dataset = dataset

    def _get_body(self):
        """Run `qri get body <dataset>` and return its stdout as a text buffer.

        Raises:
            subprocess.CalledProcessError: if the qri command exits non-zero.
        """
        # An argument list (no shell=True) hands the dataset name to qri
        # verbatim, so it is never interpreted by a shell.
        raw = subprocess.check_output(['qri', 'get', 'body', self.dataset])
        return StringIO(raw.decode("utf-8"))

    ## to get the CSV Body of a dataset
    def get_csv(self):
        """Return the dataset body parsed as CSV into a DataFrame."""
        return pd.read_csv(self._get_body())

    ## to get the JSON Body of a dataset
    def get_json(self):
        """Return the dataset body parsed as JSON into a DataFrame."""
        return pd.read_json(self._get_body())

    ## save the body of a dataset passing the filename and the working dir (if different from current)
    def save_body(self, file, wdir=None):
        """Build the `qri save --body` command string for this dataset.

        Args:
            file: body file name, inserted into the command as given.
            wdir: directory to change into before returning; None (the
                default) leaves the current working directory untouched.

        Returns:
            The command string; it is not executed here.
        """
        # The default is resolved at call time. A `wdir=os.getcwd()` default
        # would be frozen when the class is defined and would silently chdir
        # back to that stale directory on every call without `wdir`.
        if wdir is not None:
            os.chdir(wdir)
        return f"qri save --body {file} {self.dataset}"

    ## publish dataset
    def publish(self, wdir=None):
        """Build the `qri publish` command string for this dataset.

        Args:
            wdir: directory to change into before returning; None (the
                default) leaves the current working directory untouched.

        Returns:
            The command string; it is not executed here.
        """
        if wdir is not None:
            os.chdir(wdir)
        return f"qri publish {self.dataset}"

### Note:
When using the save or publish functions, the returned string needs to be placed in curly brackets preceded by an exclamation point. This is how Jupyter Notebook runs terminal commands.

#### Example:

In [None]:
# !{Qri('dataset_name').publish()}

### Loading the data

This dataset consists of aggregated CSV files. The function below reads each CSV into a DataFrame and collects the DataFrames in a list. The list is then concatenated into a single DataFrame and returned.

In [3]:
def read_and_concat(month, data_dir='/home/xristos/datasets/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports'):
    """Read one daily-report CSV per entry of `month` and stack them into one DataFrame.

    Args:
        month: iterable of file stems; each entry `day` is read from
            `<data_dir>/<day>.csv`.
        data_dir: directory holding the daily report files. Defaults to the
            location this notebook was originally written against.

    Returns:
        A single DataFrame with the rows of every file, in the order given,
        re-indexed from 0.

    Raises:
        ValueError: if `month` is empty (there is nothing to concatenate).
        FileNotFoundError: if a day's CSV file does not exist.
    """
    frames = [pd.read_csv(os.path.join(data_dir, f'{day}.csv')) for day in month]
    if not frames:
        # pd.concat would raise ValueError here too; say why in notebook terms.
        raise ValueError('read_and_concat: `month` is empty, no daily reports to read')
    return pd.concat(frames, ignore_index=True)

In [6]:
## list of the 7 report dates ending 2020-07-06, oldest first, as MM-DD-YYYY strings for the concat function
dates = [
    (date(2020, 7, 6) - timedelta(days=back)).strftime("%m-%d-%Y")
    for back in reversed(range(7))
]

In [9]:
## load every daily report named in `dates` into one combined dataframe
df = read_and_concat(dates)
df.tail() ## show the last rows to confirm the latest report date was loaded

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incidence_Rate,Case-Fatality_Ratio
26559,,,,Yemen,2020-07-07 04:34:00,15.552727,48.516388,1284,345,575,364.0,Yemen,4.304973,26.869159
26560,,,,Zambia,2020-07-07 04:34:00,-13.133897,27.849332,1632,30,1348,254.0,Zambia,8.877306,1.838235
26561,,,,Zimbabwe,2020-07-07 04:34:00,-19.015438,29.154857,734,9,197,528.0,Zimbabwe,4.938462,1.226158
26562,41069.0,Wheeler,Oregon,US,2020-07-06 19:33:59,44.726982,-120.028143,0,0,0,0.0,"Wheeler, Oregon, US",0.0,
26563,,,Unknown,Chile,2020-07-03 15:33:50,,,0,0,109,-109.0,"Unknown, Chile",,


### Creating CSV file from DataFrame

In [10]:
## write the combined frame without the index column; the header row is the frame's own column names
df.to_csv('~/datasets/covid-jh/july/july_daily_covid19_jh.csv', index=False)

### Save and Publish with Qri

In [12]:
## initialize the name of the dataset in the Qri object
ds = Qri('me/july_daily_covid19_jh')
## create save and publish strings for terminal operations
## (each call also changes the working directory to `wdir`; nothing is sent to qri
## until the returned strings are run with `!{...}` in the cells below)
save = ds.save_body(file='july_daily_covid19_jh.csv',wdir='/home/xristos/datasets/covid-jh/july/')
publish = ds.publish(wdir='/home/xristos/datasets/covid-jh/july/')

In [13]:
## Run the save command built above.
## Note: a trailing semicolon does not hide the output of a `!` command (the `;` is passed to the shell).
## To keep long output out of the notebook, capture it instead, e.g. `save_log = !{save}`.
!{save}

[37m⠋[0[K[K[37m⠙[0[K[K[37m⠚[0[K[K[37m⠒[0[K[K[37m⠂[0[K[K[37m⠂[0[K[K[37m⠒[0[K[K[37m⠲[0[K[K[37m⠴[0[K[K[37m⠦[0[K[K[37m⠖[0[K[K[37m⠒[0[K[K[37m⠐[0[K[K[37m⠐[0[K[K[37m⠒[0[K[K[32mdataset saved: xristosk/july_daily_covid19_jh@/ipfs/QmfQyMBpKXnpXFV2qj5CxqtmVtDNDBHvFjUNogdcsd83VQ[0m


In [14]:
## Run the publish command built above
!{publish}

0/21 blocks transferred
1/21 blocks transferred
2/21 blocks transferred
3/21 blocks transferred
4/21 blocks transferred
5/21 blocks transferred
6/21 blocks transferred
7/21 blocks transferred
8/21 blocks transferred
9/21 blocks transferred
10/21 blocks transferred
11/21 blocks transferred
12/21 blocks transferred
13/21 blocks transferred
14/21 blocks transferred
15/21 blocks transferred
16/21 blocks transferred
17/21 blocks transferred
18/21 blocks transferred
19/21 blocks transferred
20/21 blocks transferred
21/21 blocks transferred
done!
published dataset xristosk/july_daily_covid19_jh@QmRTirNhEPZVidxLZkjVFPMzM4M3gxddcTA1sBkdasGtw3/ipfs/QmfQyMBpKXnpXFV2qj5CxqtmVtDNDBHvFjUNogdcsd83VQ
