In [29]:
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pathlib

# Section 1: Exploratory Data Analysis
#### In this Section we will load and examine our datasets for analysis.

## Updating Our Data:
We will first start by checking whether our data has been updated.
We are updating data daily so we can track 
the accuracy of our model, along with potential changes
we will need to make in terms of identifying features and general EDA.

In [30]:
## Path to the locally stored daily report (the file to delete once it is stale)
path_to_daily_report = '~/Desktop/dataFinal/data/05-05-2020.csv'

# Daily reports are named MM-DD-YYYY.csv, so the file stem is the report date.
# Parse the complete date instead of a single digit so the check keeps working
# for two-digit days and across month/year boundaries.
report_file = pathlib.PurePosixPath(path_to_daily_report)
daily_report_day = datetime.datetime.strptime(report_file.stem, '%m-%d-%Y').date()

# NOTE: despite its name, `yesterday` is the day BEFORE yesterday. This notebook
# treats a report dated (today - 1 day) as current, so a report dated
# (today - 2 days) or earlier is stale.
run_date = datetime.date.today()
yesterday = run_date - datetime.timedelta(days=2)

#If the daily report file is stale, data has not been updated today: remove the old data and update HTTPS
error_updated = 'Daily report paths already updated, no need to remove'
error_not_updated = 'Daily report paths not updated for todays data, update "path_to_daily_report" and remove from repo'
error_code = ""
if daily_report_day > yesterday:
    error_code = error_updated
    print(error_updated)
else:
    error_code = error_not_updated
    print(error_not_updated)
    # Rebuild only the file name (never a blanket str.replace, which would also
    # rewrite matching digits in the month, year or directory names).
    latest_report_day = run_date - datetime.timedelta(days=1)
    latest_report_name = latest_report_day.strftime('%m-%d-%Y') + report_file.suffix
    path_to_daily_report_updated = str(report_file.with_name(latest_report_name))
    print("")
    print("Updated URL: " + path_to_daily_report_updated)



Daily report paths already updated, no need to remove


In [31]:
if (error_code == error_updated):
    print(error_updated)
else:
    # Remove the stale local copies so fresh ones can be downloaded.
    # The daily report is taken from `path_to_daily_report` instead of a
    # hard-coded date, so the file that gets deleted is always the one the
    # staleness check above was run against.
    stale_files = [
        path_to_daily_report,
        '~/Desktop/dataFinal/data/time_series_covid19_confirmed_US.csv',
        '~/Desktop/dataFinal/data/time_series_covid19_deaths_US.csv',
    ]
    for stale_file in stale_files:
        try:
            # expanduser() resolves the leading "~" that the shell used to expand.
            pathlib.Path(stale_file).expanduser().unlink()
        except FileNotFoundError:
            # Already gone: same outcome as `rm -f`, nothing to do.
            pass
        

Daily report paths already updated, no need to remove


# Load Most Recent Data into 'data/' repository with wget
#### We are using wget to load the most recent data so our model will run on larger datasets and produce more accurate predictions

In [32]:
# Use the raw-content host: the github.com "/blob/" address serves an HTML page,
# not the CSV itself.
todays_report_https = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/05-05-2020.csv'

# The last URL segment is MM-DD-YYYY.csv; parse the whole date so the check
# works for any day of the month and across month/year boundaries.
https_report_name = todays_report_https.rsplit('/', 1)[-1]
https_report_date = datetime.datetime.strptime(https_report_name.rsplit('.', 1)[0], '%m-%d-%Y').date()

# This notebook treats the report dated one day before the run date as current.
today = datetime.date.today()
latest_report_date = today - datetime.timedelta(days=1)
if https_report_date != latest_report_date:
    print('Update date for daily_report https')
else:
    print('todays_report_https updated, please paste in loading cell')


todays_report_https updated, please paste in loading cell


In [18]:
!wget https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv -P ~/Desktop/dataFinal/data
    
!wget https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv -P ~/Desktop/dataFinal/data
    
!wget https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports_us/05-05-2020.csv -P ~/Desktop/dataFinal/data
   

--2020-05-06 13:41:12--  https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘/Users/ryanhurst/Desktop/dataFinal/data/time_series_covid19_confirmed_US.csv’

time_series_covid19     [ <=>                ]  68.33K  --.-KB/s    in 0.06s   

2020-05-06 13:41:13 (1.07 MB/s) - ‘/Users/ryanhurst/Desktop/dataFinal/data/time_series_covid19_confirmed_US.csv’ saved [69971]

--2020-05-06 13:41:13--  https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Lengt

# Examining Our Files:
### In order to perform EDA, we need to load our datasets into the notebook. 
###### Once we have our files, we need to examine them and ask the following questions:
   * How much data do we have and how has that grown?
   * How is the data structured, formatted, and organized?
   * What fields do we have to analyze in each record?

In [33]:
# We will load our data and use the methods provided below to 
# create dataframes to assist in EDA

In [35]:
from pathlib import Path
import bz2
import os

In [36]:
def list_files(directory):
    """
    List every entry found directly inside a directory.

    directory: the directory to inspect, given as a string
        (for example 'data/')

    Returns a list of pathlib.Path objects, one per entry produced by
    Path.iterdir(), in the order iterdir() yields them.
    """
    return [entry for entry in Path(directory).iterdir()]
        
    
def get_file_size(file_name):
    """
    Look up a file's size.

    file_name: path to the file (string or pathlib.Path)

    Returns the st_size field of the file's stat result.
    """
    file_stats = Path(file_name).stat()
    return file_stats.st_size
    
    

def get_linecount_bz2(file_name):
    """
    Count the lines stored in a bz2-compressed file.

    file_name: path to the .bz2 file

    Returns the number of lines obtained by iterating over the
    decompressed stream.
    """
    with bz2.open(file_name, "r") as compressed:
        return sum(1 for _line in compressed)
            


### To guide our interpretation of incoming data, we will create a DataFrame that shows us the name, size, and line count of each updated file

In [46]:
# Build one record (name, size, linecount) per data file in data/.
# Both bz2 archives and the plain .csv downloads are inspected; every other
# entry is skipped.
info = []
for f in list_files("data/"):
    name = str(f)
    if name[-3:] == "bz2":
        linecount = get_linecount_bz2(f)
    elif name.endswith(".csv"):
        # Plain-text file: count lines directly, in binary mode so the file's
        # encoding does not matter.
        with open(f, "rb") as csv_file:
            linecount = sum(1 for _ in csv_file)
    else:
        continue
    info.append({"name": name, "size": get_file_size(f), "linecount": linecount})

# Explicit columns keep sort_values("size") from raising KeyError when no
# matching file was found and `info` is empty.
file_info = pd.DataFrame(info, columns=["name", "size", "linecount"]).sort_values("size")
file_info

  """Entry point for launching an IPython kernel.


Unnamed: 0,0
0,<!DOCTYPE html>
1,"<html lang=""en"">"
2,<head>
3,"<meta charset=""utf-8"">"
4,"<link rel=""dns-prefetch"" href=""https://github...."


### Reading in the files:
##### Now that we have an idea of the new data, we will load it in for further EDA

In [49]:
# NOTE(review): these loads are currently disabled. The daily-report file name
# below (05-04-2020) differs from the 05-05-2020 report referenced by the
# download cell, and running the load raised FileNotFoundError for it —
# confirm the file name before re-enabling.
#US_daily_report = pd.read_csv(r"data/05-04-2020.csv")
#US_confirmed_deaths = pd.read_csv(r"data/time_series_covid19_deaths_US.csv")
#US_confirmed_cases = pd.read_csv(r"data/time_series_covid19_confirmed_US.csv")


FileNotFoundError: [Errno 2] File data/05-04-2020.csv does not exist: 'data/05-04-2020.csv'

# Identifying Issues with our Data
### For each data set, we will load data and identify issues in our data that will be used for cleaning

## Cleaning Data:
#### We will use the issues we identified with our data to clean it


## Joining Tables: (transformation for EDA process)
#### To form a predictive analysis on our data, we will need to manipulate the tables to provide useful information
###### Tables we are joining and why:
   * Table One
   * Table Two
   * Etc...
   

## EDA Visualization

#### In this section we will finally make observations about our transformed data and form visualizations to show said observations.

#### ******THIS SECTION SHOULD BE LONG --- SERVES TO VISUALIZE SOME OF THE FEATURES WE USE FOR OUR MODEL

## Training Validation Split
##### Split Data for model selection

# Model Selection: 
### What models are we going to use & Why?