# GITHUB/PYTHON INTERFACE

## 1 - RUNNING THE PYTHON SCRIPT

In [4]:
# Execute the pycovid.py script from within this notebook session.
%run pycovid.py

No existing files were updated
0 new files found. No further action necessary


# Script to generate a new Raw Dataset

In [4]:
import os
import pandas as pd
from datetime import datetime
from numpy import where
import pycovidfunc as cv

# Do not truncate columns or rows when a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# The configuration table is indexed by its 'var' column; empty cells are
# replaced by '-'.
if 'config.csv' in os.listdir(os.getcwd()):
    config = pd.read_csv('config.csv',index_col='var').fillna('-')
else:
    raise FileNotFoundError('No configuration file "config.csv" found.')

who_data_dir = config.loc['who_data_dir'].path
# Build a filtered list instead of calling remove() while iterating:
# removing during iteration skips the entry that follows each removed one,
# so two consecutive non-CSV entries left the second one in the list.
who_file_list = [name for name in os.listdir(who_data_dir)
                 if name.endswith('.csv')]

try:
    df = cv.raw_data_formatter(who_file_list, who_data_dir)
    new_date = datetime.now().strftime('%m-%d-%Y')

    raw_data_path = config.loc['raw_data'].path
    # One .loc[row, column] assignment writes into `config` itself; the
    # chained form `config.loc[row].column = value` may only modify a
    # temporary copy of the row.
    config.loc['raw_data', 'last_update'] = new_date

    df.to_csv(raw_data_path, index=False)
    config.to_csv('config.csv')

    print('new database generated succesfully!')
except Exception as exc:
    # Still best-effort, but KeyboardInterrupt/SystemExit are no longer
    # swallowed and the cause of the failure is shown.
    print('process aborted. No new database generated.')
    print('reason: {}'.format(exc))

new database generated succesfully!


# Script to generate the consolidated dataset

In [5]:
# Load the raw dataset back from the CSV file at raw_data_path.
df=pd.read_csv(raw_data_path)

In [6]:
# Pass the raw data through cv.world_data_formatter and write the result
# as a JSON list of row records to the path configured for 'formatted_data'.
country_report = cv.world_data_formatter(df)
country_report.to_json(
    config.loc['formatted_data', 'path'],
    orient='records',
)
print('World data report created succesfully!')

KeyboardInterrupt: 

## 2 - PYTHON PYCOVID.PY SCRIPT

In [1]:
import os
import sys
import pandas as pd
from datetime import datetime, timedelta
from pycovid import pycovidfunc as cv

# Location of the Git command-line client on this machine.
git_dir = r"C:\Program Files\Git\cmd"
git_bin = os.path.join(git_dir, "git")

# Assign through os.environ so the value is visible to Python code that
# reads os.environ. os.putenv() leaves os.environ untouched, and
# os.environ.putenv was an undocumented attribute that newer Python
# versions no longer provide (AttributeError).
os.environ["GIT_PYTHON_GIT_EXECUTABLE"] = git_bin

# Making sure that it is first in PATH
# NOTE(review): sys.path is Python's module search path and does not affect
# how executables are located; the entry is kept only for backward
# compatibility.
sys.path = [git_dir] + sys.path
os.environ["PATH"] = git_dir + os.pathsep + os.environ["PATH"]

# Only import git now, because that's when the path is checked!
import git

# Read the config file to check for data file information:

# Stop early when the configuration table is missing from the current
# working directory; otherwise load it indexed by its 'var' column with
# empty cells replaced by '-'.
if 'config.csv' not in os.listdir(os.getcwd()):
    raise FileNotFoundError('No configuration file "config.csv" found.')
config = pd.read_csv('config.csv', index_col='var').fillna('-')

# Data directory and git repository locations come from the 'path' column.
who_data_dir = config.loc['who_data_dir', 'path']
repo = git.Repo(config.loc['git_repo', 'path'])
upstream_repo = repo.remotes.upstream

# Pull upstream base repo and check for modified files:
# Snapshot taken before the pull, compared with the one taken after it.
lm_frame = cv.get_date_modified(who_data_dir)

# Use the repository's own git command wrapper so the pull runs inside the
# repository working tree. git.Git() expects a working-directory path; the
# original passed the remote object instead, so the pull presumably only
# worked when the current directory happened to be inside the repository.
g = repo.git
g.pull('upstream','master')

# Re-open the repository after the pull.
repo = git.Repo(config.loc['git_repo'].path)
lm_after_git_pull = cv.get_date_modified(who_data_dir)

# Number of entries whose last_modified value differs between the
# snapshot taken before the pull and the one taken after it.
count_modified = sum(
    1
    for idx in lm_frame.index
    if lm_frame.loc[idx].last_modified
    != lm_after_git_pull.loc[idx].last_modified
)

# Keep only the CSV files of the data directory. A filtered list is built
# instead of calling remove() while iterating, which skips the entry that
# follows each removed one and could leave non-CSV names in the list.
who_file_list = [name for name in os.listdir(who_data_dir)
                 if name.endswith('.csv')]


flag = True # flag to indicate update
report = []
if count_modified != 0:
    # Existing files changed upstream: rebuild the raw data file from scratch.
    report.append('{} existing file(s) were updated since last pull'.format(count_modified))
    report.append('generating new database...')
    try:
        df = cv.raw_data_formatter(who_file_list,who_data_dir)
        # The date is taken from the name of the last file in the list.
        new_date = pd.to_datetime(who_file_list[-1].split(sep='.')[0])
        last_update = new_date.strftime('%m-%d-%Y')

        raw_data_path = config.loc['raw_data'].path
        # Store the formatted date string (the same format used for
        # 'formatted_data') with a single .loc[row, column] assignment; the
        # chained form may only modify a temporary copy of the row.
        config.loc['raw_data', 'last_update'] = last_update

        df.to_csv(raw_data_path, index=False)
        config.to_csv('config.csv')

        report.append('new database generated succesfully!')
    except Exception as exc:
        # Nothing was regenerated, so the world data report must not be
        # rebuilt from the stale raw data file.
        flag = False
        print('process aborted. No new database generated.')
        print('reason: {}'.format(exc))
    # Show the collected messages, like the other branch does with print().
    for line in report:
        print(line)
else:
    last_update = pd.to_datetime(config.loc['raw_data'].last_update)
    latest_who_file_date = pd.to_datetime(who_file_list[-1].split(sep='.')[0])

    files_to_update = (latest_who_file_date - last_update).days

    # Generating the list of new files to update the database
    # (a negative gap is treated as "nothing to do"; with the original
    # `!= 0` test it reached the append step with no new file names).
    if files_to_update > 0:
        list_of_new_files = []
        for i in range(1, files_to_update + 1):
            new_date = (last_update + timedelta(days=i)).strftime('%m-%d-%Y')
            list_of_new_files.append(new_date + '.csv')

        # Generating a dataframe with new information:
        df = cv.raw_data_formatter(list_of_new_files,who_data_dir)

        # Appending the new data to existing raw data file and updating
        # the raw data information in the config file (new_date holds the
        # date of the last generated file name):

        raw_data_path = config.loc['raw_data'].path
        config.loc['raw_data', 'last_update'] = new_date

        df.to_csv(raw_data_path, mode='a', index=False, header=None)
        config.to_csv('config.csv')
        print('No existing files were updated')
        print('%d new file(s) found. All files appended into the raw data file'
              % (files_to_update))
    else:
        flag = False
        print('No existing files were updated')
        print('0 new files found. No further action necessary')

# Create the world data report from the raw data if any update in the raw data file:
if flag:
    print('Creating world data file...')
    try:
        df = pd.read_csv(config.loc['raw_data'].path)
        country_report = cv.world_data_formatter(df)
        country_report.to_json(config.loc['formatted_data'].path,orient='records')
        print('World data report created succesfully!')

        # The date is taken from the name of the last file in the list.
        new_date = pd.to_datetime(who_file_list[-1].split(sep='.')[0])
        last_update = new_date.strftime('%m-%d-%Y')

        # Single .loc[row, column] assignment so the value is written into
        # `config` itself rather than into a temporary copy of the row.
        config.loc['formatted_data', 'last_update'] = last_update
        config.to_csv('config.csv')

        # Commit changes to github:
        print('-----------')
        print('list of diff on github repository:')
        print(repo.git.diff(None, name_only=True))
        print('commit to github repository')
        cv.commit_to_repo(repo)
        cv.repo_info(repo)
    except Exception as exc:
        # Still best-effort, but the cause is reported and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print('World data report creation aborted. Please verify the raw data file.')
        print('reason: {}'.format(exc))

No existing files were updated
1 new file(s) found. All files appended into the raw data file
Creating world data file...
World data report created succesfully!
-----------
list of diff on github repository:
Tableau/Covid19.twbx
consolidated_data/country_report.json
consolidated_data/raw_data.csv
python/backup/Covid19.ipynb
python/backup/config.csv
python/backup/github_python_interface.ipynb
python/backup/notebooks/.ipynb_checkpoints/Covid19-checkpoint.ipynb
python/backup/notebooks/.ipynb_checkpoints/github_python_interface-checkpoint.ipynb
python/backup/notebooks/.ipynb_checkpoints/raw_data-checkpoint.ipynb
python/backup/notebooks/Covid19.ipynb
python/backup/notebooks/config.csv
python/backup/notebooks/github_python_interface.ipynb
python/backup/notebooks/pycovid.bat
python/backup/notebooks/pycovid/__pycache__/pycovid.cpython-38.pyc
python/backup/notebooks/pycovid/__pycache__/pycovidfunc.cpython-37.pyc
python/backup/notebooks/pycovid/__pycache__/pycovidfunc.cpython-38.pyc
python/backu

In [5]:
# Keep a reference to the commit that the repository HEAD points to.
headcommit = repo.head.commit

In [1]:
import os
import sys
import pandas as pd
from datetime import datetime, timedelta

# Location of the Git command-line client on this machine.
git_dir = r"C:\Program Files\Git\cmd"
git_bin = os.path.join(git_dir, "git")

# Assign through os.environ so the value is visible to Python code that
# reads os.environ. os.putenv() leaves os.environ untouched, and
# os.environ.putenv was an undocumented attribute that newer Python
# versions no longer provide (AttributeError).
os.environ["GIT_PYTHON_GIT_EXECUTABLE"] = git_bin

# Making sure that it is first in PATH
# NOTE(review): sys.path is Python's module search path and does not affect
# how executables are located; the entry is kept only for backward
# compatibility.
sys.path = [git_dir] + sys.path
os.environ["PATH"] = git_dir + os.pathsep + os.environ["PATH"]

# Only import git now, because that's when the path is checked!
import git

# Read the config file to check for data file information:

# Stop early when the configuration table is missing from the current
# working directory; otherwise load it indexed by its 'var' column with
# empty cells replaced by '-'.
if 'config.csv' not in os.listdir(os.getcwd()):
    raise FileNotFoundError('No configuration file "config.csv" found.')
config = pd.read_csv('config.csv', index_col='var').fillna('-')

# Data directory and git repository locations come from the 'path' column.
who_data_dir = config.loc['who_data_dir', 'path']
repo = git.Repo(config.loc['git_repo', 'path'])
upstream_repo = repo.remotes.upstream

In [39]:
def commit_to_repo(repo, message=None, log=None):
    '''
    This function stages the changes of tracked files, commits them to the
    git repository active branch and pushes to the remote named 'origin'.

    Parameters
    ----------
    repo: obj, gitpython
        gitpython object containing the git repository data
    message: str, optional
        Commit summary. When None, "automated update <YYYY-MM-DD HHhMMm>"
        built from the current local time is used.
    log: list of str, optional
        List that receives the status lines of this call. A new list is
        created when None.

    Returns
    -------
    list of str
        The ``log`` list with the status lines appended. On failure the
        lines report that the commit was not possible, followed by the
        reason; no exception is propagated.
    '''
    import os
    import sys
    from datetime import datetime

    git_dir = r"C:\Program Files\Git\cmd"
    git_bin = os.path.join(git_dir, "git")

    # Assign through os.environ: os.putenv() does not update os.environ and
    # os.environ.putenv is not available on newer Python versions.
    os.environ["GIT_PYTHON_GIT_EXECUTABLE"] = git_bin

    # Making sure that it is first in PATH, without growing PATH/sys.path
    # again on every call.
    if sys.path[:1] != [git_dir]:
        sys.path.insert(0, git_dir)
    current_path = os.environ.get("PATH", "")
    if current_path != git_dir and not current_path.startswith(git_dir + os.pathsep):
        os.environ["PATH"] = os.pathsep.join(
            part for part in (git_dir, current_path) if part)

    # The `repo` object is used directly, so the git module itself does not
    # need to be imported in this function.

    # Creating commit information for repo index:
    if message is not None:
        summary = message
    else:
        summary = "automated update {}".format(
            datetime.now().strftime('%Y-%m-%d %Hh%Mm'))

    if log is None:
        log=[]

    try:
        repo.git.add(update=True)
        repo.index.commit(summary)
        origin = repo.remote(name='origin')
        origin.push()
        log.append('----\n')
        log.append('Commit process succesfull\n')
        log.append('----\n')
    except Exception as exc:
        # Best-effort: report the failure in the log instead of raising,
        # but keep the reason so it can be diagnosed.
        log.append('----\n')
        log.append('Not able to commit. Please check git information\n')
        log.append('Reason: {}\n'.format(exc))
        log.append('----\n')

    return log

In [61]:
def repo_info(repo,log=None):
    '''
    This function returns the information of the git repository. This algorithm
    is a direct adaptation of the one presented at:

    https://www.fullstackpython.com/blog/first-steps-gitpython.html

    Parameters
    ----------
    repo: obj, gitpython
        gitpython object containing the git repository data
    log: list of str, optional
        List that receives the information lines. A new list is created
        when None.

    Returns
    -------
    list of str
        The ``log`` list with the repository information appended. Nothing
        is appended when the repository is bare.
    '''
    import os

    if log is None:
        log=[]

    # check that the repository loaded correctly
    if not repo.bare:
        # Prefer the GIT_REPO_PATH environment variable; fall back to the
        # repository working directory so the message does not read
        # "Repo at None" when the variable is not set.
        repo_path = os.getenv('GIT_REPO_PATH') or repo.git.working_dir

        log.append('Repo at {} successfully loaded.\n'.format(repo_path))
        log.append('Repo local path: {}\n'.format(repo.git.working_dir))
        log.append('Repo description: {}\n'.format(repo.description))
        log.append('Repo active branch: {}\n'.format(repo.active_branch))
        for remote in repo.remotes:
            log.append('Remote named "{}" with URL "{}"\n'.format(remote, remote.url))
        log.append('Last commit for repo: {}.\n'.format(str(repo.head.commit.hexsha)))

        # take the last commit of 'master' then log some information;
        # max_count=1 avoids loading the whole history to read one commit
        for commit in repo.iter_commits('master', max_count=1):
            log.append('----\n')
            log.append('commit: {}\n'.format(str(commit.hexsha)))
            log.append("\"{}\" by {} ({})\n".format(commit.summary,
                                                    commit.author.name,
                                                    commit.author.email))
            log.append(str(commit.authored_datetime)+'\n')
            log.append("count: {} and size: {}\n".format(commit.count(),
                                                         commit.size))

    return log

In [62]:
# Seed the report with two sample lines (each one newline-terminated).
report = ['str1\n', 'str2\n']

In [63]:
# Add the repository details to the existing report: repo_info appends to
# the list passed as `log` and returns that same list.
log=repo_info(repo,log=report)

In [65]:
# Append the collected log lines to 'teste.txt'. The context manager closes
# the file even if writelines() raises.
with open('teste.txt','+a') as f:
    f.writelines(log)