# Pré-Requisitos

## PROV - https://pypi.org/project/prov/

In [None]:
!pip install prov

## Sweetviz - https://pypi.org/project/sweetviz/

In [None]:
!pip install sweetviz

# Bibliotecas

In [18]:
import sweetviz as sv
import pandas as pd
import prov
import bs4

# Reprodutibilidade

## Mostra as versões das bibliotecas usadas

In [None]:
import sys

# Report the version of every library imported above, plus the interpreter,
# so a reader can recreate the same environment.
print(f'Versão do Pandas: {pd.__version__}')
print(f'Versão do Beautiful Soup: {bs4.__version__}')
print(f'Versão do PROV: {prov.__version__}')
print(f'Versão do Sweetviz: {sv.__version__}')
print(f'Versão do Python: {sys.version}')

## Sistema Operacional

In [None]:
# Print the operating system identification file. This shells out to `cat`
# on /etc/os-release, so it only works on systems that provide that file
# (Linux distributions).
!cat /etc/os-release

### Versão do kernel

In [None]:
# Print the kernel version string by reading /proc/version (Linux-only path).
!cat /proc/version

### Arquitetura (32 ou 64 bits)

In [None]:
# Print CPU information with the `lscpu` command; its output is what this
# section uses to tell whether the machine is 32-bit or 64-bit.
!lscpu

# Provenance

In [None]:
# `import prov` alone does not load the `prov.model` submodule, so import it
# explicitly here; otherwise this cell fails with AttributeError when the
# notebook is run top to bottom on a fresh kernel.
import prov.model

# Create a new provenance document
doc = prov.model.ProvDocument()

# Declare namespaces
# NOTE(review): the default namespace URI has no trailing '/' or '#', so
# unprefixed identifiers are appended directly to "...FakeNews" - confirm
# this is intended.
doc.set_default_namespace('https://github.com/ppgi-ufrj-data-science/FakeNews')
doc.add_namespace('kagg', 'https://www.kaggle.com/')

# Create the entities: the Kaggle dataset page, the downloaded archive and
# the CSV file.
dataset = doc.entity(
    'kagg:fake-news',
    {
        prov.model.PROV_TYPE: 'kagg:dataset',
        prov.model.PROV_LOCATION: 'https://www.kaggle.com/mrisdal/fake-news',
        'kagg:title': 'Getting Real about Fake News',
        'kagg:resume': 'Text & metadata from fake & biased news sources around the web',
        'kagg:usability': '7.1',
        'kagg:license': 'CC0: Public Domain',
        'kagg:tags': 'arts and entertainment, news, politics, languages',
        'kagg:records': '12.999',
        'kagg:columns': '20',
    },
)
# NOTE(review): `zip` shadows the Python builtin and `csv` shadows the stdlib
# module name for the rest of the notebook. The names are kept unchanged so
# that any later cell referencing them keeps working; consider renaming.
zip = doc.entity(
    'kagg:FakeNewData.zip',
    {prov.model.PROV_TYPE: 'File', 'kagg:size': '19.4 MB'},
)
csv = doc.entity(
    'kagg:fake.csv',
    {prov.model.PROV_TYPE: 'File', 'kagg:size': '54.05 MB'},
)

# Create the agents: the dataset author and the two tools involved in the
# collection.
meg = doc.agent(
    'kagg:Meg Risdal',
    {
        prov.model.PROV_TYPE: 'prov:Person',
        'kagg:position': 'Product Manager',
        prov.model.PROV_LOCATION: 'Los Angeles, California, United States',
        'kagg:user': 'mrisdal',
        'kagg:profile': 'https://www.kaggle.com/mrisdal',
    },
)
bsdet = doc.agent(
    'B.S. Detector',
    {
        prov.model.PROV_TYPE: 'prov:SoftwareAgent',
        'prov:Publisher': 'The Self Agency, LLC',
        'prov:Creator': 'Daniel Sieradski',
        prov.model.PROV_LOCATION: 'https://addons.mozilla.org/en-US/firefox/addon/bsdetector/',
    },
)
api = doc.agent(
    'webhose.io API',
    {
        prov.model.PROV_TYPE: 'prov:SoftwareAgent',
        'prov:Publisher': 'Webhose',
        prov.model.PROV_LOCATION: 'https://webhose.io/',
    },
)

# Create the activity. Start and end times are given in ISO 8601 form
# (25 Oct 2016 to 25 Nov 2016) so the date parser cannot confuse day and
# month, which a DD/MM/YYYY string only avoids when the day is above 12.
scrap = doc.activity(
    'kagg:Scrap Text and Metadata',
    '2016-10-25',
    '2016-11-25',
    {'kagg:source': '244 websites'},
)

# Relationships between the records declared above
doc.wasInfluencedBy(scrap, api)
doc.wasInfluencedBy(scrap, bsdet)
scrap.wasAssociatedWith(meg)
dataset.wasAttributedTo(meg)
dataset.wasGeneratedBy(scrap)
zip.wasDerivedFrom(dataset)
csv.wasDerivedFrom(zip)

# Show the document in PROV-N notation
print(doc.get_provn())

## Gera o gráfico

In [None]:
# Save the provenance graph
dot = prov.dot.prov_to_dot(doc)
dot.write_png('fake-news-dataset-prov.png')
# Visualize the graph file
from IPython.display import Image
Image('fake-news-dataset-prov.png')