Skip to content

Commit

Permalink
Merge pull request #70 from scrapinghub/plot_with_new_plotly
Browse files Browse the repository at this point in the history
Use new 3.8.0 plotly api, fixes #63
  • Loading branch information
manycoding committed Apr 18, 2019
2 parents 60c8aa8 + 33c02f7 commit e7fa977
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 82 deletions.
10 changes: 9 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,20 @@ Note that the top-most release is changes in the unreleased master branch on Git

[Keep a Changelog](https://keepachangelog.com/en/1.0.0/), [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.4.0.dev] (Work In Progress)
## [0.3.2dev] (Work In Progress)
### Added
- Allow reading private raw schemas directly from bitbucket, #58

### Changed
- Progress widgets are removed before printing graphs
- New plotly v4 API, enabled through `_plotly_future_` (requires `plotly>=3.8.0`)

### Fixed
- Failing `Compare Prices For Same Urls` when url is `nan`, #67
- Empty graphs in Jupyter Notebook, #63

### Removed
- Scraped Items History graphs


## [0.3.1] (2019-04-12)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
install_requires = (
"pandas",
"scrapinghub[msgpack]",
"plotly",
"plotly>=3.8.0",
"genson",
"boto3",
"jsonschema[format]>=3.0.0",
Expand Down
9 changes: 4 additions & 5 deletions src/arche/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
__version__ = "0.3.1"
SH_URL = "https://app.scrapinghub.com/p" # noqa

from _plotly_future_ import v4 # noqa
from arche.arche import Arche
from arche.readers.items import CollectionItems, JobItems
from arche.rules.duplicates import find_by as find_duplicates_by
from arche.tools.schema import basic_json_schema
from IPython.display import display, HTML
from plotly.offline import init_notebook_mode
import plotly.io as pio

pio.renderers.default = "notebook_connected+plotly_mimetype"

__all__ = [
"basic_json_schema",
Expand All @@ -22,6 +24,3 @@
logging.getLogger("botocore").setLevel(logging.CRITICAL)
logging.getLogger("HubstorageClient").setLevel(logging.CRITICAL)
logging.getLogger().setLevel(logging.DEBUG)

init_notebook_mode(connected=True)
display(HTML("<script src='https://cdn.plot.ly/plotly-latest.min.js'></script>"))
4 changes: 3 additions & 1 deletion src/arche/arche.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from arche.rules.others import compare_boolean_fields, garbage_symbols
import arche.rules.price as price_rules
from arche.tools import api, helpers, maintenance, schema
import IPython

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -127,6 +128,7 @@ def basic_json_schema(self, items_numbers: List[int] = None):

def report_all(self):
self.run_all_rules()
IPython.display.clear_output()
self.report.write_summaries()
self.report.write("\n" * 2)
self.report.write_details(short=True)
Expand All @@ -151,7 +153,7 @@ def data_quality_report(self, bucket: Optional[str] = None):
self.schema, items_dicts=self.source_items.dicts, fast=False
)
)

IPython.display.clear_output()
DataQualityReport(self.source_items, self.schema, self.report, bucket)

@lru_cache(maxsize=32)
Expand Down
23 changes: 6 additions & 17 deletions src/arche/data_quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import arche.rules.price as price_rules
from arche.tools import api
from arche.tools.s3 import upload_str_stream
import plotly
import plotly.io as pio


class DataQualityReport:
Expand Down Expand Up @@ -100,21 +100,14 @@ def create_figures(self, items, items_dicts):
self.scraped_fields_coverage(items.job.key, cleaned_df)
self.coverage_by_categories(cleaned_df, tagged_fields)

def plot_to_notebook(self):
def plot_to_notebook(self) -> None:
for fig in self.figures:
plotly.offline.iplot(fig)
pio.show(fig)

def plot_html_to_stream(self):
def plot_html_to_stream(self) -> StringIO:
output = StringIO()
output.write(
'<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>\n'
)
for fig in self.figures:
output.write(
plotly.offline.plot(
fig, include_plotlyjs=False, output_type="div", show_link=False
)
)
output.write(pio.to_html(fig, include_plotlyjs="cdn", full_html=False))
output.write("\n")
output.write(self.appendix)
return output
Expand Down Expand Up @@ -185,12 +178,8 @@ def scraped_fields_coverage(self, job, df):
sfc = graphs.scraped_fields_coverage(job, df)
self.figures.append(sfc)

def scraped_items_history(self, job_no, job_numbers, date_items):
sih = graphs.scraped_items_history(job_no, job_numbers, date_items)
self.figures.append(sih)

def coverage_by_categories(self, df, tagged_fields):
"""Makes tables which show the number of items per category,
"""Make tables which show the number of items per category,
set up with a category tag
Args:
Expand Down
49 changes: 2 additions & 47 deletions src/arche/figures/graphs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import plotly.graph_objs as go


def scraped_fields_coverage(job, df):
def scraped_fields_coverage(job, df) -> go.FigureWidget:
coverage_values = df.count().sort_values(ascending=True).values / len(df) * 100
fields = df.count().sort_values(ascending=True).index

Expand Down Expand Up @@ -40,49 +40,4 @@ def scraped_fields_coverage(job, df):
)

layout["annotations"] = annotations

fig = go.Figure(data=[trace], layout=layout)
return fig


def scraped_items_history(job_no, job_numbers, date_items):
prod_x_data = [date_items[i].keys()[0] for i in range(len(date_items))]
prod_y_data = [date_items[i].values()[0] for i in range(len(date_items))]
bar_colors = []
for job in job_numbers:
if job != job_no:
bar_colors.append("rgb(204,204,204)")
else:
bar_colors.append("rgb(112,194,99)")

trace = go.Bar(
x=prod_x_data,
y=prod_y_data,
text=job_numbers,
marker=dict(color=bar_colors),
name="prod",
)

layout = go.Layout(
title="<b>Scraped Items History</b>",
margin=dict(t=25, b=25, l=25, r=25),
xaxis=dict(
title="Run Date",
showgrid=False,
showline=False,
showticklabels=True,
zeroline=False,
domain=[0.1, 1],
),
yaxis=dict(
title="Number of Scraped Items",
showgrid=False,
showline=False,
showticklabels=True,
zeroline=False,
domain=[0.1, 1],
),
)

fig = go.Figure(data=[trace], layout=layout)
return fig
return go.FigureWidget(data=[trace], layout=layout)
16 changes: 8 additions & 8 deletions src/arche/figures/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import plotly.graph_objs as go


def score_table(quality_estimation, field_accuracy):
def score_table(quality_estimation, field_accuracy) -> go.FigureWidget:
cells = [
["<b>Field Accuracy Score</b>", "<b>Overall Quality Score</b>"],
["<b>" + str(field_accuracy) + "<b>", "<b>" + str(quality_estimation) + "</b>"],
Expand All @@ -23,7 +23,7 @@ def score_table(quality_estimation, field_accuracy):
)

layout = go.Layout(autosize=True, margin=dict(l=0, t=25, b=25, r=0), height=150)
return dict(data=[trace], layout=layout)
return go.FigureWidget(data=[trace], layout=layout)


def get_color(value):
Expand All @@ -34,7 +34,7 @@ def get_color(value):
return "rgb(233,81,51)"


def job_summary_table(job):
def job_summary_table(job) -> go.FigureWidget:
job_url = f"{SH_URL}/{job.key}"
job_state = api.get_job_state(job)
job_close_reason = api.get_job_close_reason(job)
Expand Down Expand Up @@ -120,7 +120,7 @@ def job_summary_table(job):
height=445,
)

return dict(data=[trace], layout=layout)
return go.FigureWidget(data=[trace], layout=layout)


def rules_summary_table(
Expand All @@ -138,7 +138,7 @@ def rules_summary_table(
no_of_checked_price_items,
no_of_price_warns,
**kwargs,
):
) -> go.FigureWidget:
test_name_values = ["Adherence to schema"]
tested_fields_values = ["All scraped fields" for i in range(1)]
test_results_values = [f"{no_of_validation_warnings} warnings"]
Expand Down Expand Up @@ -204,7 +204,7 @@ def rules_summary_table(
margin=dict(t=25, b=25, l=0, r=0),
height=100 + len(df.index) * 25,
)
return go.Figure(data=[trace], layout=layout)
return go.FigureWidget(data=[trace], layout=layout)


def get_rule_status(err_values_number):
Expand All @@ -213,7 +213,7 @@ def get_rule_status(err_values_number):
return "Pass"


def coverage_by_categories(category_field, df, product_url_fields):
def coverage_by_categories(category_field, df, product_url_fields) -> go.FigureWidget:
if category_field not in df.columns:
return None
if df[category_field].notnull().sum() == 0:
Expand Down Expand Up @@ -264,4 +264,4 @@ def coverage_by_categories(category_field, df, product_url_fields):
margin=dict(t=30, b=25, l=0, r=0),
height=(len(category_names) + 2) * 45,
)
return dict(data=[trace], layout=layout)
return go.FigureWidget(data=[trace], layout=layout)
4 changes: 2 additions & 2 deletions src/arche/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from IPython.display import display, HTML
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import plot
import plotly.io as pio


class Report:
Expand Down Expand Up @@ -100,7 +100,7 @@ def plot(stat: Stat) -> None:
layout.annotations = Report.make_annotations(stat)

f = go.FigureWidget(data, layout)
display(HTML(plot(f, include_plotlyjs=False, output_type="div")))
pio.show(f)

@staticmethod
def make_annotations(stat: Stat) -> List[Dict]:
Expand Down

0 comments on commit e7fa977

Please sign in to comment.