# Little helpers for EDA

In [None]:
import numpy as np
import json

%matplotlib inline

# missingno
> Messy datasets? Missing values? missingno provides a small toolset of flexible and easy-to-use missing data visualizations and utilities that allows you to get a quick visual summary of the completeness (or lack thereof) of your dataset.
> Just pip install missingno to get started.
> -- https://github.com/ResidentMario/missingno

In [None]:
from quilt.data.ResidentMario import missingno_data


# Load the NYC collision-factors example dataset and turn the literal string
# "nan" into a real missing value so it is counted as missing downstream.
collisions = missingno_data.nyc_collision_factors().replace("nan", np.nan)

In [None]:
import missingno as msno


# Draw the nullity matrix for a 250-row sample of the collisions data.
# random_state pins the sample so Restart & Run All reproduces the same figure;
# the trailing semicolon suppresses the text repr of the call's return value.
msno.matrix(collisions.sample(250, random_state=42));

# pandas_profiling
> Generates profile reports from a pandas DataFrame. The pandas df.describe() function is great but a little basic for serious exploratory data analysis.
> -- https://github.com/pandas-profiling/pandas-profiling

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport

In [None]:
# Load the first version of the dataset into a plain pandas DataFrame.
# The relative path is resolved against the kernel's working directory.
df = pd.read_csv("data_v1.csv")

In [None]:
# Build a pandas_profiling report object for the frame loaded above.
report = ProfileReport(df)

In [None]:
# Export the profiling report to report_v1.html (relative to the working directory).
report.to_file("report_v1.html")

# great_expectations
> Great Expectations is a framework that helps teams save time and promote analytic integrity with a new twist on automated testing: pipeline tests. Pipeline tests are applied to data (instead of code) and at batch time (instead of compile or deploy time).
> -- https://github.com/great-expectations/great_expectations

In [None]:
import great_expectations as ge

In [None]:
# Wrap the pandas frame in a Great Expectations dataset. The result still
# answers DataFrame calls such as .head() and additionally exposes the
# expect_* methods used in the following cells.
# NOTE(review): this rebinds `df` to the wrapped object, so the plain frame is
# no longer reachable under this name after the cell runs.
df = ge.from_pandas(df)

# Show the class of the wrapped object.
type(df)

In [None]:
# Preview the first rows of the wrapped dataset.
df.head()

In [None]:
# Distinct values found in the "Name" column; these feed the set-membership
# expectation declared next.
df["Name"].unique()

In [None]:
# Declare that every value in the "Name" column must be one of the three
# listed labels. A set literal replaces the redundant set([...]) constructor
# call; the value passed to the expectation is the same set.
df.expect_column_values_to_be_in_set(
    "Name",
    {'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'},
)

In [None]:
# Declare that the minimum of the "PetalWidth" column must be at least 0.1.
# Only the lower bound is passed; no max_value is given.
df.expect_column_min_to_be_between(
    "PetalWidth",
    min_value=0.1,
)

In [None]:
# there are many, many expectations!
# and you can write your own!
# List the available expectation methods programmatically. A truncated
# attribute access such as `df.expec` (a tab-completion prompt) raises
# AttributeError and stops Restart & Run All at this cell.
[name for name in dir(df) if name.startswith("expect_")]

In [None]:
# Check the dataset against the expectations declared on it in the cells above
# and display the validation result.
df.validate()

In [None]:
# Persist the declared expectations to expectations.json; the file is read
# back with json.load in the "Load and validate new data" section.
df.save_expectations_config("expectations.json")

## Load and validate new data

In [None]:
# Reload the expectation suite that was saved to disk earlier.
with open("expectations.json") as f:
    expectations = json.loads(f.read())

# Read the CSV directly into a Great Expectations dataset with the reloaded
# suite attached, then show only the expectations that fail.
df_v2 = ge.read_csv("data_v1.csv", expectations_config=expectations)
df_v2.validate(only_return_failures=True)

In [None]:
# Alternative entry point: validate a plain pandas DataFrame against the
# reloaded expectations without wrapping it first. Only failures are returned.
ge.validate(
    pd.read_csv("data_v1.csv"),
    expectations_config=expectations,
    only_return_failures=True,
)

## Load and validate "unexpected data"

In [None]:
# Reload the CSV as a plain pandas DataFrame (this rebinds `df`, which held the
# Great Expectations dataset until now) and overwrite PetalWidth at index
# label 0 with 0. That value is below the 0.1 lower bound declared for the
# column minimum, so it simulates data that breaks an expectation.
df = pd.read_csv("data_v1.csv")
df.loc[0, "PetalWidth"] = 0

In [None]:
# Validate the modified frame against the saved expectations and show only the
# failing ones.
ge.validate(
    df,
    expectations_config=expectations,
    only_return_failures=True,
)