# Milestone 3 - Great Expectations

# Instantiate Data Context

In [1]:
# Set up a file-backed Great Expectations data context,
# using the current working directory as the project root.
from great_expectations.data_context import FileDataContext

context = FileDataContext.create(
    project_root_dir='./',
)

# Connect to Data Source

In [2]:
# Name the Datasource; the name must be unique among Datasources.
datasource_name = 'csv-data-feb'
datasource = context.sources.add_pandas(datasource_name)

# Name the data asset and point it at the cleaned CSV file.
asset_name = 'anime-februari'
# Raw string literal: in a normal literal the Windows backslashes form invalid
# escape sequences ("\F", "\M", "\p", "\d", "\P"), which Python warns about and
# plans to reject. The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# relative to the project root so the notebook runs elsewhere.
path_to_data = r'C:\FTDS\MS\p2-ftds012-hck-m3-samueltatsu\dags\P2M3_samuel_surja_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used later to create the validator.
batch_request = asset.build_batch_request()

# Create Expectation Suite

In [3]:
# Add (or update) the expectation suite that will hold our expectations
expectation_suite_name = 'expectation-anime-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Obtain a validator that ties the batch request to that suite
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Sanity check: preview the first rows of the batch
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,anime_id,name,score,genres,english_name,japanese_name,synopsis,type,episodes,aired,...,duration_sec_per_ep,rating,ranked,popularity,members,favorites,watching,completed,on_hold,dropped
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,"In the year 2071, humanity has colonized sever...",TV,26,"Apr 3, 1998 to Apr 24, 1999",...,1440,R - 17+ (violence & profanity),28,39,1251960,61971,105808,718161,71513,26678
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,"other day, another bounty—such is the life of ...",Movie,1,"Sep 1, 2001",...,3660,R - 17+ (violence & profanity),159,518,273145,1174,4143,208333,1935,770
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,"Vash the Stampede is the man with a $$60,000,0...",TV,26,"Apr 1, 1998 to Sep 30, 1998",...,1440,PG-13 - Teens 13 or older,266,201,558913,12944,29113,343492,25465,13925
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),ches are individuals with special powers like ...,TV,26,"Jul 2, 2002 to Dec 24, 2002",...,1500,PG-13 - Teens 13 or older,2481,1467,94683,587,4300,46165,5121,5378
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,It is the dark century and the people are suff...,TV,52,"Sep 30, 2004 to Sep 29, 2005",...,1380,PG - Children,3710,4369,13224,18,642,7314,766,1108


# Expectations

**1. `anime_id` must be unique**

In [15]:
# Every value in `anime_id` is expected to be unique.
validator.expect_column_values_to_be_unique(column='anime_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 14952,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

**2. `score` must be between 0 and 10**

Score or rating is supposed to be between 0.00 and 10.00. Any value not within that range is invalid.

In [16]:
# `score` values are expected to lie within the range 0 to 10.
validator.expect_column_values_to_be_between(
    column='score',
    min_value=0,
    max_value=10,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 14952,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

**3. `type` must be in set ['TV', 'Movie', 'OVA', 'Special', 'ONA', 'Music', 'Unknown']**

Anime are widely categorized into five types: TV, Movie, OVA, Special, and ONA. Music is a newer category that emerged with the rising popularity of the Music genre. We also include 'Unknown' for anime titles that may not have been categorized yet.

In [17]:
# `type` is expected to take only one of the known category labels.
validator.expect_column_values_to_be_in_set(
    column='type',
    value_set=[
        'TV',
        'Movie',
        'OVA',
        'Special',
        'ONA',
        'Music',
        'Unknown',
    ],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 14952,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

**4. `episodes` must be integer type**

Functionally, episodes should have a discrete numerical value.

In [18]:
# `episodes` is expected to be stored with an int64 dtype.
validator.expect_column_values_to_be_in_type_list(
    column='episodes',
    type_list=['int64'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

**5. At least 90% of `japanese_name` values must be unique**

The anime industry is dominated by Japanese studios, producers, and companies, so we can expect most anime titles to have their own original Japanese name, and therefore a high proportion of distinct values in this column.

In [12]:
# The proportion of unique values in `japanese_name` is expected to be between 0.9 and 1.
validator.expect_column_proportion_of_unique_values_to_be_between(
    column='japanese_name',
    min_value=0.9,  # lower bound: 90% unique values
    max_value=1,  # upper bound: every value is unique
)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 0.9481005885500268
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

**6. `synopsis` must be at most 6000 characters**

The maximum number of characters allowed to write a synopsis is 6000 characters. We want to make sure our data is valid and follows that guideline.

In [13]:
# `synopsis` text length is capped at 6000 characters; no lower bound is set.
validator.expect_column_value_lengths_to_be_between(
    column='synopsis',
    min_value=None,  # lower bound left open
    max_value=6000,  # upper bound on synopsis length
)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 14952,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

**7. `studios` column should contain top 5 studios**

Toei Animation, Sunrise, J.C.Staff, Madhouse, and TMS Entertainment: these five studios are the top contributors, with the most anime titles released. We want to make sure our data includes all five, no matter what.

In [14]:
# The distinct values of `studios` are expected to include each of these five studios.
validator.expect_column_distinct_values_to_contain_set(
    column='studios',
    value_set=[
        'Toei Animation',
        'Sunrise',
        'J.C.Staff',
        'Madhouse',
        'TMS Entertainment',
    ],
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "10Gauge",
      "10Gauge, Studio DURIAN",
      "2:10 AM Animation",
      "8bit",
      "A-1 Pictures",
      "A-1 Pictures, Bridge",
      "A-1 Pictures, Bridge, CloverWorks",
      "A-1 Pictures, Lapin Track",
      "A-1 Pictures, Ordet",
      "A-1 Pictures, TROYCA",
      "A-1 Pictures, Trigger, CloverWorks",
      "A-Line",
      "A-Real",
      "A.C.G.T.",
      "ACC Production",
      "AIC",
      "AIC ASTA",
      "AIC Build",
      "AIC Classic",
      "AIC Frontier",
      "AIC PLUS+",
      "AIC Spirits",
      "AIC Spirits, Asread",
      "AIC Spirits, BeSTACK",
      "AIC Spirits, Digital Frontier",
      "AIC Spirits, Group TAC",
      "AIC Takarazuka",
      "AIC, APPP",
      "AIC, Artmic",
      "AIC, Artmic, Darts",
      "AIC, Artmic, animate Film",
      "AIC, Darts",
      "AIC, Magic Bus, Ashi Production",
      "AIC, Remic",
      "AIC, Studio Hakk",
      "AIC, Studio Kyuuma",
      "AIC, animate

### Save into Expectation Suite

In [20]:
# Save the validator's expectations into the suite, keeping any that failed.
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

### Build Data Docs

In [21]:
# Build the Data Docs site; the displayed result maps the site name
# ('local_site') to the URL of its generated index.html.
context.build_data_docs()

{'local_site': 'file://c:\\FTDS\\MS\\p2-ftds012-hck-m3-samueltatsu\\gx\\uncommitted/data_docs/local_site/index.html'}