In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes and edits to local source are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Import pyjanitor and pandas
from typing import Union
import janitor
import pandas as pd
import pandas_flavor as pf

In [3]:
# Supress user warnings when we try overwriting our custom pandas flavor functions
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the raw franchise table from a local CSV snapshot and preview it.
# The commented-out lines are a disabled alternative that reads the table
# straight from Wikipedia with `pd.read_html`; they are kept for reference.
# NOTE(review): `filepath` is an absolute path on the author's machine --
# point it at your own copy of `dirty_media_franchises.csv` before running.
# try:
#     fileurl = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_media_franchises'
#     df_raw = pd.read_html(fileurl)[1]
# except:
filepath = '/Users/shandou/scipy2019_sprints/example_data_stash/dirty_media_franchises.csv'
df_raw = pd.read_csv(filepath)
display(df_raw.head())

Unnamed: 0,Franchise,Year of inception,Total revenue (USD),Revenue breakdown (est.),Original media,Creator(s),Owner(s)
0,Pokémon,1996,est. $90 billion,Licensed merchandise – $61.1 billion[a] Video ...,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...
1,Hello Kitty,1974,est. $80 billion,Merchandise sales – $80 billion[j] Manga magaz...,Cartoon character[37],Yuko Shimizu Shintaro Tsuji,Sanrio
2,Winnie the Pooh,1924,est. $75 billion,Retail sales – $74.515 billion[n] DVD & Blu-ra...,Book[58],A. A. Milne E. H. Shepard,The Walt Disney Company
3,Mickey Mouse & Friends,1928,est. $70 billion,Retail sales – $69.85 billion[o] VHS & DVD – $...,Animated cartoon,Walt Disney Ub Iwerks,The Walt Disney Company
4,Star Wars,1977,est. $65 billion,Merchandise sales – $40.294 billion[r] Box off...,Film,George Lucas,Lucasfilm (The Walt Disney Company)


### Section 1
Step 1.1 rename columns
> ```clean_money <- df %>% 
> set_names(nm = c("franchise", "year_created", "total_revenue", "revenue_items",
>                   "original_media", "creators", "owners")) %>%```

Step 1.2 clean up `total_revenue` column
> ```mutate(total_revenue = str_remove(total_revenue, "est."),
>         total_revenue = str_trim(total_revenue),
>         total_revenue = str_remove(total_revenue, "[$]"),
>         total_revenue = word(total_revenue, 1, 1),
>         total_revenue = as.double(total_revenue))```

In [5]:
# Section 1: Set up pandas_flavor helper functions

@pf.register_dataframe_method
def str_remove(df, column_name: str, pattern: str = '', regex: bool = True):
    """
    Remove every match of `pattern` from a string column.

    Thin wrapper around `Series.str.replace` with an empty replacement.
    Note that, unlike R's `str_remove` (first match only), all matches in
    each cell are removed.

    :param df: DataFrame to operate on; the column is assigned back onto this
        same object.
    :param column_name: Name of the string column to clean.
    :param pattern: Pattern to remove. Interpreted as a regular expression
        unless `regex` is False.
    :param regex: Passed through to `Series.str.replace`. It is set
        explicitly because the pandas default changed from True to False in
        pandas 2.0, which would silently turn patterns such as `\\$` or
        `.*\\]` into literal text.
    :returns: The same DataFrame, to allow method chaining.
    """
    df[column_name] = df[column_name].str.replace(pattern, '', regex=regex)
    return df


@pf.register_dataframe_method
def str_trim(df, column_name: str):
    """
    Strip leading and trailing whitespace from a string column.

    :param df: DataFrame to operate on; the column is assigned back onto this
        same object.
    :param column_name: Name of the string column to trim.
    :returns: The same DataFrame, to allow method chaining.
    """
    trimmed = df[column_name].str.strip()
    df[column_name] = trimmed
    return df


@pf.register_dataframe_method
def str_slice(
    df, column_name: str, start: int = None, stop: int = None
):
    """
    Keep only the characters in positions `start` up to (not including)
    `stop` of every value in a string column.

    Wrapper around `Series.str.slice`.

    :param df: DataFrame to operate on; the column is assigned back onto this
        same object.
    :param column_name: Name of the string column to slice.
    :param start: First character position to keep; None means from the
        beginning.
    :param stop: Position at which to stop; None means through the end.
    :returns: The same DataFrame, to allow method chaining.
    """
    sliced = df[column_name].str.slice(start=start, stop=stop)
    df[column_name] = sliced
    return df

In [6]:
# Step 1.1: rename columns
# Replicate the R behavior:
# set_names(nm = c("franchise", "year_created", "total_revenue", "revenue_items",
#                  "original_media", "creators", "owners"))
colnames = (
    'franchise',
    'year_created',
    'total_revenue',
    'revenue_items',
    'original_media',
    'creators',
    'owners',
)
# Pair the existing column labels with the new names by position
column_mapping = dict(zip(df_raw.columns, colnames))
df_dirty = df_raw.rename(columns=column_mapping)

In [7]:
# Step 1.2: clean up `total_revenue` column
# Generate `df_clean_money` (equivalent to `clean_money` in the R example)
# Replicate R behavior:
# mutate(total_revenue = str_remove(total_revenue, "est."),
#         total_revenue = str_trim(total_revenue),
#         total_revenue = str_remove(total_revenue, "[$]"),
#         total_revenue = word(total_revenue, 1, 1),
#         total_revenue = as.double(total_revenue))
column_name = 'total_revenue'
df_clean_money = (
    # Start from a copy: the string helpers assign into the frame they are
    # given, so without it `df_dirty` would be modified and re-running this
    # cell would fail on the already-converted column.
    df_dirty.copy()
        .str_remove(column_name, pattern='est.')
        .str_trim(column_name)
        .str_remove(column_name, pattern=r'\$')
        # Stand-in for R's `word(total_revenue, 1, 1)`: keep the leading
        # number in full (e.g. "4.98" or "100") instead of slicing a fixed
        # two characters, which would truncate such values.
        .assign(**{
            column_name: lambda frame: frame[column_name].str.extract(
                r'(\d+(?:\.\d+)?)', expand=False
            )
        })
        .change_type(column_name, float)
)

In [8]:
# Check `df_clean_money`: preview the first three rows to inspect the result
# of the `total_revenue` clean-up
display(df_clean_money.head(3))

Unnamed: 0,franchise,year_created,total_revenue,revenue_items,original_media,creators,owners
0,Pokémon,1996,90.0,Licensed merchandise – $61.1 billion[a] Video ...,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...
1,Hello Kitty,1974,80.0,Merchandise sales – $80 billion[j] Manga magaz...,Cartoon character[37],Yuko Shimizu Shintaro Tsuji,Sanrio
2,Winnie the Pooh,1924,75.0,Retail sales – $74.515 billion[n] DVD & Blu-ra...,Book[58],A. A. Milne E. H. Shepard,The Walt Disney Company


### Section 2
>```clean_category <- clean_money %>% 
> separate_rows(revenue_items, sep = "\\[") %>% 
> filter(str_detect(revenue_items, "illion")) %>% 
> separate(revenue_items, into = c("revenue_category", "revenue"), sep = "[$]") %>% 
> mutate(revenue_category = str_remove(revenue_category, " – "),
>        revenue_category = str_remove(revenue_category, regex(".*\\]")),
>        revenue_category = str_remove(revenue_category, "\n"))```

In [19]:
# Section 2: Set up pandas_flavor helper functions
@pf.register_dataframe_method
def separate_rows(df, column_name: str, sep: str = ''):
    """
    Split each cell of `column_name` on `sep` and give every resulting item
    its own row, repeating the values of the other columns.

    :param df: Source DataFrame. It is not modified.
    :param column_name: Name of the string column whose cells hold several
        items separated by `sep`.
    :param sep: Separator handed to `Series.str.split`.
    :returns: A new DataFrame with a fresh 0..n-1 index. The other columns
        keep their order and `column_name` is placed last. Cells that are
        missing (and therefore cannot be split) are kept as a single row.
    """
    split_values = df[column_name].str.split(sep)

    # Record, for every produced item, the position of the row it came from.
    # Working with positions (rather than index labels) keeps the result
    # correct even when the index is not a plain 0..n-1 range.
    row_positions = []
    items = []
    for position, pieces in enumerate(split_values):
        if not isinstance(pieces, list):
            # Missing value: nothing to split, keep it as a single item
            pieces = [pieces]
        row_positions.extend([position] * len(pieces))
        items.extend(pieces)

    result = (
        df.drop(columns=[column_name])
        .iloc[row_positions]
        .reset_index(drop=True)
    )
    result[column_name] = items
    return result

In [20]:
# Generate `df_clean_category` on top of `df_clean_money`
# Regex patterns are written as raw strings so that `\[` and `\]` reach the
# regex engine as-is instead of being parsed as (invalid) string escapes.
column_name = 'revenue_items'
df_clean_category = (
    df_clean_money.separate_rows(column_name, sep=r'\[')
        .filter_string(column_name, 'illion')
        # Split `revenue_items` column into two columns with pyjanitor function
        # `deconcatenate_column`
        .deconcatenate_column(
            column_name=column_name,
            new_column_names=['revenue_category', 'revenue'], sep='$'
        )
        .str_remove(column_name='revenue_category', pattern=' – ')
        .str_remove(column_name='revenue_category', pattern=r'.*\]')
        .str_remove(column_name='revenue_category', pattern='\n')
)

In [21]:
# Check `df_clean_category`: preview the first three rows after splitting
# `revenue_items` into one row per revenue entry
display(df_clean_category.head(3))

Unnamed: 0,franchise,year_created,total_revenue,original_media,creators,owners,revenue_items,revenue_category,revenue
0,Pokémon,1996,90.0,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...,Licensed merchandise – $61.1 billion,Licensed merchandise,61.1 billion
1,Pokémon,1996,90.0,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...,a] Video games – $17.138 billion,Video games,17.138 billion
2,Pokémon,1996,90.0,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...,b] Card game – $10.853 billion,Card game,10.853 billion


### Section 3

> ```
> clean_df <- clean_category %>% 
>  mutate(revenue_category = case_when(
>    str_detect(str_to_lower(revenue_category), "box office") ~ "Box Office",
>    str_detect(str_to_lower(revenue_category), "dvd|blu|vhs|home video|video rentals|video sales|streaming|home entertainment") ~ "Home Video/Entertainment",
>    str_detect(str_to_lower(revenue_category), "video game|computer game|mobile game|console|game|pachinko|pet|card") ~ "Video Games/Games",
>    str_detect(str_to_lower(revenue_category), "comic|manga") ~ "Comic or Manga",
>    str_detect(str_to_lower(revenue_category), "music|soundtrack") ~ "Music",
>    str_detect(str_to_lower(revenue_category), "tv") ~ "TV",
>    str_detect(str_to_lower(revenue_category), "merchandise|licens|mall|stage|retail") ~ "Merchandise, Licensing & Retail",
>    
>    TRUE ~ revenue_category))```

In [22]:
# Section 3: Set up pandas_flavor helper functions
@pf.register_dataframe_method
def fuzzy_match_replace(
    df, column_name: str = '', mapper: dict = None
):
    """
    Replace whole cell values based on regex substring matches.

    For each `pattern -> replacement` pair in `mapper` (in insertion order),
    every cell of `column_name` whose text contains `pattern` is overwritten
    with `replacement`. As with R's `case_when`, the first matching pattern
    wins: patterns are always tested against the values the column held on
    entry, never against a replacement written by an earlier pattern.

    :param df: DataFrame to operate on.
    :param column_name: Name of the string column to recode.
    :param mapper: Dict of regex pattern to replacement value. None or an
        empty dict leaves the frame unchanged.
    :returns: The DataFrame returned by the last `update_where` call (or `df`
        itself when there is nothing to do).
    """
    if not mapper:
        return df

    # Snapshot the column so later patterns never see earlier replacements
    original_values = df[column_name].copy()
    unmatched = pd.Series(True, index=df.index)
    for pattern, replacement in mapper.items():
        # na=False: missing cells never match, keeping the mask strictly boolean
        condition = original_values.str.contains(pattern, na=False) & unmatched
        # [janitor] update_where: update value when condition is True
        df = df.update_where(condition, column_name, replacement)
        unmatched &= ~condition
    return df

In [23]:
# Set up value mapper for revenue category aggregation
# Keys are regex alternations matched against the lower-cased category text;
# values are the aggregated category labels. Order matters: it mirrors the
# `case_when` branches of the R example, where earlier branches take priority.
value_mapper = {
    'box office': 'Box Office',
    'dvd|blu|vhs|home video|video rentals|video sales|streaming|home entertainment': 'Home Video/Entertainment',
    'video game|computer game|mobile game|console|game|pachinko|pet|card': 'Video Games/Games',
    'comic|manga': 'Comic or Manga',
    'music|soundtrack': 'Music',
    'tv': 'TV',
    'merchandise|licens|mall|stage|retail': 'Merchandise, Licensing & Retail',
}

In [25]:
column_name = 'revenue_category'
# [janitor] convert to lower case so the lower-case patterns in `value_mapper` can match
category_lowered = df_clean_category.transform_column(column_name, str.lower)
# [janitor] strip leading/trailing white space
category_stripped = category_lowered.transform_column(column_name, str.strip)
# <pandas_flavor> collapse the raw categories into the aggregated labels
df_clean_category = category_stripped.fuzzy_match_replace(
    column_name, mapper=value_mapper
)

### Section 4
Further clean up of the `revenue` column
>```%>% 
>  mutate(revenue = str_remove(revenue, "illion"),
>         revenue = str_trim(revenue),
>         revenue = str_remove(revenue, " "),
>         revenue = case_when(str_detect(revenue, "m") ~ paste0(str_extract(revenue, "[:digit:]+"), "e-3"),
>                             str_detect(revenue, "b") ~ str_extract(revenue, "[:digit:]+"),
>                             TRUE ~ NA_character_),
>         revenue = format(revenue, scientific = FALSE),
>         revenue = parse_number(revenue)) %>%
>  mutate(original_media = str_remove(original_media, "\\[.+"))
> ```

In [15]:
@pf.register_dataframe_method
def str_replace(
    df, column_name: str = '', old: str = '', new: str = '',
    regex: bool = True
):
    """
    Replace every match of `old` with `new` in a string column.

    Thin wrapper around `Series.str.replace`.

    :param df: DataFrame to operate on; the column is assigned back onto this
        same object.
    :param column_name: Name of the string column to edit.
    :param old: Pattern to look for. Interpreted as a regular expression
        unless `regex` is False.
    :param new: Replacement text.
    :param regex: Passed through to `Series.str.replace`. It is set
        explicitly because the pandas default changed from True to False in
        pandas 2.0, which would silently turn patterns such as `\\s*b` into
        literal text.
    :returns: The same DataFrame, to allow method chaining.
    """
    df[column_name] = df[column_name].str.replace(old, new, regex=regex)
    return df

@pf.register_dataframe_method
def parse_number(df):
    """
    Convert every column that can be parsed as numbers to a numeric dtype.

    Each column is passed to `pd.to_numeric`; a column for which the
    conversion fails is deliberately left untouched (best-effort parsing).

    :param df: DataFrame to operate on; converted columns are assigned back
        onto this same object.
    :returns: The same DataFrame, to allow method chaining.
    """
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # ValueError: unparseable text; TypeError: non-scalar objects
            # (e.g. lists). Either way the column is not numeric -- skip it.
            continue
    return df

In [27]:
# clean up revenue values
# "61.1 billion" -> "61.1" and "325 million" -> "325e-3", so that after
# `parse_number` every revenue is expressed in billions.
# Regex patterns are raw strings so `\s` and `\[` are not parsed as string
# escapes.
column_name = 'revenue'
df_clean = (
    # Work on a copy: the string helpers assign into the frame they receive,
    # so without it `df_clean` would be the very same object as
    # `df_clean_category` and this cell would rewrite the previous stage.
    df_clean_category.copy()
        .str_remove(column_name, 'illion')
        .str_trim(column_name)
        .str_remove(column_name, ' ')
        .str_replace(column_name, r'\s*b', '')
        .str_replace(column_name, r'\s*m', 'e-3')
        .parse_number()
        .str_remove('original_media', r'\[.+')
)

In [30]:
# Yet to do further aggregation...