In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Import pyjanitor and pandas
from typing import Union
import janitor
import pandas as pd
import pandas_flavor as pf

In [3]:
# Supress user warnings when we try overwriting our custom pandas flavor functions
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Parse every HTML table on the Wikipedia page, then keep the one at
# position 1 (the second table) as the raw franchise data.
fileurl = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_media_franchises'
wiki_tables = pd.read_html(fileurl)
df_raw = wiki_tables[1]
df_raw.head()

Unnamed: 0,Franchise,Year of inception,Total revenue (USD),Revenue breakdown (est.),Original media,Creator(s),Owner(s)
0,Pokémon,1996,est. $90 billion,Licensed merchandise – $61.1 billion[a] Video ...,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...
1,Hello Kitty,1974,est. $80 billion,Merchandise sales – $80 billion[j] Manga magaz...,Cartoon character[37],Yuko Shimizu Shintaro Tsuji,Sanrio
2,Winnie the Pooh,1924,est. $75 billion,Retail sales – $74.515 billion[n] DVD & Blu-ra...,Book[58],A. A. Milne E. H. Shepard,The Walt Disney Company
3,Mickey Mouse & Friends,1928,est. $70 billion,Retail sales – $69.85 billion[o] VHS & DVD – $...,Animated cartoon,Walt Disney Ub Iwerks,The Walt Disney Company
4,Star Wars,1977,est. $65 billion,Merchandise sales – $40.294 billion[r] Box off...,Film,George Lucas,Lucasfilm (The Walt Disney Company)


In [5]:
# Preview pyjanitor's clean_names() on the raw table. The result is only
# displayed here; it is not assigned to a variable.
df_raw.clean_names().head()

Unnamed: 0,franchise,year_of_inception,total_revenue_usd_,revenue_breakdown_est_,original_media,creator_s_,owner_s_
0,Pokémon,1996,est. $90 billion,Licensed merchandise – $61.1 billion[a] Video ...,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...
1,Hello Kitty,1974,est. $80 billion,Merchandise sales – $80 billion[j] Manga magaz...,Cartoon character[37],Yuko Shimizu Shintaro Tsuji,Sanrio
2,Winnie the Pooh,1924,est. $75 billion,Retail sales – $74.515 billion[n] DVD & Blu-ra...,Book[58],A. A. Milne E. H. Shepard,The Walt Disney Company
3,Mickey Mouse & Friends,1928,est. $70 billion,Retail sales – $69.85 billion[o] VHS & DVD – $...,Animated cartoon,Walt Disney Ub Iwerks,The Walt Disney Company
4,Star Wars,1977,est. $65 billion,Merchandise sales – $40.294 billion[r] Box off...,Film,George Lucas,Lucasfilm (The Walt Disney Company)


### Section 1
Step 1.1 rename columns
> ```clean_money <- df %>% 
> set_names(nm = c("franchise", "year_created", "total_revenue", "revenue_items",
>                   "original_media", "creators", "owners")) %>%```

Step 1.2 clean up `total_revenue` column
> ```mutate(total_revenue = str_remove(total_revenue, "est."),
>         total_revenue = str_trim(total_revenue),
>         total_revenue = str_remove(total_revenue, "[$]"),
>         total_revenue = word(total_revenue, 1, 1),
>         total_revenue = as.double(total_revenue))```

In [6]:
# Section 1: Set up pandas_flavor helper functions

@pf.register_dataframe_method
def str_remove(df: pd.DataFrame, column_name: str = '', pattern: str = '') -> pd.DataFrame:
    """Remove every match of a regular expression from a string column.

    Thin wrapper around ``Series.str.replace`` with an empty replacement.
    Note that all matches are removed, whereas stringr's ``str_remove``
    only removes the first one.

    Parameters
    ----------
    df : DataFrame holding the column to clean.
    column_name : label of the string column to process.
    pattern : regular expression whose matches are deleted.

    Returns
    -------
    A new DataFrame with the cleaned column; ``df`` itself is not modified,
    so a cell calling this helper can safely be re-run.
    """
    cleaned = df.copy()
    # regex=True is spelled out because the pandas default changed to
    # regex=False in 2.0, which would turn patterns such as '[$]' into
    # literal text that never matches.
    cleaned[column_name] = df[column_name].str.replace(pattern, '', regex=True)
    return cleaned


@pf.register_dataframe_method
def str_trim(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Strip leading and trailing whitespace from a string column.

    Parameters
    ----------
    df : DataFrame holding the column to clean.
    column_name : label of the string column to process.

    Returns
    -------
    A new DataFrame with the trimmed column; ``df`` itself is not modified,
    so a cell calling this helper can safely be re-run.
    """
    trimmed = df.copy()
    trimmed[column_name] = df[column_name].str.strip()
    return trimmed


@pf.register_dataframe_method
def str_slice(
    df: pd.DataFrame,
    column_name: str = '',
    start: Union[int, None] = None,
    stop: Union[int, None] = None,
) -> pd.DataFrame:
    """
    Keep the ``[start:stop]`` character slice of every value in a string column.

    Parameters
    ----------
    df : DataFrame holding the column to slice.
    column_name : label of the string column to process.
    start : first character position to keep; ``None`` means from the beginning.
    stop : position at which to stop (exclusive); ``None`` means to the end.

    Returns
    -------
    A new DataFrame with the sliced column; ``df`` itself is not modified,
    so a cell calling this helper can safely be re-run.
    """
    sliced = df.copy()
    sliced[column_name] = df[column_name].str[start:stop]
    return sliced

@pf.register_dataframe_method
def as_type(
    df: pd.DataFrame, column_name: str = '', datatype: Union[type, str] = float
) -> pd.DataFrame:
    """Cast one column to another dtype via ``Series.astype``.

    Parameters
    ----------
    df : DataFrame holding the column to convert.
    column_name : label of the column to cast.
    datatype : target dtype, as a Python type or a dtype string.

    Returns
    -------
    A new DataFrame with the converted column; ``df`` itself is not modified,
    so a cell calling this helper can safely be re-run.

    Raises
    ------
    Whatever ``Series.astype`` raises when a value cannot be converted.
    """
    converted = df.copy()
    converted[column_name] = df[column_name].astype(datatype)
    return converted

In [7]:
# Step 1.1: rename columns
# Replicate R behavior:
# set_names(nm = c("franchise", "year_created", "total_revenue", "revenue_items",
#                  "original_media", "creators", "owners"))
colnames = (
    'franchise',
    'year_created',
    'total_revenue',
    'revenue_items',
    'original_media',
    'creators',
    'owners',
)
# Pair each existing column label with its replacement by position.
df_dirty = df_raw.rename(columns=dict(zip(df_raw.columns, colnames)))

In [8]:
# Step 1.2: clean up `total_revenue` column
# Replicate R behavior:
# mutate(total_revenue = str_remove(total_revenue, "est."),
#         total_revenue = str_trim(total_revenue),
#         total_revenue = str_remove(total_revenue, "[$]"),
#         total_revenue = word(total_revenue, 1, 1),
#         total_revenue = as.double(total_revenue))
column_name = 'total_revenue'
df_cleanmoney = (
    # Start from a copy so that df_dirty keeps its original strings and this
    # cell can be re-run without failing on an already-converted column.
    df_dirty.copy()
        .str_remove(column_name=column_name, pattern='est.')
        .str_trim(column_name=column_name)
        .str_remove(column_name=column_name, pattern='[$]')
        # word(total_revenue, 1, 1): keep the first whitespace-delimited token.
        # A fixed two-character slice would truncate values such as
        # "9.5 billion" (-> "9.") or "100 billion" (-> "10").
        .assign(**{column_name: lambda d: d[column_name].str.split().str[0]})
        .as_type(column_name=column_name, datatype=float)
)

In [9]:
# Check output: show the first three rows of the frame with the cleaned
# total_revenue column
display(df_cleanmoney.head(3))

Unnamed: 0,franchise,year_created,total_revenue,revenue_items,original_media,creators,owners
0,Pokémon,1996,90.0,Licensed merchandise – $61.1 billion[a] Video ...,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...
1,Hello Kitty,1974,80.0,Merchandise sales – $80 billion[j] Manga magaz...,Cartoon character[37],Yuko Shimizu Shintaro Tsuji,Sanrio
2,Winnie the Pooh,1924,75.0,Retail sales – $74.515 billion[n] DVD & Blu-ra...,Book[58],A. A. Milne E. H. Shepard,The Walt Disney Company


### Section 2
> ```clean_category <- clean_money %>% 
> separate_rows(revenue_items, sep = "\\[") %>% 
>   filter(str_detect(revenue_items, "illion")) %>% 
>   separate(revenue_items, into = c("revenue_category", "revenue"), sep = "[$]") %>% 
>   mutate(revenue_category = str_remove(revenue_category, " – "),
>          revenue_category = str_remove(revenue_category, regex(".*\\]")),
>          revenue_category = str_remove(revenue_category, "\n"))```

In [10]:
# Section 2: Set up pandas_flavor helper functions
@pf.register_dataframe_method
def separate_rows(df: pd.DataFrame, column_name: str = '', sep: str = '') -> pd.DataFrame:
    """
    Split the delimited cells of one column into one row per item.

    Each value of ``column_name`` is split on ``sep`` (passed unchanged to
    ``Series.str.split``) and the row is repeated once per resulting piece,
    with all other columns copied as-is.

    Parameters
    ----------
    df : DataFrame holding the column to split.
    column_name : label of the string column whose cells contain several items.
    sep : separator handed to ``Series.str.split``.

    Returns
    -------
    A new DataFrame with a fresh 0..n-1 index in which ``column_name`` is the
    last column and holds a single item per row. ``df`` itself is not
    modified (no helper ``id`` column is left behind on it).
    """
    # Drop the column and append the split version so that it ends up as the
    # last column of the result.
    result = df.drop(columns=[column_name])
    result[column_name] = df[column_name].str.split(sep)
    # explode() repeats each row once per list element and works for any
    # column name and any index, unlike a join on positional ids.
    return result.explode(column_name).reset_index(drop=True)

In [32]:
column_name = 'revenue_items'
df_cleancategory = (
    # The regex escapes \[ and \] are written as raw strings: in a plain
    # literal they are invalid escape sequences that newer Python versions
    # warn about. The resulting pattern text is unchanged.
    df_cleanmoney.separate_rows(column_name=column_name, sep=r'\[')
        .filter_string(column_name, 'illion')
        .deconcatenate_column(
            column_name=column_name,
            new_column_names=['revenue_category', 'revenue'], sep='–'
        )
        .str_remove(column_name='revenue_category', pattern=' – ')
        .str_remove(column_name='revenue_category', pattern=r'.*\]')
        .str_remove(column_name='revenue_category', pattern='\n')
)

In [33]:
# Preview the first rows of the frame with one revenue category per row
df_cleancategory.head()

Unnamed: 0,franchise,year_created,total_revenue,original_media,creators,owners,revenue_items,revenue_category,revenue
0,Pokémon,1996,90.0,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...,Licensed merchandise – $61.1 billion,Licensed merchandise,$61.1 billion
1,Pokémon,1996,90.0,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...,a] Video games – $17.138 billion,Video games,$17.138 billion
2,Pokémon,1996,90.0,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...,b] Card game – $10.853 billion,Card game,$10.853 billion
3,Pokémon,1996,90.0,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...,c] Box office – $1.857 billion,Box office,$1.857 billion
4,Pokémon,1996,90.0,Video game,Satoshi Tajiri Ken Sugimori,Nintendo (trademark) The Pokémon Company (Nint...,d] Manga sales – $1.46 billion,Manga sales,$1.46 billion
