In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
juvenile_df = pd.read_csv('../Data/juvenile.csv', dtype = {5: str, 7: str}, header = None)

## Rename Columns

In [3]:
juvenile_df.columns = ['UsageClass', 'CheckoutType', 'MaterialType', 'CheckoutYear', 'CheckoutMonth', 'Checkouts', 'Title', 'ISBN', 'Creator', 'Subjects', 'Publisher', 'PublicationYear']

## Adjust data types and minor formatting changes

In [4]:
juvenile_df['MaterialType'] = juvenile_df.MaterialType.str.title()

In [5]:
juvenile_df['Checkouts'] = juvenile_df.Checkouts.str.replace(',', '').astype(int)

In [6]:
juvenile_df['CheckoutDate'] = juvenile_df['CheckoutYear'].astype(str) + '-' + juvenile_df['CheckoutMonth'].astype(str) + '-01'

In [7]:
juvenile_df = juvenile_df.drop(['CheckoutType', 'CheckoutYear', 'CheckoutMonth'], axis = 1)

## Clean & standardize titles

#### Split title into 'Title' and 'TitleNotes'
* Delimiters: '` / `' & '` [by] `'

In [8]:
titleSplit1 = juvenile_df.Title.str.split(' / ', expand = True, n = 1).rename(columns = {0: 'Title1', 1: 'TitleNotes1'})

In [9]:
titleSplit2 = juvenile_df.Title.str.split(' / ', expand = True, n = 1)[0].str.split(' [by] ', regex = False, expand = True, n = 1).rename(columns = {0: 'Title2', 1: 'TitleNotes2'})

In [10]:
clean_titles = pd.merge(titleSplit1, titleSplit2, left_index = True, right_index = True)

In [11]:
clean_titles['TitleNotes'] = clean_titles['TitleNotes1'].astype(str) + clean_titles['TitleNotes2'].astype(str).replace('None', '')

In [12]:
juvenile_df[['Title', 'TitleNotes']] = clean_titles[['Title2', 'TitleNotes']]

#### Capitalize titles
* Capitalize first letter of each word in a title
  * NOT just title case, which capitalizes the first letter after apostrophes
    * `[' '.join([w.capitalize() for w in x]) for x in juvenile_df.Title.str.split(' ')]` could handle that, but didn't do the following
  * Make sure first character after a ( is capitalized
  * Keep all-caps words in all-caps

In [13]:
titles = []
for title in juvenile_df.Title:
    prev_c = ''
    cap_title = []
    for c in title:
        if prev_c in [' ', '[', '(']:
            cap_title.append(c.upper())
        else:
            cap_title.append(c)
        prev_c = c
    cap_title = ''.join(cap_title)
    titles.append(cap_title)

In [14]:
juvenile_df['Title'] = titles

#### Remove extra comments
* `(Unabridged)`, `[Sound Recording]`, `[Videorecording]`, and `[Text (Large Print)]`  
Unabridged is assumed unless otherwise stated, and the format notes are captured by MaterialType

In [28]:
juvenile_df['Title'] = juvenile_df['Title'].str.replace('(Unabridged)', '').str.replace('[Sound Recording]', '').str.replace('[Videorecording]', '').str.replace('[Text (Large Print)]', '').str.strip().str.strip(':')

In [29]:
juvenile_df.loc[(juvenile_df.CreatorName == 'J. K. Rowling') & (juvenile_df.Title.str.contains('Stone')), 'Title'].unique()

array(["Harry Potter And The Sorcerer's Stone",
       "Haeri Pʻotʻŏ Wa Mabŏpsa Ŭi Tol = Harry Potter And The Philosopher's Stone",
       "Ha Li Bo Te : Shen Mi De Mo Fa Shi = Harry Potter And The Philosopher's Stone",
       "Christmas At Hogwarts : From Harry Potter And The Sorcerer's Stone",
       "Hali Bote Yu Mo Fa Shi = Harry Potter And The Philosopher's Stone",
       "Harry Potter And The Sorcerer's Stone: Harry Potter Series, Book 1",
       "해리 포터와 마법사의 돌 (Harry Potter And The Philosopher's Stone): Harry Potter Series, Book 1"],
      dtype=object)

In [32]:
juvenile_df.loc[juvenile_df.Title == "Harry Potter And The Sorcerer's Stone: Harry Potter Series, Book 1"].sort_values('CheckoutDate')

Unnamed: 0,UsageClass,MaterialType,Checkouts,Title,ISBN,Creator,Subjects,Publisher,PublicationYear,CheckoutDate,TitleNotes,CreatorName
2540289,Digital,Ebook,16,Harry Potter And The Sorcerer's Stone: Harry P...,,J.K. Rowling,"Juvenile Fiction, Juvenile Literature",Pottermore,2012,2012-10-01,,J.K. Rowling
2537486,Digital,Audiobook,20,Harry Potter And The Sorcerer's Stone: Harry P...,,J.K. Rowling,"Fantasy, Juvenile Fiction, Juvenile Literature...",Pottermore,2012,2012-10-01,,J.K. Rowling
2575995,Digital,Ebook,15,Harry Potter And The Sorcerer's Stone: Harry P...,,J.K. Rowling,"Juvenile Fiction, Juvenile Literature",Pottermore,2012,2012-11-01,,J.K. Rowling
2573174,Digital,Audiobook,20,Harry Potter And The Sorcerer's Stone: Harry P...,,J.K. Rowling,"Fantasy, Juvenile Fiction, Juvenile Literature...",Pottermore,2012,2012-11-01,,J.K. Rowling
2594064,Digital,Ebook,11,Harry Potter And The Sorcerer's Stone: Harry P...,,J.K. Rowling,"Juvenile Fiction, Juvenile Literature",Pottermore,2012,2012-12-01,,J.K. Rowling
...,...,...,...,...,...,...,...,...,...,...,...,...
6164379,Digital,Audiobook,68,Harry Potter And The Sorcerer's Stone: Harry P...,9781781102633,J. K. Rowling,"Juvenile Fiction, Juvenile Literature",Pottermore,2012,2022-9-01,,J. K. Rowling
6285384,Digital,Ebook,16,Harry Potter And The Sorcerer's Stone: Harry P...,9781781100486,J. K. Rowling,Juvenile Fiction,Pottermore,2012,2023-1-01,,J. K. Rowling
6311512,Digital,Audiobook,79,Harry Potter And The Sorcerer's Stone: Harry P...,9781781102633,J. K. Rowling,"Juvenile Fiction, Juvenile Literature",Pottermore,2012,2023-1-01,,J. K. Rowling
6342231,Digital,Ebook,11,Harry Potter And The Sorcerer's Stone: Harry P...,9781781100486,J. K. Rowling,Juvenile Fiction,Pottermore,2012,2023-2-01,,J. K. Rowling


In [35]:
juvenile_df.loc[(juvenile_df.Title == "Harry Potter And The Sorcerer's Stone") & (juvenile_df.UsageClass == 'Digital')].sort_values('CheckoutDate')

Unnamed: 0,UsageClass,MaterialType,Checkouts,Title,ISBN,Creator,Subjects,Publisher,PublicationYear,CheckoutDate,TitleNotes,CreatorName
147,Digital,Ebook,21,Harry Potter And The Sorcerer's Stone,9781781100486,J. K. Rowling,Juvenile Fiction,Pottermore,2012,2023-10-01,,J. K. Rowling
30715,Digital,Audiobook,96,Harry Potter And The Sorcerer's Stone,9781781102633,J. K. Rowling,"Juvenile Fiction, Juvenile Literature",Pottermore,2012,2023-10-01,,J. K. Rowling
52160,Digital,Ebook,20,Harry Potter And The Sorcerer's Stone,9781781100486,J. K. Rowling,Juvenile Fiction,Pottermore,2012,2023-11-01,,J. K. Rowling
73469,Digital,Audiobook,91,Harry Potter And The Sorcerer's Stone,9781781102633,J. K. Rowling,"Juvenile Fiction, Juvenile Literature",Pottermore,2012,2023-11-01,,J. K. Rowling
311075,Digital,Ebook,12,Harry Potter And The Sorcerer's Stone,9781781100486,J. K. Rowling,Juvenile Fiction,Pottermore,2012,2023-12-01,,J. K. Rowling
...,...,...,...,...,...,...,...,...,...,...,...,...
1001169,Digital,Ebook,21,Harry Potter And The Sorcerer's Stone,9781781100486,J. K. Rowling,Juvenile Fiction,Pottermore Publishing,2012,2025-7-01,,J. K. Rowling
1128312,Digital,Audiobook,98,Harry Potter And The Sorcerer's Stone,9781781102633,J. K. Rowling,Juvenile Fiction,Pottermore Publishing,2012,2025-8-01,,J. K. Rowling
1135628,Digital,Ebook,24,Harry Potter And The Sorcerer's Stone,9781781100486,J. K. Rowling,Juvenile Fiction,Pottermore Publishing,2012.0,2025-8-01,,J. K. Rowling
1079789,Digital,Audiobook,113,Harry Potter And The Sorcerer's Stone,9781781102633,J. K. Rowling,Juvenile Fiction,Pottermore Publishing,2012,2025-9-01,,J. K. Rowling


## Extract Creator names in `Firstname Lastname` format

In [22]:
juvenile_df['Creator'] = juvenile_df.Creator.str.strip(',')

In [23]:
lastnames = [re.search(r'^(.+?),', c).group(1) if str(c).find(',') != -1 else c for c in juvenile_df.Creator]

In [24]:
firstnames = [re.search(r'^(.+?),([A-Za-zÀ-ÿ\-\'\s.]+)', c).group(2) if str(c).find(',') != -1 else '' for c in juvenile_df.Creator]

In [56]:
fullnames = []
i = 0

for name in lastnames:
    if isinstance(name, float) or isinstance(firstnames[i], float):
        fullname = np.nan
    else:
        fullname = (str(firstnames[i]).strip() + ' ' + str(name).strip()).strip()
    fullnames.append(fullname)
    i += 1

In [57]:
juvenile_df['CreatorName'] = fullnames

#### Replace null CreatorNames  
These appear to be mostly compilations of various kinds, or things otherwise without a single creator.  Extracting editor/contributor names from the TitleNotes would be inconsistent, so I'll substitute the publisher to get a sense of similar origins.

In [59]:
juvenile_df['CreatorName'] = juvenile_df['CreatorName'].fillna(juvenile_df['Publisher'].str.extract(r'(^[\w\s]*)')[0])

## Extract dates 

#### Replace placeholders (`-`) with 0s, followed by `?` in incomplete years

In [None]:
incomplete_dates = juvenile_df.loc[(~juvenile_df.PublicationYear.isna()) & (juvenile_df.PublicationYear.str.contains('-')), 'PublicationYear'].str.extract(r'[\b\D](\d{2,3}\-{1,2})')

In [None]:
incomplete_dates = incomplete_dates.loc[~incomplete_dates[0].isna()]

In [None]:
incomplete_dates['new'] = incomplete_dates[0].str.replace('-', '0') + '?'

In [None]:
for i, r in incomplete_dates.iterrows():
    juvenile_df.loc[i, 'PublicationYear'] = juvenile_df.loc[i, 'PublicationYear'].replace(r[0], r['new']).replace('??', '?')

### Extract different types of PublicationYear values  

Examples of formatting varieties (from dataset FAQ): 
* `2005` — publication date
* `c. 2005` – copyright symbol
* `[2005]` – Printing date
* `p. 2005` – phonogram copyright symbol
* `2004, c. 2005` – publication and copyright date
* `2005-2007` – intervening years
* `[2005?]` – Approximate date

#### Copyright Dates
* Variants include: c. YYYY, C. YYYY, ©YYYY, © YYYY, cop. YYYY, c&p YYYY and similar
* Multiple dates may be listed, potentially in any order

In [None]:
juvenile_df['CopyrightList'] = juvenile_df.PublicationYear.str.findall(r'[©cC](?:op)?(?:&p)?\.?\s?(\d{4})')

In [None]:
juvenile_df['CopyrightList'] = juvenile_df.CopyrightList.fillna('').apply(list)

In [None]:
juvenile_df['CopyrightDate'] = [int(max(x)) if len(x) > 0 else np.nan for x in juvenile_df['CopyrightList']]

#### Phonogram Copyright Dates
* Variants include: p. YYYY, pYYYY, P.YYYY, c&p YYYY and similar
* Multiple dates could be listed, potentially in any order

In [None]:
juvenile_df['PCopyrightList'] = juvenile_df.PublicationYear.str.findall(r'\b(?:c&)?[pP]\.?\s?(\d{4})')

In [None]:
juvenile_df['PCopyrightList'] = juvenile_df.PCopyrightList.fillna('').apply(list)

In [None]:
juvenile_df['PCopyrightDate'] = [int(max(x)) if len(x) > 0 else np.nan for x in juvenile_df['PCopyrightList']]

#### Printing Dates
* Printing dates will always be enclosed in square brackets
* I do NOT want approximate dates (which are followed by question marks) or copyright dates

In [None]:
juvenile_df['PrintingDate'] = juvenile_df.PublicationYear.str.extract(r'\[.*(?<![©cCpP])[\.\s&opP]{0,5}(\d{4})(?!\?).*\]')

#### Approximate Dates
* Enclosed in brackets and followed by a question mark
* There may be a range of dates followed by a question mark `YYYY - YYYY?`  in which case I want to return the earliest (first) year of the range

In [None]:
juvenile_df['ApproxDate'] = juvenile_df.PublicationYear.str.extract(r'\[.*?(\d{4}).*\?\]')

#### Publication Dates
* Publication dates are noted by a lack of previous signifiers - they are NOT in brackets and NOT preceded by a copyright signifier

In [None]:
juvenile_df['PubDate'] = juvenile_df.PublicationYear.str.extract(r'(?<![©cCpP\[])[\.\s&opP]{0,5}(\d{4})(?!\s?[\?\]-])')

#### Other Dates
* Pull out any years that manage to avoid meeting any previous criteria

In [None]:
juvenile_df['OtherDateList'] = np.nan

In [None]:
juvenile_df['OtherDateList'] = np.where(juvenile_df.PubDate.isna() & juvenile_df.PrintingDate.isna() & juvenile_df.CopyrightDate.isna() & juvenile_df.PCopyrightDate.isna() & juvenile_df.ApproxDate.isna(), 
                                     juvenile_df.PublicationYear.str.findall(r'(\d{4})'),
                                     np.nan)

In [None]:
juvenile_df['OtherDateList'] = juvenile_df.OtherDateList.fillna('').apply(list)

In [None]:
juvenile_df['OtherDate'] = [int(max(x)) if len(x) > 0 else np.nan for x in juvenile_df['OtherDateList']]

### Consolidate to 'CreatedDate' for use in analysis

In [None]:
juvenile_df['CreatedDate'] = juvenile_df[['PubDate', 'PrintingDate', 'CopyrightDate', 'PCopyrightDate', 'ApproxDate', 'OtherDate']].bfill(axis = 1)['PubDate']

## Export cleaned data

In [None]:
juvenile_df.to_csv('../Data/juvenile_clean.csv')