In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
## Load the raw export. The file has no header row, so columns are addressed by
## position here: columns 5 and 7 are forced to strings.
juvenile_df = pd.read_csv(
    '../Data/juvenile.csv',
    header = None,
    dtype = {5: str, 7: str},
)

## Rename Columns

In [3]:
## Name the twelve positional columns, left to right.
juvenile_df.columns = [
    'UsageClass',
    'CheckoutType',
    'MaterialType',
    'CheckoutYear',
    'CheckoutMonth',
    'Checkouts',
    'Title',
    'ISBN',
    'Creator',
    'Subjects',
    'Publisher',
    'PublicationYear',
]

## Adjust capitalization and data types

In [4]:
## Put MaterialType into title case (e.g. `Book`, `Audiobook`).
juvenile_df['MaterialType'] = juvenile_df['MaterialType'].str.title()

In [5]:
## Split each title at its first ' / ': the text before it stays in Title,
## the text after it goes to TitleNotes (missing when there is no separator).
juvenile_df[['Title', 'TitleNotes']] = (
    juvenile_df['Title']
    .str.split(' / ', n = 1, expand = True)
)

In [6]:
## Capitalise every space-separated word of the title: first character upper-cased,
## the rest of the word lower-cased. `str.capitalize` is applied word by word rather
## than using `str.title`, which would also upper-case the letter after an apostrophe.
## `na_action = 'ignore'` leaves missing titles missing; iterating over a missing
## title in a list comprehension raised a TypeError.
juvenile_df['Title'] = juvenile_df['Title'].map(
    lambda title: ' '.join(word.capitalize() for word in title.split(' ')),
    na_action = 'ignore',
)

## Improvements To Do: capitalize the first character after a `(`, use title case unless there's an apostrophe, keep all-caps items in all caps?

In [7]:
## Checkouts was read as text: remove any commas (presumably thousands
## separators - TODO confirm), then convert to integers.
juvenile_df['Checkouts'] = (
    juvenile_df['Checkouts']
    .str.replace(',', '')
    .astype(int)
)

## Extract Creator names in `Firstname Lastname` format

In [8]:
## TODO: look into the rows with a missing Creator - juvenile_df.loc[juvenile_df.Creator.isna()]

In [9]:
## Remove any leading or trailing commas from the Creator strings.
juvenile_df['Creator'] = juvenile_df['Creator'].str.strip(',')

In [10]:
## Last name = the text before the first comma. Creators whose string form has
## no comma (including missing values) are passed through unchanged.
lastnames = [
    creator if ',' not in str(creator) else re.search(r'^(.+?),', creator).group(1)
    for creator in juvenile_df['Creator']
]

In [11]:
## First name = the run of letters, accented letters, hyphens, apostrophes,
## whitespace and periods that directly follows the first comma. Creators without
## a comma get an empty string.
## NOTE(review): a creator whose first comma is followed by any other character
## makes re.search return None, so .group(2) would raise AttributeError.
firstnames = [
    '' if ',' not in str(creator)
    else re.search(r'^(.+?),([A-Za-zÀ-ÿ-\'\s.]+)', creator).group(2)
    for creator in juvenile_df['Creator']
]

In [12]:
## Combine the parallel `firstnames` / `lastnames` lists into `Firstname Lastname`
## strings, one per row and in the same order.
## A missing Creator arrives here as NaN in `lastnames`; it is kept as NaN instead of
## being passed through str(), which used to turn it into the literal text 'nan'.
## Creators with no first name (no comma in the source value) come out as the
## stripped original string.
fullnames = [
    np.nan if pd.isna(last) else (str(first).strip() + ' ' + str(last).strip()).strip()
    for first, last in zip(firstnames, lastnames)
]

In [13]:
## Store the assembled names as a new column. `fullnames` is a plain list, so it is
## assigned by position and must hold one entry per row, in row order.
juvenile_df['CreatorName'] = fullnames

## Extract dates

#### In incomplete years, replace the `-` placeholders with `0`s and append a `?`

In [159]:
## Find partially known years such as `19--` or `199-`: two or three digits followed
## by one or two `-` placeholders, not directly preceded by another digit.
## The lookbehind `(?<!\d)` replaces `[\b\D]`: inside a character class `\b` means a
## backspace character, not a word boundary, so that pattern needed a real non-digit
## character in front and missed values that begin with the incomplete year.
## `na = False` makes rows without a PublicationYear evaluate to False in the mask.
## The result is a one-column frame (column 0), with NaN where nothing matched.
incomplete_dates = (
    juvenile_df.loc[
        juvenile_df['PublicationYear'].str.contains('-', regex = False, na = False),
        'PublicationYear',
    ]
    .str.extract(r'(?<!\d)(\d{2,3}-{1,2})')
)

In [161]:
## Keep only the rows where the incomplete-year pattern was actually found.
incomplete_dates = incomplete_dates[incomplete_dates[0].notna()]

In [168]:
## Build the replacement text: every `-` becomes `0` and a `?` is appended
## (e.g. `199-` -> `1990?`).
incomplete_dates['new'] = (
    incomplete_dates[0].str.replace('-', '0')
    + '?'
)

In [172]:
## Write the repaired years back into PublicationYear, row by row: swap the matched
## fragment for its replacement, then collapse a doubled `??` (which appears when the
## original value already had a `?`) into a single `?`.
for row_label, partial, repaired in zip(incomplete_dates.index, incomplete_dates[0], incomplete_dates['new']):
    current_value = juvenile_df.loc[row_label, 'PublicationYear']
    juvenile_df.loc[row_label, 'PublicationYear'] = current_value.replace(partial, repaired).replace('??', '?')

#### Extract different types of PublicationYear values  

Examples of formatting varieties (from dataset FAQ): 
* `2005` — publication date
* `c. 2005` – copyright symbol
* `[2005]` – Printing date
* `p. 2005` – phonogram copyright symbol
* `2004, c. 2005` – publication and copyright date
* `2005-2007` – intervening years
* `[2005?]` – Approximate date

In [183]:
## copyright dates: every four-digit year that follows `©`, `c` or `C`
## (optionally `&p`, a period and one whitespace character in between).
## Each row gets a list of matches - empty when there are none.
## TO DO - decide whether to use highest or lowest value when multiple are available
juvenile_df['CopyrightDate'] = (
    juvenile_df['PublicationYear']
    .str.findall(r'[©cC](?:&p)?\.?\s?(\d{4})')
)

In [184]:
## phonogram copyright dates: every four-digit year that follows `©`, `p` or `P`
## (optionally preceded by `c&`, optionally followed by a period and one whitespace
## character). Each row gets a list of matches - empty when there are none.
juvenile_df['PCopyrightDate'] = (
    juvenile_df['PublicationYear']
    .str.findall(r'\b(?:c&)?[©pP]\.?\s?(\d{4})')
)

In [185]:
## printing dates: a four-digit year inside square brackets that is not followed by
## `?` and is not introduced by a copyright / phonogram marker (`©`, `c`, `C`, `p`, `P`).
## Lookbehind assertions must be fixed width, so three are stacked to reject the
## marker directly before the year, the marker plus a period or a whitespace
## character, and the marker plus a period and a whitespace character. Values such
## as `[c. 2005]` were previously captured as printing dates.
## Because the leading `.*` is greedy, the last qualifying year in the value is the
## one captured.
## NOTE(review): any `c` / `p` letter in that position counts as a marker, e.g. the
## end of a word directly before the year - confirm this is acceptable for the data.
juvenile_df['PrintingDate'] = juvenile_df['PublicationYear'].str.extract(
    r'\[.*(?<![©cCpP])(?<![©cCpP][.\s])(?<![©cCpP]\.\s)(\d{4})(?!\?).*\]',
    expand = False,
)

In [186]:
## approximate date: a four-digit year written exactly as `[YYYY?]`
juvenile_df['ApproxDate'] = (
    juvenile_df['PublicationYear']
    .str.extract(r'\[(\d{4})\?\]')
)

For `CreationDate`, I will use the first of the following that is available, in order of preference:
* publication date
* printing date
* copyright date
* approximate date

In [None]:
## TODO:
    ## replace empty CopyrightDate & PCopyrightDate lists with nulls
    ## decide whether to keep the earliest or the latest copyright date
    ## create juvenile_df.OtherDate: if all other date columns are null, extract a 4-digit year from PublicationYear
    ## create juvenile_df.PublicationDate by coalescing OtherDate, PrintingDate, CopyrightDate, PCopyrightDate

In [191]:
## Display the frame with the derived columns added above (TitleNotes, CreatorName
## and the date columns); the rendered output shows only the first and last rows.
juvenile_df

Unnamed: 0,UsageClass,CheckoutType,MaterialType,CheckoutYear,CheckoutMonth,Checkouts,Title,ISBN,Creator,Subjects,Publisher,PublicationYear,TitleNotes,CreatorName,CopyrightDate,PCopyrightDate,PrintingDate,ApproxDate
0,Physical,Horizon,Book,2023,10,1,Don't Hate The Player,"1547605022, 9781547605026, 9781547605033","Nedd, Alexis","eSports Contests Fiction, High schools Fiction...","Bloomsbury YA,",2021.,Alexis Nedd.,Alexis Nedd,[],[],,
1,Physical,Horizon,Book,2023,10,1,Thanksgiving Graces,"0824956346, 9780824956349","Moulton, Mark Kimball","Stories in rhyme Juvenile fiction, Thanksgivin...","Ideals Children's Books,",[2011],written by Mark Kimball Moulton ; illustrated ...,Mark Kimball Moulton,[],[],2011,
2,Physical,Horizon,Book,2023,10,1,The Elephant Thief,"1338188437, 9781338188431","Kerr, Jane (Children's literature author)","Pickpockets Juvenile fiction, Zoo keepers Juve...","Chicken House, Scholastic Inc.,",2018.,Jane Kerr.,Jane Kerr,[],[],,
3,Physical,Horizon,Book,2023,10,1,Elsie's Bird,"0399252924, 9780399252921","Yolen, Jane","Prairies Juvenile fiction, Frontier and pionee...","Philomel Books,",[2010],Jane Yolen ; illustrated by David Small.,Jane Yolen,[],[],2010,
4,Physical,Horizon,Book,2023,10,6,The Ogre In The Hall,"1662640528, 9781662640520","Sorin, Céline","Monsters Juvenile fiction, Fear of the dark Ju...","Hippo Park, an imprint of Astra Publishing House,",[2023],story by Céline Sorin ; pictures by Pascal Lam...,Céline Sorin,[],[],2023,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6443523,Digital,OverDrive,Audiobook,2023,10,1,"You're A Bad Man, Mr. Gum!: Mr Gum Series, Boo...",9780061629488,Andy Stanton,Juvenile Fiction,HarperCollins Publishers Inc.,2008,,Andy Stanton,[],[],,
6443524,Physical,Horizon,Book,2023,10,5,I Feel! : A Book Of Emotions,"0358621240, 9780358621249","Medina, Juana, 1980-","Emotions Juvenile fiction, Emotions Fiction, P...","Versify, an imprint of HarperCollinsPublishers,",[2022],by Juana Medina.,Juana Medina,[],[],2022,
6443525,Physical,Horizon,Book,2023,10,3,The Big Bath House,"0593181956, 0593181964, 9780593181959, 9780593...","Maclear, Kyo, 1970-","Bathhouses Juvenile fiction, Families Japan Ju...","Random House Studio,",[2021],written by Kyo Maclear ; illustrated by Gracey...,Kyo Maclear,[],[],2021,
6443526,Digital,OverDrive,Ebook,2023,10,1,Pablo Picasso,9780711286252,Maria Isabel Sanchez Vegara,"Art, Biography & Autobiography, Juvenile Nonfi...",Quarto Publishing Group USA,2023,,Maria Isabel Sanchez Vegara,[],[],,
