In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
juvenile_df = pd.read_csv('../Data/juvenile.csv', dtype = {5: str, 7: str}, header = None)

## Rename Columns

In [4]:
juvenile_df.columns = ['UsageClass', 'CheckoutType', 'MaterialType', 'CheckoutYear', 'CheckoutMonth', 'Checkouts', 'Title', 'ISBN', 'Creator', 'Subjects', 'Publisher', 'PublicationYear']

## Adjust data types and minor formatting changes

In [6]:
juvenile_df['MaterialType'] = juvenile_df.MaterialType.str.title()

In [7]:
juvenile_df['Checkouts'] = juvenile_df.Checkouts.str.replace(',', '').astype(int)

In [8]:
juvenile_df['CheckoutDate'] = juvenile_df['CheckoutYear'].astype(str) + '-' + juvenile_df['CheckoutMonth'].astype(str) + '-01'

In [9]:
juvenile_df = juvenile_df.drop(['CheckoutType', 'CheckoutYear', 'CheckoutMonth'], axis = 1)

## Clean & standardize titles

#### Split title into 'Title' and 'TitleNotes'
* Delimiters: '` / `' & '` [by] `'

In [10]:
titleSplit1 = juvenile_df.Title.str.split(' / ', expand = True, n = 1).rename(columns = {0: 'Title1', 1: 'TitleNotes1'})

In [11]:
titleSplit2 = juvenile_df.Title.str.split(' / ', expand = True, n = 1)[0].str.split(' [by] ', regex = False, expand = True, n = 1).rename(columns = {0: 'Title2', 1: 'TitleNotes2'})

In [12]:
clean_titles = pd.merge(titleSplit1, titleSplit2, left_index = True, right_index = True)

In [13]:
clean_titles['TitleNotes'] = clean_titles['TitleNotes1'].astype(str) + clean_titles['TitleNotes2'].astype(str).replace('None', '')

In [14]:
juvenile_df[['Title', 'TitleNotes']] = clean_titles[['Title2', 'TitleNotes']]

#### Capitalize titles
* Capitalize first letter of each word in a title
  * NOT just title case, which capitalizes the first letter after apostrophes
    * `[' '.join([w.capitalize() for w in x]) for x in juvenile_df.Title.str.split(' ')]` could handle that, but didn't do the following
  * Make sure first character after a ( is capitalized
  * Keep all-caps words in all-caps

In [15]:
titles = []
for title in juvenile_df.Title:
    prev_c = ''
    cap_title = []
    for c in title:
        if prev_c in [' ', '[', '(']:
            cap_title.append(c.upper())
        else:
            cap_title.append(c)
        prev_c = c
    cap_title = ''.join(cap_title)
    titles.append(cap_title)

In [16]:
juvenile_df['Title'] = titles

In [17]:
juvenile_df.loc[juvenile_df.Title.str.lower().str.contains(r'^a wrinkle in time.*'), 'Title'].unique()

array(['A Wrinkle In Time', 'A Wrinkle In Time (Unabridged)',
       'A Wrinkle In Time, The Graphic Novel',
       'A Wrinkle In Time: A Wrinkle In Time Quintet Series, Book 1',
       'A Wrinkle In Time:  (Unabridged)',
       'A Wrinkle In Time, The Graphic Novel: Time Quintet, Book 1',
       'A Wrinkle In Time: Time Quintet, Book 1',
       'A Wrinkle In Time: Time Quartet, Book 1 (Unabridged) (Unabridged)',
       'A Wrinkle In Time [Text (Large Print)]',
       'A Wrinkle In Time [Sound Recording]',
       'A Wrinkle In Time: Time Quartet, Book 1 (Unabridged)'],
      dtype=object)

#### Remove extra comments
* `(Unabridged)`, `[Sound Recording]`, `[Videorecording]`, and `[Text (Large Print)]`  
Unabridged is assumed unless otherwise stated, and the format notes are captured by MaterialType

In [21]:
juvenile_df['Title'] = juvenile_df['Title'].str.replace('(Unabridged)', '').str.replace('[Sound Recording]', '').str.replace('[Videorecording]', '').str.replace('[Text (Large Print)]', '').str.strip().str.strip(':')

#### Individual Series Fixes

In [30]:
## A wrinkle in time:
juvenile_df['Title'] = juvenile_df['Title'].replace(r'\:\s(?:A\sWrinkle\sIn\s)?Time\sQu[ia][nr]tet(?:\sSeries)?,\sBook\s\d','', regex = True)

In [71]:
## Harry Potter
juvenile_df['Title'] = juvenile_df['Title'].replace(r'\:\sHarry\sPotter\sSeries,\sBook\s\d','', regex = True)
juvenile_df['Title'] = juvenile_df['Title'].str.replace('Half-blood Prince', 'Half-Blood Prince')

In [130]:
## Diary of a Wimpy Kid
juvenile_df['Title'] = juvenile_df['Title'].replace(r'Diary\sOf\sA\sWimpy\sKid[\.\:]\s', '', regex = True
                                      ).replace(r'\s\(Diary\sOf\sA\sWimpy\sKid\sBook\s\d+\)', '', regex = True
                                      ).replace(r'\:\sDiary\sOf\sA\sWimpy\sKid\sSeries,\sBook\d+', '', regex = True)

In [72]:
## correct for instances like 'J.K. Rowling' vs 'J. K. Rowling' - add space after all full stops
juvenile_df['Creator'] = juvenile_df['Creator'].replace(r'\.(?=\S)', '. ', regex = True)

In [131]:
## Fix titles that duplicate
juvenile_df['SingleTitle'] = juvenile_df['Title'].str.extract(r'(.+)\W\s\1')
juvenile_df['Title'] = np.where(juvenile_df['SingleTitle'].isna(), juvenile_df['Title'], juvenile_df['SingleTitle'])
juvenile_df = juvenile_df.drop(columns = ['SingleTitle'])

## Extract Creator names in `Firstname Lastname` format

In [133]:
juvenile_df['Creator'] = juvenile_df.Creator.str.strip(',')

In [134]:
lastnames = [re.search(r'^(.+?),', c).group(1) if str(c).find(',') != -1 else c for c in juvenile_df.Creator]

In [135]:
firstnames = [re.search(r'^(.+?),([A-Za-zÀ-ÿ\-\'\s.]+)', c).group(2) if str(c).find(',') != -1 else '' for c in juvenile_df.Creator]

In [136]:
fullnames = []
i = 0

for name in lastnames:
    if isinstance(name, float) or isinstance(firstnames[i], float):
        fullname = np.nan
    else:
        fullname = (str(firstnames[i]).strip() + ' ' + str(name).strip()).strip()
    fullnames.append(fullname)
    i += 1

In [137]:
juvenile_df['CreatorName'] = fullnames

#### Replace null CreatorNames  
These appear to be mostly compilations of various kinds, or things otherwise without a single creator.  Extracting editor/contributor names from the TitleNotes would be inconsistent, so I'll substitute the publisher to get a sense of similar origins.

In [138]:
juvenile_df['CreatorName'] = juvenile_df['CreatorName'].fillna(juvenile_df['Publisher'].str.extract(r'(^[\w\s]*)')[0])

## Extract dates 

#### Replace placeholders (`-`) with 0s, followed by `?` in incomplete years

In [139]:
incomplete_dates = juvenile_df.loc[(~juvenile_df.PublicationYear.isna()) & (juvenile_df.PublicationYear.str.contains('-')), 'PublicationYear'].str.extract(r'[\b\D](\d{2,3}\-{1,2})')

In [140]:
incomplete_dates = incomplete_dates.loc[~incomplete_dates[0].isna()]

In [141]:
incomplete_dates['new'] = incomplete_dates[0].str.replace('-', '0') + '?'

In [142]:
for i, r in incomplete_dates.iterrows():
    juvenile_df.loc[i, 'PublicationYear'] = juvenile_df.loc[i, 'PublicationYear'].replace(r[0], r['new']).replace('??', '?')

### Extract different types of PublicationYear values  

Examples of formatting varieties (from dataset FAQ): 
* `2005` — publication date
* `c. 2005` – copyright symbol
* `[2005]` – Printing date
* `p. 2005` – phonogram copyright symbol
* `2004, c. 2005` – publication and copyright date
* `2005-2007` – intervening years
* `[2005?]` – Approximate date

#### Copyright Dates
* Variants include: c. YYYY, C. YYYY, ©YYYY, © YYYY, cop. YYYY, c&p YYYY and similar
* Multiple dates may be listed, potentially in any order

In [143]:
juvenile_df['CopyrightList'] = juvenile_df.PublicationYear.str.findall(r'[©cC](?:op)?(?:&p)?\.?\s?(\d{4})')

In [144]:
juvenile_df['CopyrightList'] = juvenile_df.CopyrightList.fillna('').apply(list)

In [145]:
juvenile_df['CopyrightDate'] = [int(max(x)) if len(x) > 0 else np.nan for x in juvenile_df['CopyrightList']]

#### Phonogram Copyright Dates
* Variants include: p. YYYY, pYYYY, P.YYYY, c&p YYYY and similar
* Multiple dates could be listed, potentially in any order

In [146]:
juvenile_df['PCopyrightList'] = juvenile_df.PublicationYear.str.findall(r'\b(?:c&)?[pP]\.?\s?(\d{4})')

In [147]:
juvenile_df['PCopyrightList'] = juvenile_df.PCopyrightList.fillna('').apply(list)

In [148]:
juvenile_df['PCopyrightDate'] = [int(max(x)) if len(x) > 0 else np.nan for x in juvenile_df['PCopyrightList']]

#### Printing Dates
* Printing dates will always be enclosed in square brackets
* I do NOT want approximate dates (which are followed by question marks) or copyright dates

In [149]:
juvenile_df['PrintingDate'] = juvenile_df.PublicationYear.str.extract(r'\[.*(?<![©cCpP])[\.\s&opP]{0,5}(\d{4})(?!\?).*\]')

#### Approximate Dates
* Enclosed in brackets and followed by a question mark
* There may be a range of dates followed by a question mark `YYYY - YYYY?`  in which case I want to return the earliest (first) year of the range

In [150]:
juvenile_df['ApproxDate'] = juvenile_df.PublicationYear.str.extract(r'\[.*?(\d{4}).*\?\]')

#### Publication Dates
* Publication dates are noted by a lack of previous signifiers - they are NOT in brackets and NOT preceded by a copyright signifier

In [151]:
juvenile_df['PubDate'] = juvenile_df.PublicationYear.str.extract(r'(?<![©cCpP\[])[\.\s&opP]{0,5}(\d{4})(?!\s?[\?\]-])')

#### Other Dates
* Pull out any years that manage to avoid meeting any previous criteria

In [152]:
juvenile_df['OtherDateList'] = np.nan

In [153]:
juvenile_df['OtherDateList'] = np.where(juvenile_df.PubDate.isna() & juvenile_df.PrintingDate.isna() & juvenile_df.CopyrightDate.isna() & juvenile_df.PCopyrightDate.isna() & juvenile_df.ApproxDate.isna(), 
                                     juvenile_df.PublicationYear.str.findall(r'(\d{4})'),
                                     np.nan)

In [154]:
juvenile_df['OtherDateList'] = juvenile_df.OtherDateList.fillna('').apply(list)

In [155]:
juvenile_df['OtherDate'] = [int(max(x)) if len(x) > 0 else np.nan for x in juvenile_df['OtherDateList']]

### Consolidate to 'CreatedDate' for use in analysis

In [156]:
juvenile_df['CreatedDate'] = juvenile_df[['PubDate', 'PrintingDate', 'CopyrightDate', 'PCopyrightDate', 'ApproxDate', 'OtherDate']].bfill(axis = 1)['PubDate']

  juvenile_df['CreatedDate'] = juvenile_df[['PubDate', 'PrintingDate', 'CopyrightDate', 'PCopyrightDate', 'ApproxDate', 'OtherDate']].bfill(axis = 1)['PubDate']


## Group MaterialTypes

In [157]:
phys_books = ['Book', 'Regprint', 'Largeprint', 'Regprint, Sounddisc']
ebooks = ['Ebook']
audiobooks = ['Audiobook', 'Er', 'Er, Soundrec', 'Er, Sounddisc', 'Sounddisc', 'Soundrec', 'Soundcass', 'Er, Sounddisc, Soundrec']
music = ['Musicsndrec', 'Music']
videos = ['Visual', 'Video', 'Videocass', 'Er, Videodisc', 'Videodisc']
other = ['Kit', 'Unspecified', 'Sounddisc, Videodisc', 'Mixed', 'Regprint, Videorec', 'Flashcard, Sounddisc']

In [158]:
juvenile_df['MaterialCategory'] = np.where(juvenile_df.MaterialType.isin(phys_books), 'Book',
                                             np.where(juvenile_df.MaterialType.isin(ebooks), 'E-book',
                                                      np.where(juvenile_df.MaterialType.isin(audiobooks), 'Audiobook',
                                                               np.where(juvenile_df.MaterialType.isin(music), 'Music',
                                                                        np.where(juvenile_df.MaterialType.isin(videos), 'Video',
                                                                                 np.where(juvenile_df.MaterialType.isin(other), 'Other', np.nan))))))

## Export cleaned data

In [159]:
juvenile_df.to_csv('../Data/juvenile_clean.csv')