In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the juvenile checkouts extract. The file is read with header=None, so columns
# are positional until they are renamed; positions 5 and 7 (named Checkouts and ISBN
# by the rename cell) are forced to str instead of letting pandas infer a dtype.
juvenile_df = pd.read_csv('../Data/juvenile.csv', dtype = {5: str, 7: str}, header = None)

## Rename Columns

In [3]:
# Give the twelve positional columns their names, in file order.
juvenile_df.columns = [
    'UsageClass',
    'CheckoutType',
    'MaterialType',
    'CheckoutYear',
    'CheckoutMonth',
    'Checkouts',
    'Title',
    'ISBN',
    'Creator',
    'Subjects',
    'Publisher',
    'PublicationYear',
]

## Adjust capitalization and data types

In [4]:
# Title-case the material type labels.
juvenile_df['MaterialType'] = juvenile_df['MaterialType'].str.title()

In [5]:
# Split each title at its first ' / ': the left part stays in Title and whatever
# follows the separator goes to the new TitleNotes column.
juvenile_df[['Title', 'TitleNotes']] = juvenile_df['Title'].str.split(' / ', n=1, expand=True)

In [6]:
# Capitalize every space-separated word of the title (first letter upper, rest lower).
# Non-string titles (e.g. NaN for a missing title) are passed through untouched;
# iterating over them, as the split-then-join version did, raises a TypeError.
juvenile_df['Title'] = [
    ' '.join(word.capitalize() for word in title.split(' ')) if isinstance(title, str) else title
    for title in juvenile_df['Title']
]

## To Do: capitalize first character after a (, use title case unless there's an apostrophe, keep all-caps words in all-caps?

In [7]:
# Checkouts was read as text; drop the thousands separators and convert to int.
# regex=False makes the comma a literal on every pandas version (the default for
# `regex` differs between pandas 1.x and 2.x).
juvenile_df['Checkouts'] = juvenile_df['Checkouts'].str.replace(',', '', regex=False).astype(int)

## Extract Creator names in `Firstname Lastname` format

In [8]:
##To Do - look into juvenile_df.loc[juvenile_df.Creator.isna()]

In [9]:
# Remove leading and trailing commas from the creator string.
juvenile_df['Creator'] = juvenile_df['Creator'].str.strip(',')

In [10]:
# Last name = the text before the first comma. Values whose string form contains no
# comma (including missing creators) are carried through unchanged.
lastnames = [
    creator if ',' not in str(creator) else re.search(r'^(.+?),', creator).group(1)
    for creator in juvenile_df['Creator']
]

In [28]:
def _first_name(creator):
    """Return the name text that follows a comma in `creator`, or '' if there is none.

    The captured run may contain letters (including À-ÿ), hyphens, apostrophes,
    whitespace and periods. A creator without a comma, or one where no comma is
    followed by such a run (e.g. 'Name,1950-'), yields '' instead of raising.
    """
    if ',' not in str(creator):
        return ''
    match = re.search(r'^(.+?),([A-Za-zÀ-ÿ\-\'\s.]+)', creator)
    # re.search returns None when nothing after a comma fits the name character class.
    return match.group(2) if match else ''


firstnames = [_first_name(creator) for creator in juvenile_df['Creator']]

In [29]:
# Build 'Firstname Lastname' strings by pairing the two parallel lists.
# A missing creator arrives here as a non-string last name (NaN); keep it missing
# rather than letting str() turn it into the literal name 'nan'.
fullnames = [
    (str(first).strip() + ' ' + last.strip()).strip() if isinstance(last, str) else np.nan
    for first, last in zip(firstnames, lastnames)
]

In [13]:
# Store the assembled names as a new column; `fullnames` was built by iterating the
# Creator-derived lists in order, so it is assigned positionally.
juvenile_df['CreatorName'] = fullnames

## Extract dates

#### Replace placeholders (`-`) with 0s, followed by `?` in incomplete years

In [14]:
# From the PublicationYear values that contain a dash, pull out a partial year:
# 2-3 digits followed by 1-2 dash placeholders (e.g. '199-' or '19--').
# `(?<!\d)` requires that the digits are not the tail of a longer number while still
# matching at the very start of the string. The earlier `[\b\D]` prefix could not:
# inside a character class `\b` is a backspace, so a preceding character was mandatory.
# na=False drops missing years from the mask.
incomplete_dates = (
    juvenile_df.loc[
        juvenile_df['PublicationYear'].str.contains('-', regex=False, na=False),
        'PublicationYear',
    ]
    .str.extract(r'(?<!\d)(\d{2,3}-{1,2})')
)

In [15]:
# Keep only the rows where a partial year was actually extracted.
incomplete_dates = incomplete_dates.loc[incomplete_dates[0].notna()]

In [16]:
# Replacement text: every dash placeholder becomes a 0 and a '?' is appended.
incomplete_dates['new'] = (
    incomplete_dates[0].str.replace('-', '0')
    + '?'
)

In [17]:
# Write each zero-filled year back into PublicationYear, row by row, and collapse a
# doubled '??' (the source value already had a '?') into a single one.
for row_label, partial, filled in zip(incomplete_dates.index, incomplete_dates[0], incomplete_dates['new']):
    current = juvenile_df.loc[row_label, 'PublicationYear']
    juvenile_df.loc[row_label, 'PublicationYear'] = current.replace(partial, filled).replace('??', '?')

#### Extract different types of PublicationYear values  

Examples of formatting varieties (from dataset FAQ): 
* `2005` — publication date
* `c. 2005` – copyright symbol
* `[2005]` – Printing date
* `p. 2005` – phonogram copyright symbol
* `2004, c. 2005` – publication and copyright date
* `2005-2007` – intervening years
* `[2005?]` – Approximate date

In [56]:
## copyright dates
## Every 4-digit year introduced by a copyright signifier (©, c, C, cop, c&p), with an
## optional period and an optional single whitespace before the digits; one list per row.
## `(?<![A-Za-z])` stops the final letter of an ordinary word (e.g. "Dec. 2005",
## "Inc. 2005") from being read as the signifier.
juvenile_df['CopyrightDate'] = juvenile_df.PublicationYear.str.findall(
    r'(?<![A-Za-z])[©cC](?:op)?(?:&p)?\.?\s?(\d{4})'
)

In [19]:
## phonogram copyright dates
## Every 4-digit year introduced by p / P (optionally written c&p) at a word boundary,
## with an optional period and an optional single whitespace; one list per row.
juvenile_df['PCopyrightDate'] = juvenile_df['PublicationYear'].str.findall(
    r'\b(?:c&)?[pP]\.?\s?(\d{4})'
)

In [20]:
## printing dates
## A 4-digit year inside square brackets that is not followed by `?` and is not
## introduced by a copyright signifier. Python lookbehinds must be fixed width, so the
## signifier is rejected by three stacked lookbehinds: directly adjacent ("c2005"),
## one separator ("c.2005" / "c 2005"), and period plus whitespace ("c. 2005",
## "cop. 2005").
juvenile_df['PrintingDate'] = juvenile_df.PublicationYear.str.extract(
    r'\[.*(?<![©cCpP])(?<![©cCpP][.\s])(?<![©cCpP]\.\s)(\d{4})(?!\?).*\]'
)

In [21]:
## approximate date
## A bracketed 4-digit year immediately followed by a question mark, e.g. `[2005?]`.
juvenile_df['ApproxDate'] = juvenile_df['PublicationYear'].str.extract(r'\[(\d{4})\?\]')

In [45]:
# Frequency table of the raw PublicationYear strings, as a two-column frame.
PubYears = juvenile_df['PublicationYear'].value_counts().reset_index()

In [48]:
# Character length of each distinct PublicationYear string.
PubYears['len'] = PubYears['PublicationYear'].str.len()

In [53]:
# The 20 longest PublicationYear strings, in ascending order of length.
PubYearsSort = PubYears.sort_values(by='len').tail(20)

In [54]:
# Display the longest PublicationYear strings to inspect unusual formats.
PubYearsSort

Unnamed: 0,PublicationYear,count,len
1610,"DL 2015, cop. 2015.",15,19
1341,"DL 2016, cop. 2016.",37,19
542,"[2004], c1962-c1963.",145,20
1512,[between 2000-2009?],23,20
2023,[between 1960-1969?],1,20
1657,/[Minguo] 103 [2014],12,20
948,"2000 printing, c1999.",74,21
1636,/[Minguo] 103 [2014].,13,21
1247,Childrens books c2000.,44,22
1404,2005 (second printing),31,22


In [55]:
## publication date (exploratory preview on the PubYearsSort sample only; nothing is assigned)
## Extracts the first 4-digit run that is not followed by an optional whitespace and then `?`, `]` or `-`.
## NOTE(review): the pattern does not look at what precedes the digits, so a year after `[` or a
## copyright signifier can still be captured -- in the preview output, "[2004], c1962-c1963."
## yields 1963. Lookbehinds are still needed before this can populate a publication-date column.
PubYearsSort.PublicationYear.str.extract(r'(\d{4})(?!\s?[\?\]-])')

Unnamed: 0,0
1610,2015.0
1341,2016.0
542,1963.0
1512,
2023,
1657,
948,2000.0
1636,
1247,2000.0
1404,2005.0


Copyright signifiers:
* [©cCpP]\.?\s?
* [cC]op\.?\s?
* [cC]&[pP]

In [22]:
##TODO: 
    ## replace empty CopyrightDate & PCopyrightDate values with nulls
    ## decide whether to keep earliest or latest copyright dates
    ## create juvenile_df.OtherDate = iff all other date columns are null, extract 4-digit from PublicationYear
    ## create juvenile_df.CreationDate = coalesce PubDate, PrintingDate, CopyrightDate, PCopyrightDate, OtherDate

In [23]:
# Preview: swap empty CopyrightDate lists for NaN and keep every other value as is.
# (The former inner np.where returned CopyrightDate on both branches, so it was a no-op;
# choosing between earliest and latest date for multi-date rows is still undecided.)
np.where(juvenile_df.CopyrightDate.str.len() == 0, np.nan, juvenile_df.CopyrightDate)

array([nan, nan, nan, ..., nan, nan, nan], dtype=object)

In [24]:
# Preview: for rows holding more than one copyright year, the largest year string.
[
    max(years)
    for years in juvenile_df.loc[juvenile_df['CopyrightDate'].str.len() > 1, 'CopyrightDate']
]

['1923',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '1923',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2006',
 '2007',
 '2006',
 '2010',
 '1963',
 '1985',
 '1979',
 '2009',
 '2002',
 '2000',
 '1994',
 '1997',
 '2000',
 '1995',
 '2009',
 '1985',
 '2002',
 '2010',
 '2004',
 '1979',
 '1975',
 '1985',
 '1997',
 '1963',
 '1994',
 '2010',
 '1985',
 '1997',
 '2002',
 '1979',
 '2009',
 '1975',
 '1995',
 '2000',
 '1994',
 '1963',
 '2009',
 '2000',
 '1997',
 '2009',
 '1975',
 '2002',
 '1995',
 '1979',
 '1985',
 '1963',
 '2010',
 '1994',
 '2002',
 '2000',
 '2002',
 '2000',
 '1994',
 '1963',
 '2002',
 '1979',
 '2000',
 '1994',
 '1963',
 '2000',
 '1923',
 '2002',
 '1979',
 '1994',
 '1979',
 '2000',
 '2002',
 '1963',
 '1923',
 '2000',
 '1979',
 '2002',
 '2002',
 '1963',
 '1994',
 '1979',
 '2000',
 '2002',
 '1963',
 '1994',
 '2000',
 '1963',
 '1979',
 '2002',
 '1963',
 