In [2]:
# data source: books.csv

import pandas as pd

# Load the complete books.csv file into `df`, then print a per-column summary
# (non-null counts and dtypes) to see which columns are sparsely populated.
df = pd.read_csv(filepath_or_buffer='books.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8287 entries, 0 to 8286
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Identifier              8287 non-null   int64  
 1   Edition Statement       773 non-null    object 
 2   Place of Publication    8287 non-null   object 
 3   Date of Publication     8106 non-null   object 
 4   Publisher               4092 non-null   object 
 5   Title                   8287 non-null   object 
 6   Author                  6509 non-null   object 
 7   Contributors            8287 non-null   object 
 8   Corporate Author        0 non-null      float64
 9   Corporate Contributors  0 non-null      float64
 10  Former owner            1 non-null      object 
 11  Engraver                0 non-null      float64
 12  Issuance type           8287 non-null   object 
 13  Flickr URL              8287 non-null   object 
 14  Shelfmarks              8287 non-null   object 

# A. Filtering

In [3]:
# Filtering: build `df1` from `df` without the seven columns listed below.
# `drop` returns a new frame, so `df` itself is left unchanged.
df1 = df.drop(
    labels=[
        "Edition Statement",
        "Corporate Author",
        "Corporate Contributors",
        "Former owner",
        "Engraver",
        "Issuance type",
        "Shelfmarks",
    ],
    axis="columns",
)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8287 entries, 0 to 8286
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Identifier            8287 non-null   int64 
 1   Place of Publication  8287 non-null   object
 2   Date of Publication   8106 non-null   object
 3   Publisher             4092 non-null   object
 4   Title                 8287 non-null   object
 5   Author                6509 non-null   object
 6   Contributors          8287 non-null   object
 7   Flickr URL            8287 non-null   object
dtypes: int64(1), object(7)
memory usage: 518.1+ KB


In [4]:
# Alternative to dropping columns after the fact: read only the wanted
# columns from the file by passing their names through `usecols`.

df2 = pd.read_csv(
    'books.csv',
    usecols=[
        "Identifier",
        "Place of Publication",
        "Date of Publication",
        "Publisher",
        "Title",
        "Author",
        "Contributors",
        "Flickr URL",
    ],
)
df2.head(50)


Unnamed: 0,Identifier,Place of Publication,Date of Publication,Publisher,Title,Author,Contributors,Flickr URL
0,206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,"FORBES, Walter.",http://www.flickr.com/photos/britishlibrary/ta...
1,216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",http://www.flickr.com/photos/britishlibrary/ta...
2,218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",http://www.flickr.com/photos/britishlibrary/ta...
3,472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.","Appleyard, Ernest Silvanus.",http://www.flickr.com/photos/britishlibrary/ta...
4,480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.","BROOME, John Henry.",http://www.flickr.com/photos/britishlibrary/ta...
5,481,London,1875,William Macintosh,"[The World in which I live, and my place in it...","A., E. S.","BROOME, John Henry.",http://www.flickr.com/photos/britishlibrary/ta...
6,519,London,1872,The Author,Lagonells. By the author of Darmayne (F. E. A....,"A., F. E.","ASHLEY, Florence Emily.",http://www.flickr.com/photos/britishlibrary/ta...
7,667,"pp. 40. G. Bryan & Co: Oxford, 1898",,,"The Coming of Spring, and other poems. By J. A...","A., J.|A., J.","ANDREWS, J. - Writer of Verse",http://www.flickr.com/photos/britishlibrary/ta...
8,874,London],1676,,"A Warning to the inhabitants of England, and L...",Remaʿ.,"ADAMS, Mary.",http://www.flickr.com/photos/britishlibrary/ta...
9,1143,London,1679,,A Satyr against Vertue. (A poem: supposed to b...,"A., T.","OLDHAM, John.",http://www.flickr.com/photos/britishlibrary/ta...


# B. Tidying Up the Data

In [6]:
# Tidy 'Date of Publication': keep only the four digits at the very start of
# each entry (e.g. "1879 [1878]" -> "1879"). Entries that do not begin with
# four digits yield NaN. The result is a Series of strings, not numbers.

df3 = df2.loc[:, "Date of Publication"].str.extract(pat=r'^(\d{4})', expand=False)
df3



0       1879
1       1868
2       1869
3       1851
4       1857
        ... 
8282    1838
8283    1831
8284     NaN
8285    1834
8286    1834
Name: Date of Publication, Length: 8287, dtype: object

In [8]:
# Display the extracted year strings again; the values shown match the
# output of the cell that created `df3`.
df3

0       1879
1       1868
2       1869
3       1851
4       1857
        ... 
8282    1838
8283    1831
8284     NaN
8285    1834
8286    1834
Name: Date of Publication, Length: 8287, dtype: object

In [11]:
# Convert the extracted year strings in `df3` to numbers and write them back
# into df2's 'Date of Publication' column. Rows without an extractable year
# are NaN, which is why the column ends up as float64 rather than an integer.

df2['Date of Publication'] = pd.to_numeric(df3)
# Show the resulting dtype so this cell's output is reproduced on a fresh run.
df2['Date of Publication'].dtype

dtype('float64')

# C. Tidying with applymap()

In [35]:
# data source: uniplaces.txt

# Change the data to have 3 columns containing the state, city, and university

import pandas as pd

# Read uniplaces.txt as tab-separated text with no header row, so the first
# line of the file is kept as data instead of being used for column names.
df_uni = pd.read_table("uniplaces.txt", header=None)




In [43]:
# Name the single column of `df_uni` "State" (modifies the frame in place),
# then display it.
# NOTE(review): the rows mix state headers such as "Alabama[edit]" with
# "Town (University)" entries, so "State" describes only some of the rows
# until the data is split into separate state / city / university columns.
df_uni.columns = ["State"]
df_uni

Unnamed: 0,State
0,Alabama[edit]
1,Auburn (Auburn University)[1]
2,Florence (University of North Alabama)
3,Jacksonville (Jacksonville State University)[2]
4,Livingston (University of West Alabama)[2]
...,...
562,Stevens Point (University of Wisconsin–Stevens...
563,Waukesha (Carroll University)
564,Whitewater (University of Wisconsin–Whitewater...
565,Wyoming[edit]
