Importing required Libraries

In [30]:
import numpy as np
import pandas as pd 

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import os
print(os.listdir("../input"))

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


['Library Collection Inventory FAQs.pdf', 'library-collection-inventory.csv', 'socrata_metadata.json']


Readying the data for analysis (as per Kaggle bot Introductory kernel)

In [31]:
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have between 1 and 50 unique values
    nRow, nCol = df.shape
    columnNames = list(df)
    nGraphRow = (nCol + nGraphPerRow - 1) / nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

In [32]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    filename = df.dataframeName
    df = df.dropna('columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

In [33]:
#Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna('columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    for i, j in zip(*plt.np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

Readying in the data from the csv file

In [34]:
nRowsRead = 1000 # specify 'None' if want to read whole file
# library-collection-inventory.csv has 26817320 rows in reality, but we are only loading/previewing the first 1000 rows
df1 = pd.read_csv('../input/library-collection-inventory.csv', delimiter=',', nrows = nRowsRead)
df1.dataframeName = 'library-collection-inventory.csv'
nRow, nCol = df1.shape

First five rows of the dataset

In [35]:
df1.head()

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,Publisher,Subjects,ItemType,ItemCollection,FloatingItem,ItemLocation,ReportDate,ItemCount
0,3011076,A tale of two friends / adapted by Ellie O'Rya...,"O'Ryan, Ellie","1481425730, 1481425749, 9781481425735, 9781481...",2014.,"Simon Spotlight,","Musicians Fiction, Bullfighters Fiction, Best ...",jcbk,ncrdr,Floating,qna,2017-09-01T00:00:00.000,1
1,2248846,"Naruto. Vol. 1, Uzumaki Naruto / story and art...","Kishimoto, Masashi, 1974-",1569319006,"2003, c1999.","Viz,","Ninja Japan Comic books strips etc, Comic book...",acbk,nycomic,,lcy,2017-09-01T00:00:00.000,1
2,3209270,"Peace, love & Wi-Fi : a ZITS treasury / by Jer...","Scott, Jerry, 1955-","144945867X, 9781449458676",2014.,"Andrews McMeel Publishing,",Duncan Jeremy Fictitious character Comic books...,acbk,nycomic,,bea,2017-09-01T00:00:00.000,1
3,1907265,The Paris pilgrims : a novel / Clancy Carlile.,"Carlile, Clancy, 1930-",0786706155,c1999.,"Carroll & Graf,","Hemingway Ernest 1899 1961 Fiction, Biographic...",acbk,cafic,,cen,2017-09-01T00:00:00.000,1
4,1644616,"Erotic by nature : a celebration of life, of l...",,094020813X,"1991, c1988.","Red Alder Books/Down There Press,","Erotic literature American, American literatur...",acbk,canf,,cen,2017-09-01T00:00:00.000,1


Checking missing values

In [36]:
df1.isnull().sum()

BibNum               0
Title               12
Author             175
ISBN               233
PublicationYear     20
Publisher           20
Subjects            40
ItemType             0
ItemCollection       0
FloatingItem       850
ItemLocation         0
ReportDate           0
ItemCount            0
dtype: int64

Looking at the missing vales in detail

In [37]:
row = df1.isnull().any(axis = 1)
column = df1.isnull().any(axis = 0)
df1.loc[row,column]

Unnamed: 0,Title,Author,ISBN,PublicationYear,Publisher,Subjects,FloatingItem
1,"Naruto. Vol. 1, Uzumaki Naruto / story and art...","Kishimoto, Masashi, 1974-",1569319006,"2003, c1999.","Viz,","Ninja Japan Comic books strips etc, Comic book...",
2,"Peace, love & Wi-Fi : a ZITS treasury / by Jer...","Scott, Jerry, 1955-","144945867X, 9781449458676",2014.,"Andrews McMeel Publishing,",Duncan Jeremy Fictitious character Comic books...,
3,The Paris pilgrims : a novel / Clancy Carlile.,"Carlile, Clancy, 1930-",0786706155,c1999.,"Carroll & Graf,","Hemingway Ernest 1899 1961 Fiction, Biographic...",
4,"Erotic by nature : a celebration of life, of l...",,094020813X,"1991, c1988.","Red Alder Books/Down There Press,","Erotic literature American, American literatur...",
5,Children of Cambodia's killing fields : memoir...,,"0300068395, 0300078730",c1997.,"Yale University Press,","Political atrocities Cambodia, Children Cambod...",
6,Anti-Zionism : analytical reflections / editor...,,091559773X,c1989.,"Amana Books,","Berger Elmer 1908 1996, Zionism Controversial ...",
7,Hard-hearted Highlander / Julia London.,"London, Julia","0373789998, 037380394X, 9780373789993, 9780373...",[2017],"HQN,","Man woman relationships Fiction, Betrothal Fic...",
8,The Sandcastle Empire / Kayla Olson.,"Olson, Kayla","0062484877, 9780062484871",2017.,"HarperTeen,","Survival Juvenile fiction, Islands Juvenile fi...",
9,Doctor Who. The return of Doctor Mysterio / BB...,,,[2017],"BBC Worldwide,","Doctor Fictitious character Drama, Time travel...",Floating
10,Burnt toast makes you sing good : a memoir of ...,"Flinn, Kathleen","067001544X, 9780670015443",2014.,"Viking,","Cooking American Midwestern style, Flinn Kathl...",


Entries with missing Title of the Book

In [38]:
df1[df1['Title'].isnull()].index.tolist()

[64, 68, 97, 129, 149, 241, 466, 628, 787, 798, 902, 981]

Dropping all the 12 entries with missing title

In [39]:
df1.drop(index = [64, 68, 97, 129, 149, 241, 466, 628, 787, 798, 902, 981], inplace = True)

Imputing missing values for the remaining columns

In [40]:

df1['FloatingItem'].fillna('Not Applicable', inplace = True)

df1['Publisher'].fillna('Unknown', inplace = True)
df1['Author'].fillna('Unknown', inplace = True)
df1['Subjects'].fillna('Unknown', inplace = True)

df1['ISBN'].fillna(0, inplace = True)
df1['PublicationYear'].fillna(0, inplace = True)


Checking for missing values after imputation

In [41]:
df1.isnull().sum()

BibNum             0
Title              0
Author             0
ISBN               0
PublicationYear    0
Publisher          0
Subjects           0
ItemType           0
ItemCollection     0
FloatingItem       0
ItemLocation       0
ReportDate         0
ItemCount          0
dtype: int64

Unique item types, item collections and Reporting Dates

In [42]:
print('Item collections : {}'.format(df1['ItemCollection'].nunique()))
print('Item types : {}'.format(df1['ItemType'].nunique()))
print('Report Dates(unique) : {}'.format(df1['ReportDate'].nunique()))

Item collections : 95
Item types : 10
Report Dates(unique) : 1


All the data has been reported on a single date

**Analyzing data by Publishing date**

Slicing out the required data for 2014

In [43]:
df_2014 = df1.ix[df1['PublicationYear'] == '2014.',[0,1,5]] 
df_2014

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,BibNum,Title,Publisher
0,3011076,A tale of two friends / adapted by Ellie O'Rya...,"Simon Spotlight,"
2,3209270,"Peace, love & Wi-Fi : a ZITS treasury / by Jer...","Andrews McMeel Publishing,"
10,3018388,Burnt toast makes you sing good : a memoir of ...,"Viking,"
34,2963138,Promise land : my journey through America's se...,"Simon & Schuster,"
57,2987954,The Romanov sisters : the lost lives of the da...,"St. Martin's Press,"
102,3032381,The forks over knives plan : how to transition...,"Simon & Schuster,"
105,2973939,Kicking the sky : a novel / Anthony De Sa.,"Algonquin Books of Chapel Hill,"
116,3044157,Gluten-free artisan bread in five minutes a da...,"Thomas Dunne Books/St. Martin's Press,"
134,3056858,Strangers / Bill Pronzini.,"Thorndike Press, a part of Gale, Cengage Learn..."
303,2953046,Aunt Dimity and the wishing well / Nancy Ather...,"Viking,"


Books published in 2014 and no. of distinct publishers in the same year

In [44]:
print('Books Published in 2014 : {}'.format(df_2014['BibNum'].count()))
print('Unique publishers of books in 2014 : {}'.format(df_2014['Publisher'].count()))

Books Published in 2014 : 33
Unique publishers of books in 2014 : 33


All the books have been published by different publishers

Slicing the data for the year 2016

In [45]:
df_2016 = df1.ix[df1['PublicationYear'] == '2016.',[0,1,5]] 
df_2016

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,BibNum,Title,Publisher
16,3167678,The Alchemists' Council / Cynthea Masson.,"ECW Press,"
18,3165211,The aromatherapy garden : growing fragrant pla...,"Timber Press,"
20,3199800,Naked '76 / Kevin Brooks.,"Carolrhoda Labs,"
44,3259688,The English daughter / Maggie Wadey.,"Sandstone Press Ltd,"
50,3160162,Finding luck / by Kristin Earhart ; illustrate...,"Aladdin,"
59,3167001,Melba's American comfort : 100 recipes from my...,"Atria Books,"
70,3220551,"Da da da! Die di chao ren / wen/tu, Gongxi Day...","Xiao lu wen hua,"
182,3146689,A gathering of shadows / V.E. Schwab.,"Tor,"
185,3170881,Same but different : teen life on the autism e...,"Scholastic Press,"
187,3199176,Let the circle be unbroken / Mildred D. Taylor.,"Puffin Books,"


In [46]:
print('Books Published in 2016 : {}'.format(df_2016['BibNum'].count()))
print('Unique publishers of books in 2016 : {}'.format(df_2016['Publisher'].count()))

Books Published in 2016 : 47
Unique publishers of books in 2016 : 47


**Analyzing the data by item types**

In [47]:
df1['ItemType'].value_counts()

acbk     464
jcbk     258
arbk      88
accd      79
acdvd     63
jcdvd     20
acmus     11
jccd       3
acmap      1
armus      1
Name: ItemType, dtype: int64

ItemType is a code from the catalog record that describes the type of item. Some of the more common codes are: acbk (adult book), acdvd (adult DVD), jcbk (children's book), accd (adult CD) 

Slicing out the data for the adult book item from the item type column

In [48]:
adult_book = df1.ix[df1['ItemType'] == 'acbk',[1,2,4,5,6,8]]
adult_book

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,Title,Author,PublicationYear,Publisher,Subjects,ItemCollection
1,"Naruto. Vol. 1, Uzumaki Naruto / story and art...","Kishimoto, Masashi, 1974-","2003, c1999.","Viz,","Ninja Japan Comic books strips etc, Comic book...",nycomic
2,"Peace, love & Wi-Fi : a ZITS treasury / by Jer...","Scott, Jerry, 1955-",2014.,"Andrews McMeel Publishing,",Duncan Jeremy Fictitious character Comic books...,nycomic
3,The Paris pilgrims : a novel / Clancy Carlile.,"Carlile, Clancy, 1930-",c1999.,"Carroll & Graf,","Hemingway Ernest 1899 1961 Fiction, Biographic...",cafic
4,"Erotic by nature : a celebration of life, of l...",Unknown,"1991, c1988.","Red Alder Books/Down There Press,","Erotic literature American, American literatur...",canf
5,Children of Cambodia's killing fields : memoir...,Unknown,c1997.,"Yale University Press,","Political atrocities Cambodia, Children Cambod...",canf
6,Anti-Zionism : analytical reflections / editor...,Unknown,c1989.,"Amana Books,","Berger Elmer 1908 1996, Zionism Controversial ...",canf
7,Hard-hearted Highlander / Julia London.,"London, Julia",[2017],"HQN,","Man woman relationships Fiction, Betrothal Fic...",nanew
8,The Sandcastle Empire / Kayla Olson.,"Olson, Kayla",2017.,"HarperTeen,","Survival Juvenile fiction, Islands Juvenile fi...",nynew
10,Burnt toast makes you sing good : a memoir of ...,"Flinn, Kathleen",2014.,"Viking,","Cooking American Midwestern style, Flinn Kathl...",nanf
13,The only child : a novel / Andrew Pyper.,"Pyper, Andrew",2017.,"Simon & Schuster,","Forensic psychiatrists Fiction, Psychological ...",cafic


Most books published and printed in this item type are during the years 2011 to 2017

In [49]:
adult_book['PublicationYear'].value_counts().sort_values(ascending = False).head(10)

[2016]    25
2015.     24
2016.     23
2014.     23
2017.     18
c2010.    17
[2015]    16
c2011.    15
2013.     15
[2017]    14
Name: PublicationYear, dtype: int64

"ItemCollection" is a collection code from the catalog record which describes the item. Here are some common examples: nanf (adult non-fiction), nafic(adult fiction), ncpic(children's picture book),  nycomic (Young adult comic books). 

Types of collection in the adult books category 

In [50]:
adult_book['ItemCollection'].value_counts().head(10)

canf       151
nanf        93
nafic       46
cafic       26
caln        18
nanew       12
nyfic       12
camys       11
nycomic     11
namys        9
Name: ItemCollection, dtype: int64

Publisher with the most number of publications in this item category

In [51]:
adult_book['Publisher'].value_counts().head()

Random House,               8
Thorndike Press,            8
St. Martin's Press,         8
Oxford University Press,    6
Simon & Schuster,           5
Name: Publisher, dtype: int64

Excluding the unknown counts (where the subject of the book is unknown or belong to multipe categories). Leadership , Short stories and domestic fiction and art have 2 publictions each in the adult book item type 

In [52]:
adult_book['Subjects'].value_counts().head()

Unknown                                                                                                           19
Leadership                                                                                                         2
Short stories                                                                                                      2
Sisters Fiction, Domestic fiction                                                                                  2
Art Study and teaching Higher United States, Chicago Judy 1939 Themes motives, Artists United States Biography     2
Name: Subjects, dtype: int64

Total Books in the category and unique books in the category

In [57]:
print('Total books in the category : {}'.format(adult_book['Title'].count()))
print('Unique books in this category : {}'.format(adult_book['Title'].nunique()))

Total books in the category : 464
Unique books in this category : 463


A book with the same title has two entries

Distinct authors and no. of publications

In [54]:
adult_book['Author'].value_counts().head()

Unknown                    41
Patterson, James, 1947-     2
Chicago, Judy, 1939-        2
Wolfe, Ethan J.             1
Hrachovec, Anna             1
Name: Author, dtype: int64

Two authors with more than one publications

In [55]:
adult_book.loc[adult_book['Author'] == 'Chicago, Judy, 1939-']

Unnamed: 0,Title,Author,PublicationYear,Publisher,Subjects,ItemCollection
295,Institutional time : a critique of studio art ...,"Chicago, Judy, 1939-",[2014],"The Monacelli Press,","Art Study and teaching Higher United States, C...",nanf
460,Institutional time : a critique of studio art ...,"Chicago, Judy, 1939-",[2014],"The Monacelli Press,","Art Study and teaching Higher United States, C...",canf


Coincidentally it is Chicago, Judy's 1939 book 'Institutional time' that has two entries both under different collection type

In [56]:
adult_book.loc[adult_book['Author'] == 'Patterson, James, 1947-']

Unnamed: 0,Title,Author,PublicationYear,Publisher,Subjects,ItemCollection
416,The black book / James Patterson and David Ellis.,"Patterson, James, 1947-",2017.0,"Little, Brown and Company,","Police Family relationships Fiction, Murder In...",cafic
938,Bullseye / James Patterson and Michael Ledwidge.,"Patterson, James, 1947-",2016.0,"Little, Brown and Company,","Bennett Michael Fictitious character Fiction, ...",calpfic


So, James Patterson is the only one to have authored two books in this item type in the collection.

This was just introductory exploratory analysis and some data cleaning. More to come in this kernel with time.
Any suggestions or questions regarding the code are welcome! 
Happy Kaggling..