# Pulling subject/genre information

This file contains the code used to pull subject information and merge it into the Goodreads data.

The main part of this code will later be included in an adjacent python script that doesn't include all the testing and fluff.

In [1]:
import json
import pandas as pd
import numpy as np
import re

In [37]:
## Initial arguments at the top
input_file = input("Please input file name (no .csv) containing book meta data:\n")
min_k = input("Please select the minimum number of occurences of a subject to be included as a feature\n"
            + "(subject features with occurences below the specified value will be dropped):\n")
output_file = input("Please input file name (no .csv) to output final dataframe to:\n")

Please input file name (no .csv) containing book meta data:
book_data_full_isbn
Please select the minimum number of occurences of a subject to be included as a feature
(subject features with occurences below the specified value will be dropped):
3
Please input file name (no .csv) to output final dataframe to:
final


## Goodreads Data

### Pull book data

In [38]:
## Load Goodreads data

#filename = 'book_data_full_isbn.csv'
directory = '../data/'
gr = pd.read_csv(directory + input_file + ".csv")

Some isbn13 numbers are recorded as ASIN (Amazon Standard Identification Number) values. For now, these are being removed in the cleaning step, but it may be a good idea to figure out how to work around this.

### Clean data

#### Cleaning ISBNs

Some ISBN numbers have 'X' as the 13th digit. Apparently this is supposed to belong to only ISBN 10 numbers. It's possible that Open Library has somehow 'constructed' their own ISBN 13 identifiers from existing ISBN 10 numbers when the ISBN 13 identifiers were missing.
Whether or not these should be included will depend on what ISBN values are in the Good Reads data. Currently, any ISBN numbers with alpha digits have been removed from the Goodreads data however. So far, none of them are 13 digit ISBNs that end in X.

In [3]:
## ISBN Numbers

## Change non-valid isbn numbers to "None"
letters = re.compile("[A-Za-z]")
e_12 = re.compile("E\+12")

for i in range(len(gr)):
    if letters.search(gr.isbn13[i]) != None:
        if e_12.search(gr.isbn13[i]) == None:
            gr.loc[i, 'isbn13'] = 'None'

            
## Remove rows with missing (and non-valid) isbn numbers
gr = gr[gr.isbn13 != 'None']
gr.index = range(len(gr))


## Expand ISBN numbers from E+12 format
for i in range(len(gr)):
    gr.loc[i, 'isbn13'] = str(int(pd.to_numeric(gr.loc[i, 'isbn13'])))

---

#### Cleaning Goodreads IDs

The Goodreads IDs scraped from goodreads do not match up with the IDs stored in OL. This code will stay here for now as reference in case something changes and we do want to use this identifier. But otherwise it will currently be commented out.

In [4]:
### Goodreads IDs
#
### Pull out Goodreads ids from 'editions_url' column
#gr_id_patt = re.compile("(?<=/work/editions/)[0-9]+")
#
#id_vec = []
#
#for ed in gr.editions_url:
#    gr_id = gr_id_patt.search(ed)
#    if gr_id != None:
#        id_vec.append(gr_id[0])
#    else:
#        id_vec.append(None)
#        
#gr["gr_id"] = id_vec
#
### Drop missing gr_id rows (for now)
#gr.drop(gr.loc[gr["gr_id"].isnull()].index, inplace = True)

---

## Open Library python package

This can take individual ISBN values (or other identifiers) and return the associated json file.

Issues:
- It is likely slower than using the bulk file since it will have to re-query for each line.
- Any slowness is going to scale up as data increases

Good stuff:
- Some books have multiple ISBNs that are close in values. The bulk json files make it difficult to use these slighlty different ISBN numbers to pull data. The OL package immediately recognizes the similar ISBN values and pulls data accordingly.
- It can pull based on other identifiers *** To test: can we used the Amazon identifiers to get a few more books in there? If so, the above cleaning method will need to be updated to seperate out observations with Amazon IDs.


https://github.com/internetarchive/openlibrary-client/blob/master/olclient/openlibrary.py

In [4]:
from olclient.openlibrary import OpenLibrary

In [5]:
## Example from OL documentation
ol = OpenLibrary()
work = ol.Work.get(u'OL12938932W')
editions = work.editions

In [6]:
## Example format for pulling book info for particular ISBN
isbn_test = ol.Edition.get(isbn = '9781477823835')

### Pull data from Open Library given Goodreads ISBNs

In [21]:
## ISBN numbers from Goodreads dataset
isbn_list = gr.isbn13.unique()

In [13]:
def ol_pull(ibsn_vec, keys = ["genres", "subjects"]):
    '''
    Take list of isbn values and return information from Open Library based on categories given by 'keys.'
    Outputs dataframe with isbn13 and values for each listed key.
    Books without data listed in 'keys' are not included in the output.
    '''
    ol_data = []

    for isbn in isbn_vec:
        book = ol.Edition.get(isbn = isbn)
        if book != None:
            book_dat = [book.json().get(key) for key in keys]
            ol_data.append([isbn] + book_dat)

    return(pd.DataFrame(ol_data, columns = ["isbn13"] + keyvec))

In [8]:
## Dictionary keys to pull
#keyvec = ["isbn_13", "title", "genres", "subjects", "description"]
keyvec = ["genres", "subjects"]

## For each ISBN, pull book data according to 'keyvec' and append to list
ol_data = []

for isbn in isbn_list:
    book = ol.Edition.get(isbn = isbn)
    if book != None:
        book_dat = [book.json().get(key) for key in keyvec]
        ol_data.append([isbn] + book_dat)

## Create dataframe to work with
isbn_df = pd.DataFrame(ol_data, columns = ["isbn13"] + keyvec)

Error retrieving OpenLibrary response: {'target': <function OpenLibrary._get_ol_response at 0x0000019F2E4488B0>, 'args': (<olclient.openlibrary.OpenLibrary object at 0x0000019F2E6BFF10>, '/api/books.json?bibkeys=ISBN:9780590431972'), 'kwargs': {}, 'tries': 5, 'elapsed': 7.480785}
Traceback (most recent call last):
  File "C:\Users\preston\anaconda3\lib\site-packages\backoff\_sync.py", line 94, in retry
    ret = target(*args, **kwargs)
  File "C:\Users\preston\anaconda3\lib\site-packages\olclient\openlibrary.py", line 139, in _get_ol_response
    response.raise_for_status()
  File "C:\Users\preston\anaconda3\lib\site-packages\requests\models.py", line 941, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 500 Server Error: Internal Server Error for url: https://openlibrary.org/api/books.json?bibkeys=ISBN:9780590431972


HTTPError: 500 Server Error: Internal Server Error for url: https://openlibrary.org/api/books.json?bibkeys=ISBN:9780590431972

## Temporary test df

In [19]:
!ls ../output

gr_subjects.csv
subject_feats.csv


In [20]:
isbn_df = pd.read_csv("../output/gr_subjects.csv")

### Convert subject data to features

In [28]:
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer

#### Functions

In [22]:
def naive_feat_sel(df, k = 1):
    '''
    Drop subject features for which there exist k or less books having that subject
    '''
    sub_counts = df.sum(axis = 0)
    to_drop = sub_counts[sub_counts < k].index
    
    return(df.drop(columns = to_drop))

#### Cleaning

In [23]:
## Convert lists of subjects to singular clean string

## Pattern to remove any non-alphabetic characters
letter_patt = re.compile("[^A-Za-z \t]")

## Subject lists
sub_lists = isbn_df.subjects[isbn_df.subjects.notna()]
## Index for books with subjects
sub_index = sub_lists.index

## For each book, combine subject lists into one string and remove punctuation/digits
sub_text = ["".join(l).lower() for l in sub_lists]
sub_text = [re.sub(letter_patt, " ", t) for t in sub_text]

In [29]:
## Stem subjects for each book
lancaster = nltk.stem.LancasterStemmer()

stemmed_subs = []

for phrase in sub_text:
    stems = " ".join(set([lancaster.stem(word) for word in str.split(phrase)]))
    stemmed_subs.append(stems)

sub_clean = pd.Series(stemmed_subs, name = "clean_subjects", index = sub_index)

In [32]:
## Turn each possible subject word into a seperate feature and merge back to previous ISBNs

## Binary counts for each subject word
cv = CountVectorizer(stop_words = "english", binary = True)
cv.fit(sub_clean)
sub_feats = cv.transform(sub_clean)

## Change dense data to normal for sake of merging
col_names = np.sort(list(cv.vocabulary_.keys()))

nondense = pd.DataFrame(sub_feats.todense(), columns = col_names, index = sub_index)

In [34]:
## Assign cleaned subjects to original ISBNs
isbn_clean = isbn_df.join(sub_clean, how = "left")[["isbn13", "clean_subjects"]]

## Assign features to original ISBNs
isbn_feats = isbn_clean.join(nondense, how = "left")

## Remove features with little coverage
isbn_feats_clean = naive_feat_sel(isbn_feats, k = min_k)

### Match Statistics

Note that the following statistics are based on the number of valid ISBNs, not the number of observations in the original dataset. Invalid ISBNs, Amazon IDs, etc. have been excluded from the matching and are subsequently not included here. This upweights these statistics slightly.

In [13]:
## Direct match rate
nomatch = len(isbn_list) - len(isbn_df)
total = len(isbn_list)

print("Unmatched ISBN count:", nomatch)

print("Unmatched ISBN proportion:", round(nomatch / total * 100, 2), "%")

Unmatched ISBN count: 102
Unmatched ISBN proportion: 5.16 %


In [14]:
## Subject/genre rate
val_vec = []

for i in range(len(isbn_df)):
    val = 0
    if isbn_df.genres[i] != None:
        val += 1
    if isbn_df.subjects[i] != None:
        val += 2
#    if isbn_df.description[i] != None:
#        val += 4

    val_vec.append(val)
    
val_vec = np.array(val_vec)

In [15]:
## NOTE: len(val_vec) and total do not match in terms of length!
## That might have to do with unmatched ibsns--double check

In [16]:
noinfo = ((val_vec == 0)).sum()
print("Number of ISBNs with no subject or genre info:", noinfo, "(", round((noinfo / total) * 100, 2), "%)")

Number of ISBNs with no subject or genre info: 887 ( 44.84 %)


In [17]:
genre_info = ((val_vec == 1)).sum()
print("Number of ISBNs with only genre info:", genre_info, "(", round((genre_info / total) * 100, 2), "%)")

Number of ISBNs with only genre info: 0 ( 0.0 %)


In [18]:
sub_info = ((val_vec == 2)).sum()
print("Number of ISBNs with only subject info:", sub_info, "(", round((sub_info / total) * 100, 2), "%)")

Number of ISBNs with only subject info: 774 ( 39.13 %)


In [19]:
both_info = ((val_vec == 3)).sum()
print("Number of ISBNs with genre and subject info:", both_info, "(", round((both_info / total) * 100, 2), "%)")

Number of ISBNs with genre and subject info: 215 ( 10.87 %)


---

### Output merged dataset

In [160]:
isbn_feats_clean.shape

(1998, 348)

In [163]:
isbn_feats_clean.to_csv("../output/" + output_file + ".csv", index = False)

In [166]:
## Also, output a dataframe for every possible subject to decode stemmed features
uniq_subs = set(str.split(" ".join(sub_text)))
uniq_stems = [lancaster.stem(word) for word in uniq_subs]

decoder = pd.DataFrame({"stem":uniq_stems, "word":list(uniq_subs)})
decoder.sort_values(by = ["stem", "word"], inplace = True)

decoder.to_csv("../output/feature_decoder.csv", index = False)

---

## Open Library Bulk Data

---

### Basic look at data formatting

Data can be found here:

https://openlibrary.org/developers/dumps

On account of the dump files being gigantic, they are not currently stored in the Git repository. The following code assumes the 'editions' bulk file has been downloaded and stored in the 'data' directory.

---

### ISBN Merge

In [160]:
## Take ISBN numbers from Goodreads dataset
isbn_list = gr.isbn13.to_list()
isbn_patt = re.compile("(?<=isbn_13\': \[\').{13}")

In [124]:
## Pull appropriate key values from json file
keys = ["isbn_13", "title", "genre", "subjects"]
ol_vec = []

## For each json line, if it contains an ISBN value and that value
 # is in the Goodreads ISBNs, add it to a list
with open('../data/edition_json.txt') as json_file:
    for line in json_file:
        if (re.search('\"isbn_13\"', line) != None): 
            if isbn_patt.search(line) != None:
                isbn_num = isbn_patt.search(line)[0]
                if str(isbn_num) in isbn_list:
                    l = [json.loads(line).get(key) for key in keys]
                    ol_vec.append(l)

In [125]:
## Convert Open Library list to dataframe with properly formatted ISBN numbers
isbn_df = pd.DataFrame(ol_vec, columns = keys)

clean_isbn = []

for l in isbn_df.isbn_13:
    clean_isbn.append(l[0])

isbn_df["isbn13"] = clean_isbn

In [126]:
## Do a simple inner join to make sure things are lined up correctly
merge_check = gr.merge(isbn_df, on = 'isbn13')

In [127]:
## Final merge into original Goodreads dataset
gr_merge = gr.merge(isbn_df, how = 'left', on = 'isbn13')

Again, this method is not currently being used because it is easier to work with the Open Library package rather than the raw data. This is kept here in case the raw files are needed later.

---

### Goodreads ID Merge

Again, this has been shown to not be useful. But it is commented out in case it becomes useful otherwise.

In [78]:
### Take ISBN numbers from Goodreads dataset
#gr_id_list = gr.gr_id.to_list()
#gr_patt = re.compile("(?<=goodreads\": \[\")[0-9]+")
#
### Pull appropriate key values from json file
#keys = ["identifiers", "isbn_13", "title", "genres", "subjects"]
#ol_vec = []
#
#with open('../data/edition_json.txt') as json_file:
#    for line in json_file:
#        if (re.search('\"goodreads\"', line) != None): 
#            if gr_patt.search(line) != None:
#                gr_num = gr_patt.search(line)[0]
#                if gr_num in gr_id_list:
#                    l = [json.loads(line).get(key) for key in keys]
#                    ol_vec.append(l)
##
### Convert open library list to dataframe with properly formatted ISBN numbers
#ol_df = pd.DataFrame(ol_vec, columns = keys)
#
#gr_vec = []
#
#for i in range(len(ol_df)):
#    gr_vec.append(ol_df.identifiers[i]["goodreads"][0])
#    
#ol_df["gr_id"] = gr_vec
##ol_df.drop(columns = "identifiers", inplace = True)
#
#merge_check = gr.merge(ol_df, on = 'gr_id')
##merge_check
#
#gr_merge = gr.merge(ol_df, how = 'left', on = 'gr_id')
##gr_merge

---

## Scratch/checking

This is really just a spot to test out stuff and easily look at the content of the jsons.

In [12]:
## Pull json lines
keys = ["isbn_13", "title", "subjects"]
dat = []

with open('../data/test.txt') as json_file:
    for line in json_file:
        dat.append(line)

In [19]:
## Pull appropriate key values from json file
keys = ["identifiers", "isbn_13", "title", "subjects"]
ol = []

with open('../data/test.txt') as json_file:
    for line in json_file:
        if (re.search('\"goodreads\"', line) != None): 
            if gr_patt.search(line) != None:
                l = [json.loads(line).get(key) for key in keys]
                ol.append(l)

In [14]:
keys = ["isbn_13", "title", "subjects", "source_records"]
dat = []

with open('../data/test.txt') as json_file:
    for line in json_file:
        if re.search(re.compile('subject'), line) != None:
            l = [json.loads(line).get(key) for key in keys]
            dat.append(l)
