In [104]:
import pandas as pd
import math

In [123]:
def process_isbn(isbn):
    if type(isbn) == float:
        isbn = str(isbn)
    if len(isbn) != 10:
        isbn = '0'*(10-len(isbn)) + isbn
    assert len(isbn) == 10
    isbn = str(isbn)[:-1]
    isbn = isbn[3:]
    try:
        int(isbn)
    except:
        print('err', isbn)
    return isbn

In [124]:
df1 = pd.read_csv('dylancastillo/books.csv', usecols = ['isbn10', 'description', 'categories'])
df1.columns = ['isbn', 'description', 'categories']
df1['isbn'] = df1['isbn'].apply(process_isbn)
print(len(df1))
df1.head(3)

6810


Unnamed: 0,isbn,description,categories
0,200588,Fiction,A NOVEL THAT READERS and critics have been eag...
1,226198,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...
2,616383,American fiction,Volume Two of Stephen Donaldson's acclaimed se...


The last digit of the 10-digit ISBN is a checksum. We will drop it.

In [125]:
df2 = pd.read_csv('GoodReads/books.csv', index_col=0)
df2 = df2.drop(['isbn13', 'language_code'], axis=1)
df2['isbn'] = df2['isbn'].apply(process_isbn)
print(len(df2))
df2.head(3)

11127


Unnamed: 0_level_0,title,authors,average_rating,isbn,num_pages,ratings_count,text_reviews_count,publication_date,publisher
bookID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,978596,652,2095690,27591,9/16/2006,Scholastic Inc.
2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,935807,870,2153167,29221,9/1/2004,Scholastic Inc.
4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,955489,352,6333,244,11/1/2003,Scholastic


In [126]:
df = pd.merge(df1, df2, on='isbn', how='right')

In [127]:
# Import bestseller data
bestsellers = pd.read_csv('NYTBest/bestsellers.csv')
bestsellers = bestsellers[['isbn10', 'rank', 'price', 'weeks_on_list', 'title', 'description', 'published_date']]
bestsellers.columns = ['isbn', 'rank', 'price', 'weeks_on_list', 'title', 'description', 'published_date']
# Must have an ISBN to enter this ride
bestsellers = bestsellers.dropna(subset=['isbn'])
bestsellers = bestsellers[~bestsellers['isbn'].str.startswith('isbn10 mu')]
# Convert ISBN to 9 digit standard we expect
bestsellers['isbn'] = bestsellers['isbn'].apply(process_isbn)
bestsellers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57975 entries, 0 to 61429
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn            57975 non-null  object 
 1   rank            57975 non-null  int64  
 2   price           57975 non-null  float64
 3   weeks_on_list   57975 non-null  int64  
 4   title           57975 non-null  object 
 5   description     50288 non-null  object 
 6   published_date  57975 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 3.5+ MB


In [128]:
bestsellers = bestsellers.sort_values(by='weeks_on_list', ascending=False)
bestsellers['published_date'] = pd.to_datetime(bestsellers['published_date'])
bestsellers = bestsellers.drop_duplicates(subset='isbn', keep='first').reset_index(drop = True)
bestsellers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8081 entries, 0 to 8080
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   isbn            8081 non-null   object        
 1   rank            8081 non-null   int64         
 2   price           8081 non-null   float64       
 3   weeks_on_list   8081 non-null   int64         
 4   title           8081 non-null   object        
 5   description     7410 non-null   object        
 6   published_date  8081 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 442.1+ KB


In [129]:
set(bestsellers['isbn']).intersection(set(df2['isbn']))

{'003205',
 '003341',
 '004360',
 '010583',
 '011440',
 '024560',
 '027633',
 '027634',
 '028923',
 '033051',
 '046209',
 '083865',
 '089922',
 '102031',
 '112241',
 '116925',
 '120752',
 '122565',
 '123822',
 '124189',
 '141707',
 '152930',
 '153009',
 '153311',
 '153513',
 '153514',
 '154174',
 '154291',
 '155548',
 '161875',
 '171825',
 '179280',
 '188394',
 '195717',
 '195795',
 '207320',
 '215216',
 '215711',
 '221720',
 '221904',
 '227083',
 '228068',
 '232119',
 '236061',
 '240612',
 '240619',
 '240698',
 '242493',
 '242674',
 '266408',
 '280197',
 '285849',
 '303475',
 '303841',
 '310334',
 '312129',
 '312131',
 '324117',
 '324647',
 '324754',
 '327075',
 '335315',
 '338016',
 '349671',
 '485427',
 '512326',
 '512560',
 '513435',
 '519045',
 '519983',
 '520695',
 '521365',
 '526378',
 '526476',
 '533599',
 '534497',
 '542433',
 '542982',
 '548680',
 '549478',
 '550421',
 '550452',
 '551550',
 '551597',
 '552274',
 '553639',
 '554001',
 '558642',
 '571457',
 '580348',
 '601066',

In [130]:
book_set = pd.merge(df, bestsellers, on='isbn', how='left')
book_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11158 entries, 0 to 11157
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   isbn                11158 non-null  object        
 1   description_x       5698 non-null   object        
 2   categories          5597 non-null   object        
 3   title_x             11158 non-null  object        
 4   authors             11158 non-null  object        
 5   average_rating      11158 non-null  float64       
 6   num_pages           11158 non-null  int64         
 7   ratings_count       11158 non-null  int64         
 8   text_reviews_count  11158 non-null  int64         
 9   publication_date    11158 non-null  object        
 10  publisher           11158 non-null  object        
 11  rank                123 non-null    float64       
 12  price               123 non-null    float64       
 13  weeks_on_list       123 non-null    float64   

In [131]:
book_set

Unnamed: 0,isbn,description_x,categories,title_x,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_date,publisher,rank,price,weeks_on_list,title_y,description_y,published_date
0,978596,Juvenile Fiction,When Harry Potter and the Half-Blood Prince op...,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,652,2095690,27591,9/16/2006,Scholastic Inc.,,,,,,NaT
1,935807,Juvenile Fiction,"In Harry Potter and the Order of the Phoenix, ...",Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,870,2153167,29221,9/1/2004,Scholastic Inc.,3.0,0.0,330.0,HARRY POTTER,A wizard hones his conjuring skills in the ser...,2015-08-02
2,955489,Juvenile Fiction,When the Chamber of Secrets is opened again at...,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,352,6333,244,11/1/2003,Scholastic,,,,,,NaT
3,965548,Juvenile Fiction,"For twelve long years, the dread fortress of A...",Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,435,2339585,36325,5/1/2004,Scholastic Inc.,,,,,,NaT
4,968258,Juvenile Fiction,The first five years of Harry Potter magic are...,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,2690,41428,164,9/13/2004,Scholastic,,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,025441,Fiction,"No stranger to living and writing on the edge,...",Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,512,156,20,12/21/2004,Da Capo Press,,,,,,NaT
11154,011087,Fiction,This comic and surreal novel about the beastli...,You Bright and Risen Angels,William T. Vollmann,4.08,635,783,56,12/1/1988,Penguin Books,,,,,,NaT
11155,013196,,,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,415,820,95,8/1/1993,Penguin Books,,,,,,NaT
11156,087882,,,Poor People,William T. Vollmann,3.72,434,769,139,2/27/2007,Ecco,,,,,,NaT


In [133]:
book_set[~book_set.weeks_on_list.isnull()]

Unnamed: 0,isbn,description_x,categories,title_x,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_date,publisher,rank,price,weeks_on_list,title_y,description_y,published_date
1,935807,Juvenile Fiction,"In Harry Potter and the Order of the Phoenix, ...",Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,870,2153167,29221,9/1/2004,Scholastic Inc.,3.0,0.00,330.0,HARRY POTTER,A wizard hones his conjuring skills in the ser...,2015-08-02
284,112241,Fiction,"""My heart is afraid that it will have to suffe...",The Alchemist,Paulo Coelho/Alan R. Clarke/Özdemir İnce,3.86,197,1631221,55843,5/1/1993,HarperCollins,4.0,0.00,344.0,THE ALCHEMIST,"In this fable, a Spanish shepherd boy ventures...",2015-03-08
286,116925,,,Fullmetal Alchemist Vol. 3 (Fullmetal Alchemi...,Hiromu Arakawa/Akira Watanabe,4.56,192,16666,299,9/13/2005,VIZ Media LLC,4.0,0.00,0.0,STEAL LIKE AN ARTIST,Presenting 10 principles to help readers disco...,2015-04-26
318,542433,Business & Economics,An updated edition of the best-selling guide f...,Think and Grow Rich: The Landmark Bestseller N...,Napoleon Hill,4.18,320,88897,2334,9/1/2005,Tarcherperigee,3.0,50.00,25.0,BUILDING STORIES,This is not your typical graphic novel: this b...,2013-05-05
444,630783,,,Euripides IV: Rhesus / The Suppliant Women / O...,Euripides/David Grene/Richmond Lattimore/Willi...,4.21,307,560,8,11/15/1968,University of Chicago Press,5.0,13.99,3.0,NATIONAL GEOGRAPHIC KIDS ALMANAC 2012,Fascinating facts.,2011-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10734,512560,Fiction,Ninety-something-year-old Jacob Jankowski reme...,Water for Elephants,Sara Gruen,4.09,335,1260027,52759,5/1/2007,Algonquin Books,5.0,13.95,132.0,WATER FOR ELEPHANTS,Distraught after the death of his parents in a...,2011-08-21
10762,819328,,,La regina dei dannati,Anne Rice/Roberta Rambelli,3.89,507,333,18,2/1/1997,TEA,3.0,0.00,1.0,BLOOD VOW,A recruit to the Black Dagger Brotherhood is e...,2016-12-25
10861,310334,Juvenile Fiction,"When Camp Half-Blood, the only safe haven for ...",The Sea of Monsters (Percy Jackson and the Oly...,Rick Riordan,4.24,280,10551,1205,4/1/2007,Disney Hyperion Books,4.0,0.00,176.0,PERCY JACKSON & THE OLYMPIANS,Children of the gods battle monsters of myth.,2010-12-05
10942,033051,Religion,Five volumes on sanctification published under...,The Complete Green Letters,Miles J. Stanford,4.20,336,293,32,1/1/1984,Zondervan,2.0,0.00,1.0,TALON,,2016-03-27
