In [1]:
import pandas as pd
import spacy
import re
from fuzzywuzzy import fuzz
# Load the English language model in spaCy
nlp = spacy.load('en_core_web_sm')

In [2]:
def get_advances_data():
    
    adv = pd.read_excel('author_advance_dataset.xlsx', index_col=0)
    adv = adv[~adv['Advance'].isna()]
    deals = {
    'NICE' : (1+49000)/2,
    'VERY NICE' : (50000 + 99000)/2,
    'GOOD' : (100000 + 250000)/2,
    'SIGNIFICANT' : (251000 + 499000)/2,
    'MAJOR' : (500000 + 750000)/2
    }
    adv['Advance'] = adv['Advance'].replace(deals)

    adv.reset_index(inplace=True)
    adv.rename(columns={'index': 'advance_id'}, inplace=True) 
    return adv

def get_printbook_title_data():
    
    book = pd.read_csv('books/booktitle_printbooks_new.csv')
    book = book[['author','title','isbn13', 'datepublished']]
    book['datepublished']  = pd.to_datetime(book['datepublished'], errors = 'coerce')
    #book.rename(columns={'datepublished' : 'datepublished_booktitle_data'})
    
    #Get only the first published date for a given isbn13
    book = book.sort_values('datepublished')
    book = book.groupby('isbn13').first().reset_index()
    
    return book

def get_Ebook_title_data():
    
    book = pd.read_csv('books/booktitle_ebook_new.csv')
    book = book[['author','title','isbn13', 'datepublished']]
    book['datepublished']  = pd.to_datetime(book['datepublished'], errors = 'coerce')
    #book.rename(columns={'datepublished' : 'datepublished_booktitle_data'})
    
    #Get only the first published date for a given isbn13
    book = book.sort_values('datepublished')
    book = book.groupby('isbn13').first().reset_index()
    
    return book

def extract_author_name_1(text):
    doc = nlp(text)
    for entity in doc.ents:
        if entity.label_ == 'PERSON' and ("'s" in entity.text or "'" in entity.text):
            return entity.text
    return None


def extract_author_name_2(text):
    doc = nlp(text)
    for entity in doc.ents:
        if entity.label_ == 'PERSON' :#and ("'s" in entity.text or "'" in entity.text):
            return entity.text
    return None

def author_extraction(adv):
    adv['Author_extracted'] = adv['All'].apply(lambda x: extract_author_name_1(x))
    adv.loc[(adv['Author_extracted'].isna()) & ((adv['Author(s)'] == 'NONE') | (adv['Author(s)'] == 'All')), 'Author_extracted'] = adv.loc[(adv['Author_extracted'].isna()) & ((adv['Author(s)'] == 'NONE') | (adv['Author(s)'] == 'All')), 'All'].apply(lambda x: extract_author_name_2(x))
    adv.loc[(adv['Author_extracted'].isna()) & (adv['Author(s)'] != 'NONE'), 'Author_extracted'] = adv.loc[(adv['Author_extracted'].isna()) & (adv['Author(s)'] != 'NONE'), 'Author(s)'] 
    adv['Author_extracted'] = adv['Author_extracted'].str.replace("'s", "")
    adv['Author_extracted'] = adv['Author_extracted'].str.replace("'", "")
    adv['Author_extracted'] = adv['Author_extracted'].str.lower()
    return adv


def levenshtein_ratio(s1, s2):
    return fuzz.token_sort_ratio(s1, s2)

def get_GAPI_books():
    d1 = pd.read_csv('AuthorCrosswalkScraper/Google_Author_crosswalk_adv.csv')
    d1['Author'] = d1['Author'].astype(str)
    d2 = pd.read_csv('AuthorCrosswalkScraper/Google_Author_crosswalk_adv_17440.csv')
    d2['Author'] = d2['Author'].astype(str)
    d3 = pd.read_csv('AuthorCrosswalkScraper/Unmatched_extraction_of_authors.csv')
    d3['Author'] = d3['Author'].astype(str)
    d4 = pd.read_csv('AuthorCrosswalkScraper/Google_Author_crosswalk_adv_x.csv')
    d4['Author'] = d4['Author'].astype(str)
    d = pd.concat([d1,d2],axis=0)
    d = pd.concat([d,d3], axis=0)
    d = pd.concat([d,d4], axis=0)
    d['Author'] = d['Author'].astype(str)
    d['Author'] = d['Author'].str.replace("[","")
    d['Author'] = d['Author'].str.replace("]","")
    d['Author'] = d['Author'].str.replace("'","")
    d['author_l'] = d['Author'].str.lower()
    d = d.drop(columns='Unnamed: 0')
    d.columns = ['author','title','datepublished','isbn13','author_l']
    
    return d

def merge_book_advances(book, adv):
    
    book_adv = pd.merge(book, adv, left_on='author_l', right_on='Author_extracted')
    book_adv['fuzz_ratio'] = book_adv.apply(lambda x : levenshtein_ratio(str(x['Title']).lower(), str(x['title']).lower()), axis=1)
    
    extract_all_caps = lambda text: ' '.join(re.findall(r'\b[A-Z]{2,}\b', text))
    book_adv['title_extracted'] = book_adv['All'].apply(extract_all_caps)
    
    book_adv = book_adv[~((book_adv['Title'] == 'NONE') & (book_adv['title_extracted']=="")) ]
    book_adv.loc[book_adv['Title'] == 'NONE', 'fuzz_ratio'] = book_adv.loc[book_adv['Title'] == 'NONE'].apply(lambda x : levenshtein_ratio(str(x['title_extracted']).lower(), str(x['title']).lower()), axis=1)
    
    book_adv['datepublished'] = book_adv['datepublished'].astype(str)
    book_adv['datepublished'] = pd.to_datetime(book_adv['datepublished'], errors='coerce')
    book_adv['datepublished'] = book_adv['datepublished'].apply(lambda x: x.date() if pd.notnull(x) and hasattr(x, 'date') else pd.NaT if pd.isnull(x) else pd.to_datetime(x[:10]).date() if len(x) == 10 else pd.to_datetime(x + '-12-31').date() if len(x) == 4 else pd.to_datetime(x + '-01').date() + pd.offsets.MonthEnd() if len(x) == 7 else x)
    '''
    book_adv_grouped = book_adv.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
    book_adv_grouped = book_adv_grouped.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
    '''
    book_adv_grouped = book_adv
    book_adv_grouped['datepublished'] = pd.to_datetime(book_adv_grouped['datepublished'])
    return book_adv_grouped

In [3]:
adv = get_advances_data()

In [4]:
adv = author_extraction(adv)

In [5]:
print(adv.shape, adv['advance_id'].unique().shape, adv['Author_extracted'].unique().shape)

(34297, 17) (34297,) (22843,)


In [6]:
d = get_GAPI_books()

In [7]:
d.columns

Index(['author', 'title', 'datepublished', 'isbn13', 'author_l'], dtype='object')

In [8]:
print(d.shape, d['author_l'].unique().shape, d['isbn13'].unique().shape)

(305952, 5) (139705,) (289390,)


In [9]:
merg = merge_book_advances(d, adv)

In [10]:
merg.columns

Index(['author', 'title', 'datepublished', 'isbn13', 'author_l', 'advance_id',
       'Rights Category', 'Genre', 'Date', 'Author(s)', 'Title', 'Publishers',
       'Big Publishing House Affilation', 'Advance', 'Competition', 'Awards',
       'Bestseller', 'Self Publishing', 'Debut', 'Series', 'All',
       'Author_extracted', 'fuzz_ratio', 'title_extracted'],
      dtype='object')

In [11]:
print(merg.shape, merg['advance_id'].unique().shape, merg['isbn13'].unique().shape, merg['author_l'].unique().shape)

(251464, 24) (22419,) (120868,) (14198,)


In [12]:
f = merg[merg['fuzz_ratio']>=95]
f = f.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
f = f.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])

In [13]:
print(f.shape, f['advance_id'].unique().shape, f['isbn13'].unique().shape, f['author_l'].unique().shape)

(8217, 24) (8217,) (8217,) (7213,)


In [14]:
f

Unnamed: 0,author,title,datepublished,isbn13,author_l,advance_id,Rights Category,Genre,Date,Author(s),...,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All,Author_extracted,fuzz_ratio,title_extracted
0,Susan Arnout Smith,The Timer Game,2008-01-01,9780007265176,susan arnout smith,127148,Fiction,Mystery/ Crime,2012-12-11,Susan Arnout Smith,...,,,,,,,"Susan Arnout Smith's THE TIMER GAME, after giv...",susan arnout smith,100,THE TIMER GAME US
1,Bel Mooney,Small Dogs Can Save Your Life,2010-01-01,9780007318704,bel mooney,163069,International rights,UK Non-fiction,2008-10-23,Bel Mooney,...,YES,,,,,,Daily Mail advice columnist Bel Mooney's SMALL...,bel mooney,100,SMALL DOGS CAN SAVE YOUR LIFE UK US
2,Philip Mould,Sleuth,2011-01-01,9780007319152,philip mould,158567,Non-fiction,True crime,2009-05-29,Philip Mould,...,YES,,,,,,International art dealer and BBC Antiques Road...,philip mould,100,BBC SLEUTH UK
3,Janice Hardy,The Pain Merchants,2010-01-01,9780007326792,janice hardy,165232,Children's,Fantasy,2008-06-26,Janice Hardy,...,YES,,,,YES,,Janice Hardy's debut fantasy trilogy beginning...,janice hardy,100,THE PAIN MERCHANTS
4,Ellen Horan,31 Bond Street,2010-01-01,9780007353040,ellen horan,164724,Fiction,Debut,2008-07-28,Ellen Horan,...,YES,,,,,,"Ellen Horan's 31 BOND STREET, interweaving fic...",ellen horan,100,BOND STREET
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8212,John Coates,The Problem of Twelve,NaT,9798987053546,john coates,150,Non-fiction,Business/ Finance/ Economics,2023-03-10,John Coates,...,,,,,,,Professor of law and economics at Harvard Law ...,john coates,100,THE PROBLEM OF TWELVE WHEN FEW FINANCIAL INSTI...
8213,Ganesh Sitaraman,Why Flying Is Miserable,2023-01-01,9798987053584,ganesh sitaraman,149,Non-fiction,Business/ Finance/ Economics,2023-03-10,Ganesh Sitaraman,...,,,,,,,Law professor at Vanderbilt Law School and aut...,ganesh sitaraman,100,THE CRISIS OF THE MIDDLE CLASS CONSTITUTION TH...
8214,Rain Nox,Animal Charmer,NaT,9798987208359,rain nox,13637,Digital Fiction,Romance,2022-03-24,Rain Nox,...,,,,,YES,,Fronterafest Best of Fest playwright Rain Nox'...,rain nox,100,ANIMAL CHARMER THE GOOD WITCH DOCTOR DOOLITTLE
8215,Jane Anthony,P.S. I Hate You,NaT,9798987262108,jane anthony,2668,International rights,Fiction,2023-01-12,NONE,...,,,,,,,"Jane Anthony's P.S. I HATE YOU, to Queen (Ital...",jane anthony,100,HATE YOU SBR


In [15]:
filtered_merg = merg[((merg['datepublished'] - merg['Date']).dt.days <=1460) & ((merg['datepublished'] - merg['Date']).dt.days >=90) & (merg['fuzz_ratio']>=80)]
filtered_merg = filtered_merg.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
filtered_merg = filtered_merg.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])

In [16]:
print(filtered_merg.shape, filtered_merg['advance_id'].unique().shape, filtered_merg['isbn13'].unique().shape, filtered_merg['author_l'].unique().shape)

(5202, 24) (5202,) (5202,) (4642,)


In [17]:
df = pd.concat([f,filtered_merg], axis=0)
df = df.drop_duplicates()
df = df.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
df = df.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])

In [18]:
print(df.shape, df['advance_id'].unique().shape, df['isbn13'].unique().shape, df['author_l'].unique().shape)

(8730, 24) (8730,) (8730,) (7576,)


In [31]:
rem = merg[(~merg['isbn13'].isin(df['isbn13'])) & (~merg['advance_id'].isin(df['advance_id']))]

In [32]:
rem

Unnamed: 0,author,title,datepublished,isbn13,author_l,advance_id,Rights Category,Genre,Date,Author(s),...,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All,Author_extracted,fuzz_ratio,title_extracted
0,Terry Spear,The Best of Both Wolves,2022-01-01,9781728228822,terry spear,128632,Fiction,Romance,2012-10-15,Terry Spear,...,,,,,,,"Terry Spear's A SEAL WOLF CHRISTMAS, about a c...",terry spear,36,SEAL WOLF CHRISTMAS
1,Terry Spear,Savage Hunger,2012-01-01,9781402266935,terry spear,128632,Fiction,Romance,2012-10-15,Terry Spear,...,,,,,,,"Terry Spear's A SEAL WOLF CHRISTMAS, about a c...",terry spear,29,SEAL WOLF CHRISTMAS
2,Terry Spear,Heart of the Wolf,2008-01-01,9781402233852,terry spear,128632,Fiction,Romance,2012-10-15,Terry Spear,...,,,,,,,"Terry Spear's A SEAL WOLF CHRISTMAS, about a c...",terry spear,53,SEAL WOLF CHRISTMAS
3,Terry Spear,Highland Rake,2012-01-01,9781480259034,terry spear,128632,Fiction,Romance,2012-10-15,Terry Spear,...,,,,,,,"Terry Spear's A SEAL WOLF CHRISTMAS, about a c...",terry spear,29,SEAL WOLF CHRISTMAS
4,Terry Spear,Terry Spear’s Wolf Bundle,2010-01-01,9781402262401,terry spear,128632,Fiction,Romance,2012-10-15,Terry Spear,...,,,,,,,"Terry Spear's A SEAL WOLF CHRISTMAS, about a c...",terry spear,48,SEAL WOLF CHRISTMAS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265863,L. Divine,Drama High: Keep It Movin',2009-01-01,9780758231079,l. divine,168472,Children's,Young Adult Fiction,2008-01-21,L. Divine,...,,,,,,,"L. Divine's DRAMA HIGH, the next eight books i...",l. divine,59,DRAMA HIGH
265864,L. Divine,Drama High: The Fight,2006-01-01,9780758216342,l. divine,168472,Children's,Young Adult Fiction,2008-01-21,L. Divine,...,,,,,,,"L. Divine's DRAMA HIGH, the next eight books i...",l. divine,67,DRAMA HIGH
265865,L. Divine,"Drama High, V19",2020-01-01,9780985736859,l. divine,168472,Children's,Young Adult Fiction,2008-01-21,L. Divine,...,,,,,,,"L. Divine's DRAMA HIGH, the next eight books i...",l. divine,83,DRAMA HIGH
265866,L. Divine,No Mercy,2013-01-01,9780985736811,l. divine,168472,Children's,Young Adult Fiction,2008-01-21,L. Divine,...,,,,,,,"L. Divine's DRAMA HIGH, the next eight books i...",l. divine,22,DRAMA HIGH


In [41]:
rem = d[~d['isbn13'].isin(df['isbn13'])]

In [42]:
counts = rem['author_l'].value_counts()
remx = rem[rem['author_l'].map(counts) == 1]


In [None]:
remx

In [45]:
remx

Unnamed: 0,author,title,datepublished,isbn13,author_l
47,Courtney A. Short,Uniquely Okinawan,2020,9780823288403,courtney a. short
395,"Tammy Donham, Amy Sue Macy, Clyde Philip Rolston",Marketing Recorded Music,2022,9781000585148,"tammy donham, amy sue macy, clyde philip rolston"
396,"John Caldwell Calhoun, Clyde Norman Wilson",The Papers of John C. Calhoun,1959,9780872494831,"john caldwell calhoun, clyde norman wilson"
398,Clyde R. Forsberg Jr.,Divine Rite of Kings,2016,9781443889575,clyde r. forsberg jr.
399,Clyde W. Franklin II,The Changing Definition of Masculinity,2012,9781461327219,clyde w. franklin ii
...,...,...,...,...,...
29972,Debra M Amidon,The Innovation SuperHighway,2007,9781136357343,debra m amidon
29973,Debra Hickenlooper Sowell,Christensen Brothers,2014,9781134422548,debra hickenlooper sowell
29974,Debra Monroe,On the Outskirts of Normal,2015,9780820349114,debra monroe
29975,"Andrew R. Hoehn, Richard H. Solomon, Sonni Efr...",Strategic Choices for a Turbulent World,2017,9780833096937,"andrew r. hoehn, richard h. solomon, sonni efr..."


In [43]:
remx = remx[((remx['datepublished'] - remx['Date']).dt.days <=1460) & ((remx['datepublished'] - remx['Date']).dt.days >=90) & (remx['author_l'].isin(df['author_l']))]

KeyError: 'Date'

In [39]:
print(remx.shape, remx['advance_id'].unique().shape, remx['isbn13'].unique().shape, remx['author_l'].unique().shape)

(0, 24) (0,) (0,) (0,)


In [23]:
final = pd.concat([df, remx], axis = 0)
final = final.drop_duplicates()
final = final.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
final = final.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])

In [24]:
final.columns

Index(['author', 'title', 'datepublished', 'isbn13', 'author_l', 'advance_id',
       'Rights Category', 'Genre', 'Date', 'Author(s)', 'Title', 'Publishers',
       'Big Publishing House Affilation', 'Advance', 'Competition', 'Awards',
       'Bestseller', 'Self Publishing', 'Debut', 'Series', 'All',
       'Author_extracted', 'fuzz_ratio', 'title_extracted'],
      dtype='object')

In [25]:
print(final.shape, final['advance_id'].unique().shape, final['isbn13'].unique().shape, final['author_l'].unique().shape)

(8730, 24) (8730,) (8730,) (7576,)


In [88]:
df.to_csv('true_merge.csv')

In [49]:
#step 1
d = get_GAPI_books()
#step 2
adv = get_advances_data()
adv = author_extraction(adv)
#step 3
merg = merge_book_advances(d, adv)
#step 4
f = merg[merg['fuzz_ratio']>=95]
f = f.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
f = f.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
#step 5
filtered_merg = merg[((merg['datepublished'] - merg['Date']).dt.days <=1460) & ((merg['datepublished'] - merg['Date']).dt.days >=90) & (merg['fuzz_ratio']>=80)]
filtered_merg = filtered_merg.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
filtered_merg = filtered_merg.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])

#step 6
df = pd.concat([f,filtered_merg], axis=0)
df = df.drop_duplicates()
df = df.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
df = df.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])

#step 7
rem = merg[(~merg['isbn13'].isin(df['isbn13'])) & (~merg['advance_id'].isin(df['advance_id']))]
rem = merg[((merg['datepublished'] - merg['Date']).dt.days <=1460) & ((merg['datepublished'] - merg['Date']).dt.days >=90)]
df = pd.concat([df,rem], axis=0)
df = df.drop_duplicates()
df = df.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
df = df.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])

In [54]:
print(df.shape, df['advance_id'].unique().shape, df['isbn13'].unique().shape, df['author_l'].unique().shape)

(15308, 24) (15308,) (15308,) (11810,)


In [55]:
adv[~adv['Author_extracted'].isin(d['author_l'])]

Unnamed: 0,advance_id,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All,Author_extracted
2,7,Fiction,Horror,2023-03-15,Chance Forshee,A TERRIFYING TOME OF TERROR,"['Jennifer Barnes', 'Raw Dog Screaming Press']",,24500.5,,,,,,,"The Curator of Horror, a bookseller, and host ...",chance forshee
3,13,Children's,Picture Book Fiction,2023-03-15,"['Debra Buschman', 'Tom Uleau']",THE KNIGHT & HIS TRUSTY,"['Michele McAvoy', 'The Little Press']",,24500.5,,,,,,,SCBWI Wisconsin Regional Advisor Debra Buschma...,"[debra buschman, tom uleau]"
4,22,Children's,Graphic Novel,2023-03-14,Junepurrr,SUBZERO,"['Desiree Rodriguez', 'Oni Press', 'Britt Sies...",,74500.0,,,,,,,Creator of the webcomic series of the same nam...,junepurrr
5,42,Non-fiction,Food/ Beverage,2023-03-14,Sarah Thrush,PRESERVATION WITH A PURPOSE,"['Dan Rosenberg', 'Harvard Common Press', 'Jan...",,24500.5,YES,,,,,,"Homesteading economist, teacher, and naturalis...",sarah thrush
10,56,Fiction,Debut,2023-03-14,Carinn Jade,THE ASTROLOGY HOUSE,"['Natalie Hallak', 'Atria', 'Claire Friedman',...",,375000.0,YES,,,,,,Pop Fiction Women podcast host Carinn Jade's T...,carinn jade
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34289,168745,Fiction,Inspirational,2008-01-08,Kathleen Y'Barbo,THE BELOVED CAPTIVE,"['Rebecca Germany', 'Barbour', 'Wendy Lawton',...",,24500.5,,,,,,,"Kathleen Y'Barbo's THE BELOVED CAPTIVE, to Reb...",kathleen ybarbo
34291,168759,Fiction,Romance,2008-01-07,Cindy Procter-King,KISS ME AT MIDNIGHT,"['Theresa Stevens', 'Red Sage']",,24500.5,,,,,,,Cindy Procter-King writing as Kate St. James's...,kate st. james
34292,168767,Non-fiction,History,2008-01-07,Stefan Aust,BAADER-MEINHOF,"['Cybele Tom', 'Oxford University Press']",,24500.5,,,,,,,Der Spiegel's Editor-in-Chief Stefan Aust's BA...,der spiegel
34295,168797,Non-fiction,How-To,2008-01-04,"['Linda Meyers', 'John Meyers']",TOSSED & FOUND,"['Dervla Kelly', 'Stewart, Tabori & Chang', 'S...",,24500.5,,,,,,,Interior designers Linda Meyers and John Meyer...,john meyers


In [102]:
adv_eth = author_ethnicity(adv)

In [104]:
adv_eth['Date'] = pd.to_datetime(adv_eth['Date']).dt.strftime('%Y-%m')
adv_eth['Date'] = pd.to_datetime(adv_eth['Date'])

In [107]:
adv_eth

Unnamed: 0,advance_id,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,...,Series,All,Author_extracted,First Name,Last Name,asian,hispanic,nh_black,nh_white,race
0,2,International rights,UK Fiction,2023-03-01,Kynpham Sing Nongkynrih,FUNERAL NIGHTS,"['Stefan Tobler', 'And Other Stories', 'Tara T...",,24500.5,,...,,Indian writer Kynpham Sing Nongkynrih's FUNERA...,kynpham sing nongkynrih,Kynpham,Nongkynrih,0.048560,0.014498,0.018025,0.918917,nh_white
1,7,Fiction,Horror,2023-03-01,Chance Forshee,A TERRIFYING TOME OF TERROR,"['Jennifer Barnes', 'Raw Dog Screaming Press']",,24500.5,,...,,"The Curator of Horror, a bookseller, and host ...",chance forshee,Chance,Forshee,0.002367,0.005476,0.105410,0.886747,nh_white
2,4,International rights,Fiction,2023-03-01,NONE,THE SHADOW MURDERS,"['Fantasy Foundation', 'Ann Huang', 'The Grayh...",,24500.5,,...,,"Jussi Adler-Olsen's THE SHADOW MURDERS, to Fan...",jussi adler-olsen,NONE,NONE,0.003922,0.014060,0.121864,0.860154,nh_white
3,64,International rights,Fiction,2023-03-01,NONE,IFOLGE LOVEN,"['Matthes & Seitz', 'Sophia Hersi Smith', 'Cop...",,24500.5,,...,,"Solvej Balle's IFOLGE LOVEN, to Matthes & Seit...",balle,NONE,NONE,0.003922,0.014060,0.121864,0.860154,nh_white
4,52,International rights,Fiction,2023-03-01,NONE,NONE,"['AST', 'Konstantin Palchikov', 'Sergei Chered...",,24500.5,,...,,Lois McMaster Bujold's World of the Five Gods ...,lois mcmaster bujold,NONE,NONE,0.003922,0.014060,0.121864,0.860154,nh_white
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31912,168736,Non-fiction,Business/ Finance/ Economics,2008-01-01,Stephen Lundin,CATS!,"['Herb Schaffner', 'McGraw-Hill']",,74500.0,,...,,FISH! author Stephen Lundin's CATS!: The Nine ...,stephen lundin,Stephen,Lundin,0.005477,0.026408,0.040033,0.928082,nh_white
31913,168732,Non-fiction,Humor,2008-01-01,Sharon Nichols,I JUDGE YOU WHEN YOU USE POOR GRAMMAR,"['Daniela Rapp', 'Griffin', 'Neil Salkind', 'S...",,24500.5,,...,,Sharon Nichols's I JUDGE YOU WHEN YOU USE POOR...,sharon nichols,Sharon,Nichols,0.000906,0.005719,0.127225,0.866150,nh_white
31914,168725,Non-fiction,Reference,2008-01-01,Sage Cohen,THE WRITING THE LIFE POETIC,"['Jane Friedman', ""Writer's Digest Books"", 'Ma...",,24500.5,,...,,"Sage Cohen's THE WRITING THE LIFE POETIC, an e...",sage cohen,Sage,Cohen,0.023355,0.045515,0.142626,0.788503,nh_white
31915,168759,Fiction,Romance,2008-01-01,Cindy Procter-King,KISS ME AT MIDNIGHT,"['Theresa Stevens', 'Red Sage']",,24500.5,,...,,Cindy Procter-King writing as Kate St. James's...,kate st. james,Cindy,Procter-King,0.005292,0.018177,0.047628,0.928903,nh_white


In [106]:
adv_eth.groupby('Date',as_index=False).agg({'Advance':'sum'})

Unnamed: 0,Date,Advance
0,2008-01-01,14767049.0
1,2008-02-01,17042550.0
2,2008-03-01,13743552.5
3,2008-04-01,23750540.5
4,2008-05-01,16384054.0
...,...,...
178,2022-11-01,18706584.0
179,2022-12-01,16285584.0
180,2023-01-01,18388602.0
181,2023-02-01,18559107.5


In [99]:
adv_eth.to_csv('advances_with_ethnicity.csv')

In [74]:
import gender_guesser.detector as gender

d = gender.Detector(case_sensitive=False)
f1['gender'] = f1['First Name'].apply(lambda x : d.get_gender(x))

In [75]:
f1.to_csv('sales_adv_book_title_merge.csv')

In [77]:
sales['datepublished'] = pd.to_datetime(sales['datepublished']).dt.strftime('%Y-%m')

In [89]:
sales = sales.groupby(['isbn13','datepublished'], as_index=False).agg({'totalrevenue':'sum', 'totalunits':'sum', 'price':'mean'})

In [91]:
sales['datepublished'] = pd.to_datetime(sales['datepublished'])

In [93]:
sales.to_csv('sales_data.csv')

In [118]:
salx = sales.groupby(['datepublished'], as_index=False).agg({'totalrevenue':'sum', 'totalunits':'sum', 'price':'mean'})

In [124]:
salx[(salx['datepublished']>='2000') & (salx['datepublished']<='2024')].to_csv('sales_by_month.csv')

In [132]:
adv['All'][0]

"Indian writer Kynpham Sing Nongkynrih's FUNERAL NIGHTS, an epic novel about the Khasi people of Northeast India, combining folklore, culture and history, and life and death, pitched as the 'Moby Dick of Meghalaya' and about a group of friends witnessing a unique and ancient six-day-long funeral ceremony, the last of its kind, to Stefan Tobler at And Other Stories, with Tara Tobler editing, in a nice deal, for publication in 2024, by Kanishka Gupta at Writer's Side (world ex South Asia)."

In [92]:
test3_1 = test3[~test3.isin(x)]

In [94]:
test3_1[test3_1['fuzz_ratio']>=90]

Unnamed: 0,advance_id,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,...,Series,All,Author_extracted,isbn13,author,title,datepublished,author_l,fuzz_ratio,title_extracted
9,4.0,International rights,Fiction,2023-03-15,NONE,THE SHADOW MURDERS,"['Fantasy Foundation', 'Ann Huang', 'The Grayh...",,24500.5,,...,,"Jussi Adler-Olsen's THE SHADOW MURDERS, to Fan...",jussi adler-olsen,9.781786e+12,Jussi Adler-Olsen,The Shadow Murders,NaT,jussi adler-olsen,100.0,THE SHADOW MURDERS
13,42782.0,International rights,Fiction,2020-02-18,NONE,VICTIM 2117,"['Fantasy Foundation', 'Ping Chang', 'The Gray...",,24500.5,,...,,"Jussi Adler-Olsen's VICTIM 2117, to Fantasy Fo...",jussi adler-olsen,9.781786e+12,Jussi Adler-Olsen,Victim 2117,2020-03-05,jussi adler-olsen,100.0,VICTIM
60,84323.0,Fiction,General/ Other,2016-10-25,Leigh Bardugo,NINTH HOUSE,"['Noah Eaker', 'Flatiron Books', 'Joanna Volpe...",,625000.0,,...,,NYT bestselling author of SIX OF CROWS and CRO...,leigh bardugo,9.781250e+12,Leigh Bardugo,Ninth House,NaT,leigh bardugo,100.0,NYT SIX OF CROWS CROOKED KINGDOM NINTH HOUSE UK
181,137522.0,Fiction,Sci-Fi/ Fantasy,2011-11-20,Lois McMaster Bujold,CAPTAIN VORPATRIL'S ALLIANCE,"['Toni Weisskopf', 'Baen Books', 'Eleanor Wood...",,175000.0,,...,,Lois McMaster Bujold's CAPTAIN VORPATRIL'S ALL...,lois mcmaster bujold,9.781618e+12,Lois McMaster Bujold,Captain Vorpatril's Alliance,NaT,lois mcmaster bujold,100.0,CAPTAIN VORPATRIL ALLIANCE
188,47.0,Fiction,General/ Other,2023-03-14,Paolo Giordano,TASMANIA,"['Judith Gurewich', 'Other Press', 'Marleen Se...",,24500.5,,...,,"Paolo Giordano's TASMANIA, to Judith Gurewich ...",paolo giordano,9.788858e+12,Paolo Giordano,Tasmania,NaT,paolo giordano,100.0,TASMANIA NA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325038,168622.0,Non-fiction,Memoir,2008-01-13,Lynn Reardon,BEYOND THE HOMESTRETCH,"['Jason Gardner', 'New World Library', 'Elise ...",,24500.5,,...,,Lynn Reardon's BEYOND THE HOMESTRETCH: What I'...,lynn reardon,9.781577e+12,Lynn Reardon,Beyond the Homestretch,,lynn reardon,100.0,BEYOND THE HOMESTRETCH
325039,168622.0,Non-fiction,Memoir,2008-01-13,Lynn Reardon,BEYOND THE HOMESTRETCH,"['Jason Gardner', 'New World Library', 'Elise ...",,24500.5,,...,,Lynn Reardon's BEYOND THE HOMESTRETCH: What I'...,lynn reardon,9.781577e+12,Lynn Reardon,Beyond the Homestretch,,lynn reardon,100.0,BEYOND THE HOMESTRETCH
325040,168635.0,International rights,UK Non-fiction,2008-01-11,Bill Giovannetti,HOW TO KEEP YOUR INNER MESS FROM TRASHING YOUR...,"['Tony Collins', 'Lion Hudson Books', 'Janet K...",,24500.5,,...,,Bill Giovannetti's HOW TO KEEP YOUR INNER MESS...,bill giovannetti,9.781854e+12,Bill Giovannetti,How to Keep Your Inner Mess from Trashing Your...,,bill giovannetti,100.0,HOW TO KEEP YOUR INNER MESS FROM TRASHING YOUR...
325047,168703.0,Fiction,General/ Other,2008-01-09,Mary McGarry Morris,THE LAST SECRET,"['John Glusman', 'Shaye Areheart Books', 'Jean...",,625000.0,,...,,NYT bestselling author of Songs in Ordinary Ti...,mary mcgarry morris,9.780307e+12,Mary McGarry Morris,The Last Secret,,mary mcgarry morris,100.0,NYT THE LAST SECRET NA


In [77]:
test33['datepublished'] = pd.to_datetime(test33['datepublished']).dt.date
test33['Date'] = pd.to_datetime(test33['Date']).dt.date

test33['diff_days'] = test33.apply(lambda x : (x['datepublished'] - x['Date']).days,axis=1)
#test33[abs(diff.dt.days) <= pd.Timedelta(days=1460)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test33['datepublished'] = pd.to_datetime(test33['datepublished']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test33['Date'] = pd.to_datetime(test33['Date']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test33['diff_days'] = test33.apply(lambda x : (x['datepublished'] - x['Date

In [85]:
x = test33
y = x.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])

In [86]:
y[y['fuzz_ratio']>=95]

Unnamed: 0,advance_id,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,...,All,Author_extracted,isbn13,author,title,datepublished,author_l,fuzz_ratio,title_extracted,diff_days
81,1099,Children's,Graphic Novel,2023-02-18,Gibson & Rori,WUTHERING HEIGHTS,"['Hannah Dussold', 'Andrews McMeel', 'Janna Mo...",,24500.5,,...,Writer and artist team Gibson & Rori's WUTHERI...,emily bronte,9.781505e+12,Emily Bronte,Wuthering Heights,2020-12-04,emily bronte,100,WUTHERING HEIGHTS,-806
156,2564,International rights,Children's,2023-01-16,NONE,THE LION OF MARS,"['Khai Minh', 'Clare Chi', 'The Grayhawk Agency']",,24500.5,,...,"Jennifer L. Holm's THE LION OF MARS, to Khai M...",jennifer l. holm,9.780593e+12,Jennifer L. Holm,The Lion of Mars,2021-01-05,jennifer l. holm,100,THE LION OF MARS,-741
294,4829,Audio rights,NONE,2022-11-09,Carrie Aarons,WHEN STARS BURN OUT,"['Julie Constantine', 'Podium Audio']",,24500.5,,...,"Carrie Aarons's WHEN STARS BURN OUT, to Julie ...",carrie aarons,9.798712e+12,Carrie Aarons,When Stars Burn Out,2021-02-20,carrie aarons,100,WHEN STARS BURN OUT,-627
400,6412,International rights,Fiction,2022-10-05,NONE,THE SUMMER OF BROKEN RULES,"[""Beatriz D'Oliveira"", 'Rocco', 'Anna Luiza Ca...",,24500.5,YES,...,"K. L. Walther's THE SUMMER OF BROKEN RULES, to...",k. l. walther,9.781728e+12,K. L. Walther,The Summer of Broken Rules,2021-05-04,k. l. walther,100,THE SUMMER OF BROKEN RULES,-519
503,7892,Paperback rights,NONE,2022-08-22,TJ Klune,THE BONES BENEATH MY SKIN,"['Ali Fisher', 'Tor', 'Deidre Knight', 'The Kn...",,625000.0,,...,NYT-bestselling and Lambda Literary Award-winn...,tj klune,9.781732e+12,TJ Klune,The Bones Beneath My Skin,2018-09-27,tj klune,100,NYT TJ THE BONES BENEATH MY SKIN NA,-1425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11140,164807,Film rights,NONE,2008-07-22,Liza Palmer,CONVERSATIONS WITH THE FAT GIRL,"['Christy Fletcher', 'Fletcher & Parry']",,74500.0,,...,"Liza Palmer's CONVERSATIONS WITH THE FAT GIRL,...",liza palmer,9.781539e+12,Liza Palmer,Conversations with the Fat Girl,2019-08-06,liza palmer,100,CONVERSATIONS WITH THE FAT GIRL HBO ROME RWHS,4032
11195,165776,International rights,Fiction,2008-06-03,NONE,NONE,"['Rizzoli', 'Maura Solinas', 'Piergiorgio Nico...",,24500.5,,...,Italian rights to Jonathan Evison's ALL ABOUT ...,jonathan evison,9.781594e+12,Jonathan Evison,All About Lulu,2018-03-13,jonathan evison,100,ALL ABOUT LULU,3570
11290,167088,International rights,Non-fiction,2008-03-28,NONE,NONE,"['New Star', 'Gray Tan', 'Jia-Xi Books']",,24500.5,,...,Simplified Chinese rights to investigative jou...,daniel estulin,9.781634e+12,Daniel Estulin,The True Story of the Bilderberg Group,2018-04-05,daniel estulin,97,THE TRUE STORY OF THE BILDERBERGER GROUP,3660
11300,167246,Fiction,General/ Other,2008-03-20,T. Greenwood,TWO RIVERS,"['Peter Senftleben', 'Kensington', 'Henry Duno...",,24500.5,,...,"T. Greenwood's TWO RIVERS, about a good man wh...",t. greenwood,9.781497e+12,T. Greenwood,Two Rivers,2020-05-26,t. greenwood,100,TWO RIVERS,4450
