In [1]:
import pandas as pd

In [2]:
import pandas as pd

def get_printbook_sales_data(path='sales/printbook_sales.csv'):
    """Load print-book sales and aggregate them to one row per book edition.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the location this notebook has always
        used, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per (isbn13, datepublished) pair with the columns
        isbn13, datepublished, totalrevenue (sum), totalunits (sum) and
        price (mean). Rows with a missing isbn13 or a missing/unparseable
        datepublished are excluded.

    Raises
    ------
    ValueError
        Raised by ``pandas.read_csv`` when one of the required columns is
        absent from the file.
    """
    # Only these columns feed the aggregation. The recorded run of this
    # notebook shows tens of millions of sales rows, so not parsing the
    # unused columns saves a substantial amount of memory.
    needed = ['isbn13', 'datepublished', 'totalrevenue', 'totalunits', 'price']
    sales = pd.read_csv(path, usecols=needed)

    sales = sales.dropna(subset=['isbn13', 'datepublished'])
    sales = sales.assign(
        datepublished=pd.to_datetime(sales['datepublished'], errors='coerce')
    )
    # errors='coerce' turns unparseable dates into NaT; drop them explicitly
    # instead of relying on groupby silently discarding NaT keys.
    sales = sales.dropna(subset=['datepublished'])

    # The file holds periodic sales records; collapse them per edition.
    return sales.groupby(['isbn13', 'datepublished'], as_index=False).agg(
        {'totalrevenue': 'sum', 'totalunits': 'sum', 'price': 'mean'}
    )

def get_advances_data(path='author_advance_dataset.xlsx'):
    """Load the author-advance deals and turn deal-size labels into amounts.

    Parameters
    ----------
    path : str, optional
        Excel workbook to read (its first column is used as the index).
        Defaults to the file this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The workbook's rows that have a non-missing 'Advance', with each
        recognised deal-size label in 'Advance' replaced by the midpoint of
        its bracket. Values that are not one of the labels are left as-is.
    """
    # (low, high) bounds of each deal-size label; the label is replaced by
    # the midpoint of its bracket.
    deal_brackets = {
        'NICE': (1, 49000),
        'VERY NICE': (50000, 99000),
        'GOOD': (100000, 250000),
        'SIGNIFICANT': (251000, 499000),
        'MAJOR': (500000, 750000),
    }
    deals = {label: (low + high) / 2 for label, (low, high) in deal_brackets.items()}

    adv = pd.read_excel(path, index_col=0)
    adv = adv.dropna(subset=['Advance'])

    # replace() on a column of labels can leave an object-dtype column;
    # infer_objects() converts it to a numeric dtype when every remaining
    # value is numeric, which later numeric operations (sum, idxmax) need.
    advance_amounts = adv['Advance'].replace(deals).infer_objects()

    # assign() builds a new frame rather than writing into a filtered slice.
    return adv.assign(Advance=advance_amounts)

def get_printbook_title_data(path='books/booktitle_printbooks_new.csv'):
    """Load print-book titles, keeping the earliest publication per isbn13.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the file this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        One row per isbn13 with the columns isbn13, author, title and
        datepublished (parsed to datetime; unparseable dates become NaT).
        Rows without an isbn13 are not represented, because groupby
        discards missing keys.
    """
    wanted = ['author', 'title', 'isbn13', 'datepublished']
    # usecols skips parsing of unused columns; the trailing [wanted] fixes the
    # column order, which read_csv otherwise takes from the file.
    book = pd.read_csv(path, usecols=wanted)[wanted]
    book = book.assign(
        datepublished=pd.to_datetime(book['datepublished'], errors='coerce')
    )

    # Earliest publication first. A stable sort keeps file order among rows
    # with equal dates, so the row that wins a tie is reproducible.
    book = book.sort_values('datepublished', kind='mergesort')

    # NOTE: GroupBy.first() returns the first *non-null* value of each column,
    # so a missing author/title on the earliest row is filled from a later row
    # of the same isbn13.
    return book.groupby('isbn13').first().reset_index()

def get_merged_printbook_sales_and_booktitles():
    """Attach author and title to the aggregated print-book sales.

    The frames returned by get_printbook_sales_data() and
    get_printbook_title_data() are joined on both 'isbn13' and
    'datepublished' using pandas' default inner join, so a sales row only
    survives when the title table holds the same isbn13 with the same
    publication date.

    Returns the sales columns followed by the title table's remaining
    columns.
    """
    sales_by_edition = get_printbook_sales_data()
    earliest_titles = get_printbook_title_data()
    return sales_by_edition.merge(earliest_titles, on=['isbn13', 'datepublished'])

def get_merged_advances_print_booktitles():
    """Pair every title with every advance deal recorded for the same author.

    Title rows (columns 'author', 'title', 'isbn13', 'datepublished') are
    inner-joined to advance rows on title 'author' == advance 'Author(s)'.
    Because the join key is the author alone, a book is paired with all of
    that author's deals.

    Only pairs where the book was published at least 90 days after the deal
    date ('datepublished' - 'Date' >= 90 days) are returned.
    """
    advances = get_advances_data()
    titles = get_printbook_title_data()

    paired = titles.merge(advances, left_on='author', right_on='Author(s)')

    # Whole days between the deal date and the publication date.
    lead_days = (paired['datepublished'] - paired['Date']).dt.days
    return paired[lead_days >= 90]

def get_full_merged_advances_printbook_sales():
    """Combine sales, titles and advances into one table.

    Inner-joins the sales+titles frame with the advances+titles frame on
    'isbn13' and 'datepublished'. Both inputs carry 'author' and 'title', so
    the result holds them twice with pandas' default '_x' / '_y' suffixes.
    """
    sales_with_titles = get_merged_printbook_sales_and_booktitles()
    advances_with_titles = get_merged_advances_print_booktitles()
    return sales_with_titles.merge(
        advances_with_titles, on=['isbn13', 'datepublished']
    )

def get_preprocessed_full_merge_advance_printbook_sales(full_merge):
    """Reduce the fully merged table to a single advance row per isbn13.

    Steps, in order:
      1. Keep rows whose 'datepublished' is at least 90 days after 'Date'.
      2. For every isbn13 that has at least one row with
         'Rights Category' == 'Fiction', discard its non-Fiction rows.
      3. Of the remaining rows, keep the one with the largest 'Advance' per
         isbn13 (the first such row when several tie).

    Parameters
    ----------
    full_merge : pandas.DataFrame
        Needs the columns 'isbn13', 'datepublished', 'Date',
        'Rights Category' and 'Advance'. It is not modified.
        NOTE(review): the final label-based selection assumes unique index
        labels, as produced by ``pd.merge`` - confirm for other callers.

    Returns
    -------
    pandas.DataFrame
        One row per isbn13, ordered by isbn13, carrying the original index
        labels of the selected rows.
    """
    lead_days = (full_merge['datepublished'] - full_merge['Date']).dt.days
    candidates = full_merge[lead_days >= 90]

    # Vectorised form of "prefer Fiction deals": a row survives when it is a
    # Fiction row, or when its isbn13 has no Fiction row at all. This replaces
    # a per-row loop that rescanned the whole frame on every iteration.
    is_fiction = candidates['Rights Category'] == 'Fiction'
    fiction_isbns = candidates.loc[is_fiction, 'isbn13'].dropna()
    has_fiction_deal = candidates['isbn13'].isin(fiction_isbns)
    candidates = candidates[is_fiction | ~has_fiction_deal]

    # Index label of the largest advance within each isbn13.
    max_advance_idx = candidates.groupby('isbn13')['Advance'].idxmax()
    return candidates.loc[max_advance_idx]

def perform_Dr_Samsun_Strategy_of_adding_advances(full_merge):
    """Add the per-isbn13 total of 'Advance' as 'advance_amount_sum'.

    Every input row is kept (left join on 'isbn13'); the original 'Advance'
    column stays in place and the summed value is appended as the last
    column. The result has a fresh 0..n-1 index.
    """
    advance_totals = full_merge.groupby('isbn13', as_index=False)['Advance'].sum()

    # Both frames carry 'Advance', so the join yields 'Advance_x' (row value)
    # and 'Advance_y' (per-isbn13 total); give them their final names.
    return (
        full_merge
        .merge(advance_totals, on='isbn13', how='left')
        .rename(columns={'Advance_x': 'Advance', 'Advance_y': 'advance_amount_sum'})
    )
        

In [3]:
full_merge = get_full_merged_advances_printbook_sales()

  sales = pd.read_csv('sales/printbook_sales.csv')
  book = pd.read_csv('books/booktitle_printbooks_new.csv')
  book = pd.read_csv('books/booktitle_printbooks_new.csv')


In [4]:
full_merge

Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price,author_x,title_x,author_y,title_y,Rights Category,...,Publishers,Big Publishing House Affilation,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All
0,9.780007e+12,2019-09-24,27984,23,2234.078431,Demi Moore,Inside Out,Demi Moore,Inside Out,Non-fiction,...,"['Jonathan Burnham', 'Harper', 'Jennifer Barth...",Harper,625000.0,,,,,,,"Demi Moore's currently untitled book, chronicl..."
1,9.780007e+12,2020-03-05,2996384,1125,2536.134694,Hilary Mantel,The Mirror and the Light,Hilary Mantel,The Mirror and the Light,International rights,...,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...
2,9.780008e+12,2018-05-31,899,1,899.000000,Steven Camden,It’s About Love,Steven Camden,It’s About Love,International rights,...,"['Nick Lake', 'Harper UK', 'Cathryn Summerhaye...",Harper,375000.0,YES,,,,YES,,"Spoken word poet Steven Camden's TAPE, a debut..."
3,9.780008e+12,2017-07-27,0,0,1216.458333,Josh Malerman,Black Mad Wheel,Josh Malerman,Black Mad Wheel,Fiction,...,"['Michael Braff', 'Del Rey', 'Kristin Nelson',...",,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...
4,9.780008e+12,2017-07-27,0,0,1216.458333,Josh Malerman,Black Mad Wheel,Josh Malerman,Black Mad Wheel,Fiction,...,"['Lee Boudreaux', 'Ecco', 'Emma Coode', 'Voyag...",,375000.0,YES,,,,YES,,The lead singer/songwriter of the rock band Th...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28061,9.798747e+12,2021-05-25,24681,19,1299.000000,Tonya Kappes,Post Mortem: A Mail Carrier Cozy Mystery Book 6,Tonya Kappes,Post Mortem: A Mail Carrier Cozy Mystery Book 6,Audio rights,...,NONE,,24500.5,,,,,,,"Tonya Kappes's FIXIN' TO DIE, in the Kenni Low..."
28062,9.798747e+12,2021-05-25,24681,19,1299.000000,Tonya Kappes,Post Mortem: A Mail Carrier Cozy Mystery Book 6,Tonya Kappes,Post Mortem: A Mail Carrier Cozy Mystery Book 6,Digital Fiction,...,"['Erin George', 'Henery Press']",,24500.5,,,YES,,,,USA Today bestselling author Tonya Kappes's ne...
28063,9.798747e+12,2021-05-01,97951,49,1999.000000,Nicole Williams,When All Else Fails,Nicole Williams,When All Else Fails,Children's,...,"['Phoebe Yeh', ""Crown Children's"", 'Jane Dyste...",,175000.0,,,,,,,"Nicole Williams's BET YOU CAN'T, featuring the..."
28064,9.798748e+12,2021-05-04,31200,24,1300.000000,Karina Halle,Bright Midnight: A Second-Chance Romance,Karina Halle,Bright Midnight: A Second-Chance Romance,Fiction,...,"['Latoya Smith', 'Grand Central', 'Scott Waxma...",,74500.0,,,,YES,,,"Karina Halle's Artists Trilogy, featuring the ..."


In [89]:
p = get_preprocessed_full_merge_advance_printbook_sales(full_merge)

In [90]:
p.shape

(14118, 24)

In [91]:
import pandas as pd
import re

# An initial: one capital letter followed by a period, plus trailing spaces
# (the "Q. " in "John Q. Public").
_INITIAL_RE = re.compile(r'\b[A-Z]\.\s*')
# Anything enclosed in round or square brackets.
_BRACKETED_RE = re.compile(r'[\(\[].*?[\)\]]')
# Titles and generational suffixes, matched as whole words only, together
# with an optional leading comma and an optional trailing period.
_AFFIX_RE = re.compile(
    r'(?:,\s*)?\b(?:Doctor|Professor|Captain|Jr|Sr|III|II|IV)\b\.?'
)


def _strip_name_noise(name):
    """Return `name` without initials, bracketed text, titles and suffixes.

    Non-string values (e.g. missing names) are returned unchanged.
    """
    if not isinstance(name, str):
        return name
    # Drop one-character tokens (bare initials, stray symbols).
    cleaned = ' '.join(token for token in name.split() if len(token) > 1)
    cleaned = _INITIAL_RE.sub('', cleaned)
    cleaned = _BRACKETED_RE.sub('', cleaned)
    # Whole-word match: a plain substring replace of 'Sr' or 'IV' would also
    # cut into real names such as "Sridhar".
    cleaned = _AFFIX_RE.sub('', cleaned)
    return cleaned.lstrip()


def preprocess_names(df, Column_Name):
    """Derive 'First Name' and 'Last Name' columns from a full-name column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the name column. It is modified in place (the two new
        columns are added to it) and also returned.
    Column_Name : str
        Name of the column that contains the full names.

    Returns
    -------
    pandas.DataFrame
        `df` itself, with 'First Name' (text before the first space of the
        cleaned name) and 'Last Name' (last whitespace-separated token of the
        cleaned name). Missing names yield missing first/last names.
    """
    cleaned = df[Column_Name].apply(_strip_name_noise)

    # .str[0] instead of split(expand=True)[0]: identical for non-empty
    # frames, and it does not raise on an empty frame.
    df['First Name'] = cleaned.str.split(' ').str[0]
    df['Last Name'] = cleaned.str.split().str[-1]
    return df

In [92]:
p.shape

(14118, 24)

In [93]:
dx = preprocess_names(p,'author_x')

In [94]:
dx.shape

(14118, 26)

In [101]:
import gender_guesser.detector as gender

d = gender.Detector(case_sensitive=False)
dx['gender'] = dx['First Name'].apply(lambda x : d.get_gender(x))

In [112]:
dg = dx.copy()
dg1 = dx.copy()

In [116]:
from ethnicolr import pred_fl_reg_name

px = pred_fl_reg_name(dg,'Last Name', 'First Name')
dg1 = dg1.merge(px, on=['Last Name', 'First Name'])

KeyError: 'gender'

In [120]:
final_df = dg1[['isbn13_x', 'datepublished_x', 'totalrevenue_x',
               'totalunits_x','price_x', 'author_x_x', 'title_x_x',
               'Rights Category_x', 'Genre_x', 'Date_x', 'Author(s)_x', 'Title_x',
                'Publishers_x', 'Big Publishing House Affilation_x', 'Advance_x',
               'Competition_x', 'Awards_x', 'Bestseller_x', 'Self Publishing_x',
               'Debut_x', 'Series_x', 'All_x', 'First Name', 'Last Name', 'gender_x',
                'asian', 'hispanic', 'nh_black', 'nh_white', 'race'
               ]]

In [124]:
final_df = final_df.rename(columns={
    'isbn13_x': 'isbn13',
    'datepublished_x': 'datepublished', 
    'totalrevenue_x' : 'totalrevenue',
    'totalunits_x':'totalunits',
    'price_x':'price',
    'author_x_x':'author',
    'title_x_x' : 'title',
    'Rights Category_x':'Rights Category', 
    'Genre_x':'Genre', 
    'Date_x': 'Date', 
    'Author(s)_x':'Author(s)',
    'Title_x':'Title',
    'Publishers_x':'Publishers', 
    'Big Publishing House Affilation_x':'Big Publishing House Affilation', 
    'Advance_x':'Advance',
    'Competition_x':'Competition', 
    'Awards_x':'Awards', 
    'Bestseller_x':'Bestseller', 
    'Self Publishing_x':'Self Publishing',
    'Debut_x':'Debut', 
    'Series_x':'Series', 
    'All_x':'All', 
    'gender_x':'gender'
})

In [126]:
final_df.to_csv('full_merge_advance_max_with_ethnicity_and_gender.csv')

In [71]:
eth.to_csv('full_merge_advance_max_with_ethnicity_and_gender.csv')

In [7]:
q= perform_Dr_Samsun_Strategy_of_adding_advances(full_merge)

In [11]:
filtered_q = q[q.groupby('isbn13')['isbn13'].transform('count') > 1]


In [23]:
# Build a small sample of isbn13 values that occur on more than one row of q,
# and collect every row of those books into z for inspection.
n = 20

# True for rows whose isbn13 appears more than once in q.
is_duplicated = q.groupby('isbn13')['isbn13'].transform('count') > 1

# isbn13 of the first n such rows, in row order (repeats included, so the
# printed list matches one line per sampled row).
sampled_isbns = q.loc[is_duplicated, 'isbn13'].head(n)
for isbn in sampled_isbns:
    print(isbn)

# Each sampled book's rows exactly once. The earlier loop appended the whole
# group once per member row (so z held repeated rows) and re-copied z on every
# pd.concat call.
z = q[q['isbn13'].isin(sampled_isbns.unique())]

9780007530090.0
9780007530090.0
9780008152352.0
9780008152352.0
9780008152420.0
9780008152420.0
9780008152420.0
9780008158828.0
9780008158828.0
9780008180065.0
9780008180065.0
9780008180065.0
9780008180089.0
9780008180089.0
9780008180089.0
9780008180119.0
9780008180119.0
9780008180119.0
9780008180119.0
9780008192211.0


In [32]:
z.to_csv('duplicate_isbn13_file.csv')

In [31]:
q[q['isbn13'] ==9780008158828.0].iloc[:,:17]

Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price,author_x,title_x,author_y,title_y,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance
18,9780008000000.0,2018-09-20,271785,213,1327.378698,Kiera Cass,Untitled Standalone Cass,Kiera Cass,Untitled Standalone Cass,Children's,Young Adult Fiction,2013-03-12,Kiera Cass,NONE,"['Erica Sussman', 'Harper Teen', 'Elana Roth',...",Harper,175000.0
19,9780008000000.0,2018-09-20,271785,213,1327.378698,Kiera Cass,Untitled Standalone Cass,Kiera Cass,Untitled Standalone Cass,Children's,Young Adult Fiction,2010-06-22,Kiera Cass,THE SELECTION,"['Erica Sussman', ""Harper Children's"", 'Elana ...",Harper,175000.0


In [92]:
p.to_csv('full_merge_fiction_max_advance_strategy.csv')

In [6]:
print(p.shape)
print(len(p['isbn13'].unique()))

(14118, 24)
14118


In [108]:
q= perform_Dr_Samsun_Strategy_of_adding_advances(full_merge)

In [91]:
q.to_csv('full_merge_advance_sum_strategy.csv')

In [102]:
q = q.rename(columns={'Advance_y':'advance_amount_sum'})

In [121]:
q[q['isbn13']==9780008152420.0].iloc[:,:17]

Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price,author_x,title_x,author_y,title_y,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance
14,9780008000000.0,2020-04-16,74553,55,1433.966851,Mark Lawrence,Holy Sister,Mark Lawrence,Holy Sister,Fiction,Sci-Fi/ Fantasy,2015-06-24,Mark Lawrence,RED SISTER,"['Diana Gill', 'Ace', 'Jane Johnson', 'Voyager...",,375000.0
15,9780008000000.0,2020-04-16,74553,55,1433.966851,Mark Lawrence,Holy Sister,Mark Lawrence,Holy Sister,Fiction,Sci-Fi/ Fantasy,2010-04-08,Mark Lawrence,PRINCE OF THORNS,"['Ginjer Buchanan', 'Roc', 'Ian Drury', 'Sheil...",Harper,74500.0


In [21]:
full_merge = full_merge[(full_merge['datepublished'] - full_merge['Date']).dt.days >=90]


In [59]:
print(full_merge.shape)
print(len(full_merge['isbn13'].unique()))

(24425, 24)
14118


In [25]:
k = 0
for d in full_merge['isbn13']:
    if full_merge[full_merge['isbn13']==d].shape[0] >2:
        print(d)
        k+=1
    if k==5:
        break

9780008152420.0
9780008152420.0
9780008152420.0
9780008180065.0
9780008180065.0


In [27]:
full_merge[full_merge['isbn13']==9780008152420.0].iloc[:, : 17]

Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price,author_x,title_x,author_y,title_y,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance
21,9780008000000.0,2020-04-16,74553,55,1433.966851,Mark Lawrence,Holy Sister,Mark Lawrence,Holy Sister,Digital Fiction,Thriller,2018-11-27,Mark Lawrence,ONE WORD KILL,"['Jack Butler', 'Thomas & Mercer', 'Ian Drury'...",,175000.0
22,9780008000000.0,2020-04-16,74553,55,1433.966851,Mark Lawrence,Holy Sister,Mark Lawrence,Holy Sister,Fiction,Sci-Fi/ Fantasy,2015-06-24,Mark Lawrence,RED SISTER,"['Diana Gill', 'Ace', 'Jane Johnson', 'Voyager...",,375000.0
23,9780008000000.0,2020-04-16,74553,55,1433.966851,Mark Lawrence,Holy Sister,Mark Lawrence,Holy Sister,Fiction,Sci-Fi/ Fantasy,2010-04-08,Mark Lawrence,PRINCE OF THORNS,"['Ginjer Buchanan', 'Roc', 'Ian Drury', 'Sheil...",Harper,74500.0


In [58]:
# For every isbn13 that has at least one 'Fiction' deal, keep only its Fiction
# rows; books without any Fiction deal keep all of their rows.
# Boolean-mask form of the earlier per-row loop, which rescanned the whole
# frame for each row and dropped rows in place.
is_fiction = full_merge['Rights Category'] == 'Fiction'
fiction_isbns = full_merge.loc[is_fiction, 'isbn13'].dropna()
has_fiction_deal = full_merge['isbn13'].isin(fiction_isbns)
full_merge = full_merge[is_fiction | ~has_fiction_deal]
            
        

In [56]:
full_merge

Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price,author_x,title_x,author_y,title_y,Rights Category,...,Publishers,Big Publishing House Affilation,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All
0,9.780007e+12,2019-09-24,27984,23,2234.078431,Demi Moore,Inside Out,Demi Moore,Inside Out,Non-fiction,...,"['Jonathan Burnham', 'Harper', 'Jennifer Barth...",Harper,625000.0,,,,,,,"Demi Moore's currently untitled book, chronicl..."
1,9.780007e+12,2020-03-05,2996384,1125,2536.134694,Hilary Mantel,The Mirror and the Light,Hilary Mantel,The Mirror and the Light,International rights,...,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...
2,9.780008e+12,2018-05-31,899,1,899.000000,Steven Camden,It’s About Love,Steven Camden,It’s About Love,International rights,...,"['Nick Lake', 'Harper UK', 'Cathryn Summerhaye...",Harper,375000.0,YES,,,,YES,,"Spoken word poet Steven Camden's TAPE, a debut..."
5,9.780008e+12,2017-07-27,0,0,1216.458333,Josh Malerman,Black Mad Wheel,Josh Malerman,Black Mad Wheel,Fiction,...,"['Michael Braff', 'Del Rey', 'Kristin Nelson',...",,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...
6,9.780008e+12,2017-07-27,0,0,1216.458333,Josh Malerman,Black Mad Wheel,Josh Malerman,Black Mad Wheel,Fiction,...,"['Lee Boudreaux', 'Ecco', 'Emma Coode', 'Voyag...",,375000.0,YES,,,,YES,,The lead singer/songwriter of the rock band Th...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36745,9.798747e+12,2021-05-25,24681,19,1299.000000,Tonya Kappes,Post Mortem: A Mail Carrier Cozy Mystery Book 6,Tonya Kappes,Post Mortem: A Mail Carrier Cozy Mystery Book 6,Audio rights,...,NONE,,24500.5,,,,,,,"Tonya Kappes's FIXIN' TO DIE, in the Kenni Low..."
36746,9.798747e+12,2021-05-25,24681,19,1299.000000,Tonya Kappes,Post Mortem: A Mail Carrier Cozy Mystery Book 6,Tonya Kappes,Post Mortem: A Mail Carrier Cozy Mystery Book 6,Digital Fiction,...,"['Erin George', 'Henery Press']",,24500.5,,,YES,,,,USA Today bestselling author Tonya Kappes's ne...
36747,9.798747e+12,2021-05-01,97951,49,1999.000000,Nicole Williams,When All Else Fails,Nicole Williams,When All Else Fails,Children's,...,"['Phoebe Yeh', ""Crown Children's"", 'Jane Dyste...",,175000.0,,,,,,,"Nicole Williams's BET YOU CAN'T, featuring the..."
36748,9.798748e+12,2021-05-04,31200,24,1300.000000,Karina Halle,Bright Midnight: A Second-Chance Romance,Karina Halle,Bright Midnight: A Second-Chance Romance,Fiction,...,"['Latoya Smith', 'Grand Central', 'Scott Waxma...",,74500.0,,,,YES,,,"Karina Halle's Artists Trilogy, featuring the ..."


In [60]:
full_merge[full_merge['isbn13']==9780008180065.0].iloc[:, : 17]

Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price,author_x,title_x,author_y,title_y,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance
35,9780008000000.0,2018-06-18,19534,22,886.92,Jay Kristoff,Godsgrave (The Nevernight Chronicle),Jay Kristoff,Godsgrave (The Nevernight Chronicle),Fiction,Sci-Fi/ Fantasy,2014-09-19,Jay Kristoff,NONE,"['Peter Wolverton', 'Thomas Dunne Books', 'Mat...",,175000.0
36,9780008000000.0,2018-06-18,19534,22,886.92,Jay Kristoff,Godsgrave (The Nevernight Chronicle),Jay Kristoff,Godsgrave (The Nevernight Chronicle),Fiction,Sci-Fi/ Fantasy,2011-01-26,Jay Kristoff,STORMDANCER,"['Peter Wolverton', 'Thomas Dunne Books', 'Mat...",,175000.0


In [53]:
x = full_merge[full_merge['isbn13']==9780008180065.0].iloc[:, : 17]

In [54]:
x

Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price,author_x,title_x,author_y,title_y,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance
34,9780008000000.0,2018-06-18,19534,22,886.92,Jay Kristoff,Godsgrave (The Nevernight Chronicle),Jay Kristoff,Godsgrave (The Nevernight Chronicle),Children's,Young Adult Fiction,2016-03-08,Jay Kristoff,LIFEL1K3,"['Melanie Cecka Nolan', ""Knopf Children's"", 'J...",,375000.0
35,9780008000000.0,2018-06-18,19534,22,886.92,Jay Kristoff,Godsgrave (The Nevernight Chronicle),Jay Kristoff,Godsgrave (The Nevernight Chronicle),Fiction,Sci-Fi/ Fantasy,2014-09-19,Jay Kristoff,NONE,"['Peter Wolverton', 'Thomas Dunne Books', 'Mat...",,175000.0
36,9780008000000.0,2018-06-18,19534,22,886.92,Jay Kristoff,Godsgrave (The Nevernight Chronicle),Jay Kristoff,Godsgrave (The Nevernight Chronicle),Fiction,Sci-Fi/ Fantasy,2011-01-26,Jay Kristoff,STORMDANCER,"['Peter Wolverton', 'Thomas Dunne Books', 'Mat...",,175000.0


In [17]:
sales = get_printbook_sales_data()

  sales = pd.read_csv('sales/printbook_sales.csv')


In [21]:
print(len(sales['isbn13']), len(sales['isbn13'].unique()))
print(len(sales['datepublished']), len(sales['datepublished'].unique()))

41445107 194035
41445107 6802


In [25]:
xsales =sales.groupby(['isbn13','datepublished'], as_index=False).agg({'totalrevenue':'sum', 'totalunits':'sum', 'price':'mean'})

In [30]:
adv = get_advances_data()

In [57]:
print(adv.shape)
print(len(adv['Author(s)'].unique()))

(34297, 15)
17796


In [59]:
adv[adv['Author(s)'] =='Josh Malerman']

Unnamed: 0,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All
45936,Fiction,General/ Other,2019-11-11,Josh Malerman,"['SPIN A BLACK YARN', 'GOBLIN', 'A HOUSE AT TH...","['Tricia Narwani', 'Del Rey', 'Kristin Nelson'...",,175000.0,,,YES,,,,NYT-bestselling author of BIRD BOX Josh Malerm...
54187,Fiction,Thriller,2019-03-18,Josh Malerman,"['MALORIE', 'MUSICAL CHAIRS']","['Tricia Narwani', 'Del Rey', 'Kristin Nelson'...",,625000.0,,,YES,,,YES,NYT-bestselling author of BIRD BOX Josh Malerm...
80059,Fiction,Sci-Fi/ Fantasy,2017-03-06,Josh Malerman,UNBURY CAROL,"['Michael Braff', 'Del Rey', 'Kristin Nelson',...",,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...
130491,Fiction,Horror,2012-08-10,Josh Malerman,BIRD BOX,"['Lee Boudreaux', 'Ecco', 'Emma Coode', 'Voyag...",,375000.0,YES,,,,YES,,The lead singer/songwriter of the rock band Th...


In [4]:
book = get_printbook_title_data()

  book = pd.read_csv('books/booktitle_printbooks_new.csv')


In [14]:
book_sorted = book.sort_values('datepublished')
book_filtered = book_sorted.groupby('isbn13').first().reset_index()

In [60]:
book_filtered[book_filtered['author'] =='Josh Malerman']

Unnamed: 0,isbn13,author,title,datepublished
41,9780008000000.0,Josh Malerman,Black Mad Wheel,2017-07-27
9873,9780399000000.0,Josh Malerman,Unbury Carol: A Novel,2018-04-10
9874,9780399000000.0,Josh Malerman,Unbury Carol: A Novel,2019-02-05
16173,9780593000000.0,Josh Malerman,Malorie: A Novel,2020-07-21
16540,9780593000000.0,Josh Malerman,A House at the Bottom of a Lake,2021-01-19
16541,9780593000000.0,Josh Malerman,Goblin: A Novel in Six Novellas,2021-05-18
48352,9781409000000.0,Josh Malerman,Malorie: The much-anticipated Bird Box sequel,2019-10-01
49968,9781433000000.0,Josh Malerman,Unbury Carol (Thorndike Press Large Print Bill...,2018-10-03
63253,9781525000000.0,Josh Malerman,Inspection: A Novel,2019-03-19
63254,9781525000000.0,Josh Malerman,Inspection: A Novel,2019-09-17


In [15]:
print(book.shape, len(book['isbn13'].unique()), len(book_filtered['isbn13']))

(190400, 4) 189725 189724


In [135]:
print(sales.shape, adv.shape, book.shape)

(41445107, 6) (34297, 15) (190400, 4)


In [26]:
merged_sales_book = pd.merge(xsales, book_filtered, on=['isbn13', 'datepublished'])

In [29]:
print(merged_sales_book.shape)
print(len(merged_sales_book['isbn13'].unique()))
print(len(merged_sales_book['datepublished'].unique()))

(187320, 7)
187320
2152


In [28]:
merged_sales_book

# TODO: decide whether revenue should be summed across rows of the same book that were published on different dates.

Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price,author,title
0,9.780007e+12,2006-10-16,0,0,1427.000000,Philippa Gregory,Virgin Earth
1,9.780007e+12,2015-09-15,11184,16,699.000000,Stacy Gregg,Stardust and the Daredevil Ponies (Pony Club S...
2,9.780007e+12,2014-09-02,46773,67,697.647059,Stacy Gregg,Blaze and the Dark Rider (Pony Club Secrets) (...
3,9.780007e+12,2019-07-20,0,0,2000.000000,Starkey David,Monarchy: From the Middle Ages to Modernity
4,9.780007e+12,2015-09-29,3684,5,703.846154,Stacy Gregg,Comet and the Champion’s Cup (Pony Club Secret...
...,...,...,...,...,...,...,...
187315,9.798750e+12,2021-06-18,255813,64,4010.229508,Emily Attached,Mental Health Workbook: 6 Books in 1: The Atta...
187316,9.798750e+12,2021-05-06,0,0,932.333333,Hans Fallada,Little man - What now?
187317,9.798750e+12,2021-05-06,54298,34,1597.000000,Tasha Dixon,Dr. Sebi Smoothies Cleanse Book: The Approved ...
187318,9.798750e+12,2021-05-11,6996,6,1166.000000,Diana Robles Pérez,Vértigo al frío (Spanish Edition)


In [144]:
adv.columns

Index(['Rights Category', 'Genre', 'Date', 'Author(s)', 'Title', 'Publishers',
       'Big Publishing House Affilation', 'Advance', 'Competition', 'Awards',
       'Bestseller', 'Self Publishing', 'Debut', 'Series', 'All'],
      dtype='object')

In [206]:
merged_book_adv = pd.merge(book_filtered, adv, left_on ='author' ,right_on='Author(s)')

In [207]:
merged_book_adv = merged_book_adv[(merged_book_adv['datepublished'] -merged_book_adv['Date']).dt.days >=90 ]
#merged_book_adv = merged_book_adv[(merged_book_adv['datepublished'] -merged_book_adv['Date']).dt.days <=500 ]
#merged_book_adv = merged_book_adv[merged_book_adv['Title'] !='NONE']

In [208]:
merged_book_adv['title'] = merged_book_adv['title'].astype('str')
merged_book_adv['Title'] = merged_book_adv['Title'].astype('str')

In [209]:
merged_book_adv[merged_book_adv['isbn13'] ==9780007466085.0]
#merged_book_adv['isbn13'][20]

Unnamed: 0,isbn13,author,title,datepublished,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All
0,9780007000000.0,Demi Moore,Inside Out,2019-09-24,Non-fiction,Memoir,2010-06-08,Demi Moore,NONE,"['Jonathan Burnham', 'Harper', 'Jennifer Barth...",Harper,625000.0,,,,,,,"Demi Moore's currently untitled book, chronicl..."


In [210]:
merged_book_adv[merged_book_adv['isbn13'] ==9780593156858.0]
merged_book_adv[merged_book_adv['isbn13'] ==9780008363710.0]

Unnamed: 0,isbn13,author,title,datepublished,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All
415,9780008000000.0,Robin Hobb,"Assassin’s Apprentice (The Farseer Trilogy, Bo...",2019-10-03,Audio rights,NONE,2014-03-19,Robin Hobb,NONE,NONE,,24500.5,,,,,,,"Robin Hobb's TAWNY MAN trilogy, part of her Re..."
416,9780008000000.0,Robin Hobb,"Assassin’s Apprentice (The Farseer Trilogy, Bo...",2019-10-03,Fiction,Sci-Fi/ Fantasy,2014-03-15,Robin Hobb,FITZ AND THE FOOL,"['Anne Groell', 'Del Rey', 'Chris Lotts', 'The...",,625000.0,,,,,,,"Robin Hobb's FITZ AND THE FOOL trilogy, revivi..."
417,9780008000000.0,Robin Hobb,"Assassin’s Apprentice (The Farseer Trilogy, Bo...",2019-10-03,International rights,UK Fiction,2013-10-17,Robin Hobb,FITZ AND THE FOOL,"['Jane Johnson', 'Harper UK', 'Chris Lotts', '...",Harper,625000.0,,,,,,,"Robin Hobb's new FITZ AND THE FOOL trilogy, re..."


In [205]:
print(merged_book_adv.shape)
print(len(merged_book_adv['isbn13'].unique()))

(20670, 19)
12287


In [101]:
from fuzzywuzzy import fuzz

# Score each row by comparing its 'title' with its 'Title' using fuzz.ratio.
# Only the two title columns are handed to apply, so each row unpacks straight
# into the scorer's two positional arguments (title first, Title second).
merged_book_adv['title_match_ratio'] = (
    merged_book_adv[['title', 'Title']]
    .apply(lambda pair: fuzz.ratio(*pair), axis=1)
)



In [105]:
# Highest title score observed for each isbn13.
max_scores = (
    merged_book_adv
    .groupby('isbn13', as_index=False)['title_match_ratio']
    .max()
)

# Inner-join back on (isbn13, score) to keep only the row(s) that reach that
# maximum; rows tied for the best score are all retained.
df_filtered = merged_book_adv.merge(
    max_scores,
    how='inner',
    on=['isbn13', 'title_match_ratio'],
)

In [121]:
# Overwrite the score column, this time using fuzz.partial_ratio to compare
# 'title' with 'Title' on each row. Only the two title columns are passed to
# apply, so each row unpacks directly into the scorer's two arguments.
merged_book_adv['title_match_ratio'] = (
    merged_book_adv[['title', 'Title']]
    .apply(lambda pair: fuzz.partial_ratio(*pair), axis=1)
)

In [122]:
# Recompute the per-isbn13 maximum from the current contents of the
# title_match_ratio column.
max_scores = (
    merged_book_adv
    .groupby('isbn13', as_index=False)['title_match_ratio']
    .max()
)

# Inner-join back on (isbn13, score) to keep only the best-scoring row(s);
# ties on the maximum are all retained.
df_filtered = merged_book_adv.merge(
    max_scores,
    how='inner',
    on=['isbn13', 'title_match_ratio'],
)

In [136]:
# Overwrite the score column again, now using fuzz.token_sort_ratio to compare
# 'title' with 'Title' on each row. Only the two title columns are passed to
# apply, so each row unpacks directly into the scorer's two arguments.
merged_book_adv['title_match_ratio'] = (
    merged_book_adv[['title', 'Title']]
    .apply(lambda pair: fuzz.token_sort_ratio(*pair), axis=1)
)

In [137]:
# Per-isbn13 maximum of whatever score currently sits in title_match_ratio.
max_scores = (
    merged_book_adv
    .groupby('isbn13', as_index=False)['title_match_ratio']
    .max()
)

# Inner-join back on (isbn13, score) so only the best-scoring row(s) per
# isbn13 survive; rows tied for the maximum are all kept.
df_filtered = merged_book_adv.merge(
    max_scores,
    how='inner',
    on=['isbn13', 'title_match_ratio'],
)

In [165]:
# Discard best-match rows whose 'Title' is the literal placeholder 'NONE'.
df_filtered = df_filtered.loc[df_filtered['Title'].ne('NONE')]

In [160]:
# Highest title score observed for each isbn13.
max_scores = merged_book_adv.groupby('isbn13')['title_match_ratio'].max().reset_index()

# Merge with the original dataframe to retain the corresponding rows
# (rows tied for the best score are all kept).
df_filtered = pd.merge(merged_book_adv, max_scores, on=['isbn13', 'title_match_ratio'], how='inner')

# This cell rebuilds df_filtered from merged_book_adv, which on a top-to-bottom
# run would throw away the 'NONE' filter applied in the cell directly above.
# Re-apply it here so the result no longer depends on cell execution order.
df_filtered = df_filtered[df_filtered['Title'] != 'NONE']

In [183]:
# Size of the best-match table: (rows, columns), then the number of distinct
# isbn13 values. Equal row and distinct counts mean one row per isbn13.
print(df_filtered.shape)
print(df_filtered['isbn13'].nunique(dropna=False))

(11722, 20)
11722


In [184]:
# Render the best-match table as the cell output for visual inspection.
df_filtered

Unnamed: 0,isbn13,author,title,datepublished,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All,title_match_ratio
1,9.780007e+12,Hilary Mantel,The Mirror and the Light,2020-03-05,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,37
2,9.780008e+12,Hilary Mantel,Mantel Pieces: The New Book from The Sunday Ti...,2020-10-06,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,29
3,9.781251e+12,Hilary Mantel,"Wolf Hall: A Novel (Wolf Hall Trilogy, 1)",2021-05-04,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,29
4,9.781251e+12,Hilary Mantel,Bring Up the Bodies: A Novel (Wolf Hall Trilog...,2021-05-04,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,25
5,9.781433e+12,Hilary Mantel,The Mirror & the Light (Thorndike Press Large ...,2020-04-15,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15813,9.798709e+12,Melissa Moore,KETO DIET FOR WOMEN OVER 50: Learn the Healthi...,2021-02-14,International rights,Non-fiction,2010-12-20,Melissa Moore,SHATTERED SILENCE,"['Asmita Sathe', 'Mehta']",,24500.5,,,,,,,Melissa Moore's SHATTERED SILENCE: The Untold ...,8
15814,9.798712e+12,Michelle Johnson,The New Mediterranean Diet Cookbook 2021: A De...,2021-02-20,Audio rights,NONE,2014-10-01,Michelle Johnson,DIVINITY,NONE,,24500.5,,,,,,,"Michelle Johnson's DIVINITY, book one in The D...",7
15815,9.798713e+12,Jaime Salazar,Legion of the Lost: The true experience of an ...,2021-02-24,Non-fiction,History,2020-04-15,Jaime Salazar,MUTINY OF RAGE,"['Jake Bonar', 'Rowman & Littlefield', 'Letici...",,24500.5,,,,,,,Houston-based immigration lawyer Jaime Salazar...,17
15816,9.798730e+12,Christy Barritt,Rocco: a clean romantic suspense novel (Lanter...,2021-03-29,Fiction,Mystery/ Crime,2012-06-27,Christy Barritt,"['KEY WITNESS', 'LETHAL JUSTICE', 'FINAL JUSTI...","['Tina James', 'Love Inspired', 'Joyce Hart', ...",,24500.5,,,,,,,"Christy Barritt's KEY WITNESS, when a woman st...",27


In [180]:
# Remove every isbn13 that still has more than one candidate row (e.g. ties on
# the best title score): such books cannot be tied to a single deal, so all of
# their rows are dropped rather than picking one arbitrarily.
#
# Done with one vectorised mask instead of re-scanning the whole frame for
# every row while rebinding it inside the loop.
isbn_values = df_filtered['isbn13']

# keep=False flags every member of a duplicated group. NaN keys are excluded
# explicitly because an `== value` comparison never matches NaN, so such rows
# were never treated as duplicates.
is_ambiguous = isbn_values.notna() & isbn_values.duplicated(keep=False)

# Report the index labels being removed, one group per isbn13, in order of
# first appearance.
for _, dropped_rows in df_filtered[is_ambiguous].groupby('isbn13', sort=False):
    print(dropped_rows.index)

df_filtered = df_filtered[~is_ambiguous]


In [187]:
# Two spot checks on the best-match table. A cell only renders its final
# expression, so the ISBN lookup is evaluated but not shown; the displayed
# result is the set of rows whose title score is at least 40.
df_filtered.loc[df_filtered['isbn13'].eq(9780008245009.0)]
df_filtered.loc[df_filtered['title_match_ratio'].ge(40)]

Unnamed: 0,isbn13,author,title,datepublished,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All,title_match_ratio
8,9.780399e+12,Josh Malerman,Unbury Carol: A Novel,2018-04-10,Fiction,Sci-Fi/ Fantasy,2017-03-06,Josh Malerman,UNBURY CAROL,"['Michael Braff', 'Del Rey', 'Kristin Nelson',...",,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...,75
9,9.780399e+12,Josh Malerman,Unbury Carol: A Novel,2019-02-05,Fiction,Sci-Fi/ Fantasy,2017-03-06,Josh Malerman,UNBURY CAROL,"['Michael Braff', 'Del Rey', 'Kristin Nelson',...",,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...,75
10,9.780593e+12,Josh Malerman,Malorie: A Novel,2020-07-21,Fiction,Thriller,2019-03-18,Josh Malerman,"['MALORIE', 'MUSICAL CHAIRS']","['Tricia Narwani', 'Del Rey', 'Kristin Nelson'...",,625000.0,,,YES,,,YES,NYT-bestselling author of BIRD BOX Josh Malerm...,59
11,9.780593e+12,Josh Malerman,A House at the Bottom of a Lake,2021-01-19,Fiction,General/ Other,2019-11-11,Josh Malerman,"['SPIN A BLACK YARN', 'GOBLIN', 'A HOUSE AT TH...","['Tricia Narwani', 'Del Rey', 'Kristin Nelson'...",,175000.0,,,YES,,,,NYT-bestselling author of BIRD BOX Josh Malerm...,71
13,9.781409e+12,Josh Malerman,Malorie: The much-anticipated Bird Box sequel,2019-10-01,Fiction,Thriller,2019-03-18,Josh Malerman,"['MALORIE', 'MUSICAL CHAIRS']","['Tricia Narwani', 'Del Rey', 'Kristin Nelson'...",,625000.0,,,YES,,,YES,NYT-bestselling author of BIRD BOX Josh Malerm...,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15785,9.798618e+12,Juliette Sobanet,Dancing with Paris (City of Light),2019-04-06,Digital Fiction,Romance,2013-01-18,Juliette Sobanet,MIDNIGHT TRAIN TO PARIS,"['Kelli Martin', 'Montlake Romance', 'Kevan Ly...",,24500.5,,,,,,,"Juliette Sobanet's MIDNIGHT TRAIN TO PARIS, th...",55
15786,9.798624e+12,Juliette Sobanet,Midnight Train to Paris (City of Light),2019-04-07,Digital Fiction,Romance,2013-01-18,Juliette Sobanet,MIDNIGHT TRAIN TO PARIS,"['Kelli Martin', 'Montlake Romance', 'Kevan Ly...",,24500.5,,,,,,,"Juliette Sobanet's MIDNIGHT TRAIN TO PARIS, th...",77
15789,9.798682e+12,Lynne Connolly,A Whisper of Treason (The Daring Dersinghams),2020-09-02,Digital Fiction,Romance,2018-06-12,Lynne Connolly,THE GIRL WITH THE PEARL PIN,"['Martin Biro', 'Lyrical Press', 'Jill Marsal'...",,24500.5,,,,,,,"Lynne Connolly's THE GIRL WITH THE PEARL PIN, ...",43
15795,9.798639e+12,Anju Gattani,Duty and Desire (Winds of Fire),2020-05-01,Digital Fiction,General,2017-06-26,Anju Gattani,"['ONCE AND FOR ALL', 'DUTY AND DESIRE', 'NEVER...","['Sharona Wilhelm', 'Scarsdale', 'Bob Diforio'...",,24500.5,,,,,,,"Anju Gattani's ONCE AND FOR ALL, DUTY AND DESI...",60


In [175]:
# Find the smallest score threshold in 0..99 at which the number of rows with
# title_match_ratio >= threshold equals the number of distinct isbn13 values,
# and print it. Nothing is printed if no threshold in that range qualifies.
n_distinct_isbn = df_filtered['isbn13'].nunique(dropna=False)
for threshold in range(100):
    n_rows_at_or_above = int((df_filtered['title_match_ratio'] >= threshold).sum())
    if n_rows_at_or_above == n_distinct_isbn:
        print(threshold)
        break

In [185]:
# Spot check: all best-match rows for a single author.
df_filtered.loc[df_filtered['author'].eq('Hilary Mantel')]

Unnamed: 0,isbn13,author,title,datepublished,Rights Category,Genre,Date,Author(s),Title,Publishers,Big Publishing House Affilation,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All,title_match_ratio
1,9780007000000.0,Hilary Mantel,The Mirror and the Light,2020-03-05,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,37
2,9780008000000.0,Hilary Mantel,Mantel Pieces: The New Book from The Sunday Ti...,2020-10-06,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,29
3,9781251000000.0,Hilary Mantel,"Wolf Hall: A Novel (Wolf Hall Trilogy, 1)",2021-05-04,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,29
4,9781251000000.0,Hilary Mantel,Bring Up the Bodies: A Novel (Wolf Hall Trilog...,2021-05-04,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,25
5,9781433000000.0,Hilary Mantel,The Mirror & the Light (Thorndike Press Large ...,2020-04-15,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",,24500.5,,,,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,16


In [134]:
# Sanity check of token_sort_ratio: a long edition-style title compared with
# the short upper-case title of what looks like the same book.
fuzz.token_sort_ratio('Unbury Carol (Thorndike Press Large Print Bill',"UNBURY CAROL")

42

In [135]:
# Counter-example for the same scorer: the same long edition-style title
# compared with an unrelated title, to see how low a non-match scores.
fuzz.token_sort_ratio('Unbury Carol (Thorndike Press Large Print Bill','BIRD BOX')

15

In [188]:
# Attach the sales figures to every candidate row by joining on the
# (isbn13, datepublished) pair; the default inner join keeps only keys present
# in both tables. Columns shared by both sides get _x / _y suffixes.
full_merge = merged_book_adv.merge(
    merged_sales_book,
    on=['isbn13', 'datepublished'],
)

In [189]:
# Number of distinct ISBNs and distinct authors in the merged table, printed
# on one line separated by a space.
print(*(full_merge[col].nunique(dropna=False) for col in ('isbn13', 'author_y')))

14118 3966


In [53]:
# (rows, columns) of the merged candidate + sales table.
full_merge.shape

(28066, 24)

In [190]:
# Render the merged candidate + sales table as the cell output for inspection.
full_merge

Unnamed: 0,isbn13,author_x,title_x,datepublished,Rights Category,Genre,Date,Author(s),Title,Publishers,...,Self Publishing,Debut,Series,All,title_match_ratio,totalrevenue,totalunits,price,author_y,title_y
0,9.780007e+12,Demi Moore,Inside Out,2019-09-24,Non-fiction,Memoir,2010-06-08,Demi Moore,NONE,"['Jonathan Burnham', 'Harper', 'Jennifer Barth...",...,,,,"Demi Moore's currently untitled book, chronicl...",29,27984,23,2234.078431,Demi Moore,Inside Out
1,9.780007e+12,Hilary Mantel,The Mirror and the Light,2020-03-05,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",...,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,37,2996384,1125,2536.134694,Hilary Mantel,The Mirror and the Light
2,9.780008e+12,Hilary Mantel,Mantel Pieces: The New Book from The Sunday Ti...,2020-10-06,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",...,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,29,636407,323,1910.756863,Hilary Mantel,Mantel Pieces: The New Book from The Sunday Ti...
3,9.781251e+12,Hilary Mantel,"Wolf Hall: A Novel (Wolf Hall Trilogy, 1)",2021-05-04,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",...,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,29,23200,16,1630.033333,Hilary Mantel,"Wolf Hall: A Novel (Wolf Hall Trilogy, 1)"
4,9.781251e+12,Hilary Mantel,Bring Up the Bodies: A Novel (Wolf Hall Trilog...,2021-05-04,International rights,Fiction,2013-08-30,Hilary Mantel,A PLACE OF GREATER SAFETY,"['Sonia Draga', 'Agata Zabowska', 'ANAW Litera...",...,,,,Polish rights to Hilary Mantel's A PLACE OF GR...,25,6396,4,1659.767442,Hilary Mantel,Bring Up the Bodies: A Novel (Wolf Hall Trilog...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28061,9.798730e+12,Christy Barritt,Rocco: a clean romantic suspense novel (Lanter...,2021-03-29,Fiction,Mystery/ Crime,2012-06-27,Christy Barritt,"['KEY WITNESS', 'LETHAL JUSTICE', 'FINAL JUSTI...","['Tina James', 'Love Inspired', 'Joyce Hart', ...",...,,,,"Christy Barritt's KEY WITNESS, when a woman st...",27,67548,52,1299.000000,Christy Barritt,Rocco: a clean romantic suspense novel (Lanter...
28062,9.798730e+12,Christy Barritt,Rocco: a clean romantic suspense novel (Lanter...,2021-03-29,Fiction,Romance,2010-10-07,Christy Barritt,THE LAST TARGET,"['Tina James', 'Steeple Hill', 'Joyce Hart', '...",...,,,,"Christy Barritt's THE LAST TARGET, in which a ...",22,67548,52,1299.000000,Christy Barritt,Rocco: a clean romantic suspense novel (Lanter...
28063,9.798730e+12,Christy Barritt,Rocco: a clean romantic suspense novel (Lanter...,2021-03-29,Fiction,Inspirational,2010-02-25,Christy Barritt,UNDER FIRE,"['Tina James', 'Steeple Hill', 'Joyce Hart', '...",...,,,,"Christy Barritt's UNDER FIRE, as a woman's sta...",11,67548,52,1299.000000,Christy Barritt,Rocco: a clean romantic suspense novel (Lanter...
28064,9.798734e+12,Yuri Elkaim,The Strong60: Become The Person You're Capable...,2021-04-16,Non-fiction,Diet,2014-10-21,Yuri Elkaim,NONE,"['Jennifer Levesque', 'Rodale', 'Celeste Fine'...",...,,,YES,NYT bestselling author of The All-Day Energy D...,13,1895,1,1895.000000,Yuri Elkaim,The Strong60: Become The Person You're Capable...


In [159]:
# Keep only rows where 'datepublished' falls at least 90 whole days after
# 'Date' (presumably the deal date -- confirm). Rows where either date is
# missing produce no day count and are therefore excluded.
days_from_date_to_publication = (full_merge['datepublished'] - full_merge['Date']).dt.days
full_merge_90_days = full_merge.loc[days_from_date_to_publication.ge(90)]

In [164]:
# Write the 90-day-filtered merge to a CSV file, relative to the working directory.
# NOTE(review): 'xxxx.csv' looks like a placeholder file name -- confirm.
full_merge_90_days.to_csv('xxxx.csv')

In [174]:
# Narrow the filtered merge down to the columns needed for the revenue versus
# advance aggregation.
revenue_advance_cols = ['totalrevenue', 'Advance', 'datepublished', 'Date', 'author_x', 'isbn13']
temp = full_merge_90_days.loc[:, revenue_advance_cols]

In [204]:
# Collapse to one row per (datepublished, Date, author_x, isbn13).
# 'totalrevenue' is repeated unchanged on every deal row joined to the same
# book, so summing it inside a group would count the same revenue once per
# matching deal; take the single value instead. 'Advance' is averaged across
# the deals that fall into the same group.
tempx = temp.groupby(['datepublished','Date','author_x','isbn13'], as_index=False).agg({'totalrevenue':'first', 'Advance':'mean'})

In [205]:
# Persist the grouped revenue/advance table as CSV, relative to the working directory.
tempx.to_csv('grouped_full_merged_printbook.csv')

In [200]:
# Number of distinct isbn13 values in the grouped table.
tempx['isbn13'].nunique(dropna=False)

14118