In [2]:
from fuzzywuzzy import fuzz
import numpy as np
#import dask.dataframe as dd
import re

In [3]:
import pandas as pd

#import dask.dataframe as dd

def get_printbook_sales_data_with_dask():
    
    sales = pd.read_csv('sales/printbook_sales.csv', blocksize=10000)
    sales= sales[~sales['isbn13'].isna()]
    sales = sales[~sales['datepublished'].isna()]
    sales = sales.compute().reset_index(drop=True)
    sales['datepublished']  = pd.to_datetime(sales['datepublished'], errors = 'coerce')
    #sales.rename(columns={'datepublished' : 'datepublished_sales_data'})
    sales = sales[['totalunits', 'price', 'totalrevenue', 'isbn13', 'salesdate','datepublished']]
    
    #This is monthly revenue of books. Aggregating revenue by each book
    sales = sales.groupby(['isbn13','datepublished'], as_index=False).agg({'totalrevenue':'sum', 'totalunits':'sum', 'price':'mean'})
    
    return sales

def preprocess_original_sales_data(salesdata_path):
    #salesdata_path for printbook is = sales/printbook_sales.csv
    #salesdata_path for ebook is = sales/ebook_sales.csv
    sales = pd.read_csv(salesdata_path)
    sales= sales[~sales['isbn13'].isna()]
    sales = sales[~sales['datepublished'].isna()]
    sales.reset_index(drop=True, inplace=True)
    sales['datepublished']  = pd.to_datetime(sales['datepublished'], errors = 'coerce')
    #sales.rename(columns={'datepublished' : 'datepublished_sales_data'})
    sales = sales[['totalunits', 'price', 'totalrevenue', 'isbn13', 'salesdate','datepublished']]
    
    #This is monthly revenue of books. Aggregating revenue by each book
    sales = sales.groupby(['isbn13','datepublished'], as_index=False).agg({'totalrevenue':'sum', 'totalunits':'sum', 'price':'mean'})
    
    return sales

def get_printbook_sales_data():
    
    sales = pd.read_csv('sales/Sales_aggregate_data.csv')
    
    return sales

def get_advances_data():
    
    adv = pd.read_excel('author_advance_dataset.xlsx', index_col=0)
    adv = adv[~adv['Advance'].isna()]
    deals = {
    'NICE' : (1+49000)/2,
    'VERY NICE' : (50000 + 99000)/2,
    'GOOD' : (100000 + 250000)/2,
    'SIGNIFICANT' : (251000 + 499000)/2,
    'MAJOR' : (500000 + 750000)/2
    }
    adv['Advance'] = adv['Advance'].replace(deals)

    adv.reset_index(inplace=True)
    adv.rename(columns={'index': 'advance_id'}, inplace=True) 
    return adv

def get_printbook_title_data():
    
    book = pd.read_csv('books/booktitle_printbooks_new.csv')
    book = book[['author','title','isbn13', 'datepublished']]
    book['datepublished']  = pd.to_datetime(book['datepublished'], errors = 'coerce')
    #book.rename(columns={'datepublished' : 'datepublished_booktitle_data'})
    
    #Get only the first published date for a given isbn13
    book = book.sort_values('datepublished')
    book = book.groupby('isbn13').first().reset_index()
    
    return book

def get_Ebook_title_data():
    
    book = pd.read_csv('books/booktitle_ebook_new.csv')
    book = book[['author','title','isbn13', 'datepublished']]
    book['datepublished']  = pd.to_datetime(book['datepublished'], errors = 'coerce')
    #book.rename(columns={'datepublished' : 'datepublished_booktitle_data'})
    
    #Get only the first published date for a given isbn13
    book = book.sort_values('datepublished')
    book = book.groupby('isbn13').first().reset_index()
    
    return book

def get_merged_printbook_sales_and_booktitles():
    
    '''
    sales colums = ['totalunits', 'price', 'totalrevenue', 'isbn13', 'salesdate','datepublished']
    columns in printbook = ['author', 'title', 'isbn13', 'datepublished']
    '''
    sales = get_printbook_sales_data()
    book = get_printbook_title_data()
    merged_printbook_book = pd.merge(sales, book, on=['isbn13','datepublished'])
    return merged_printbook_book

def get_merged_advances_print_booktitles():
    
    '''
    columns in adv = ['Rights Category', 'Genre', 'Date', 'Author(s)', 'Title', 'Publishers',
       'Big Publishing House Affilation', 'Advance', 'Competition', 'Awards',
       'Bestseller', 'Self Publishing', 'Debut', 'Series', 'All']
       
    columns in printbook = ['author', 'title', 'isbn13', 'datepublished']
    '''
    adv = get_advances_data()
    book = get_printbook_title_data()
    merged_adv_print_book = pd.merge(book, adv , left_on ='author' ,right_on='Author(s)')
    
    #get only the rows whose difference between advance date and pubished date is more than 90 days
    merged_adv_print_book = merged_adv_print_book[(merged_adv_print_book['datepublished'] - merged_adv_print_book['Date']).dt.days >=90]
    return merged_adv_print_book

def get_merged_advances_Ebook_booktitles():
    
    '''
    columns in adv = ['Rights Category', 'Genre', 'Date', 'Author(s)', 'Title', 'Publishers',
       'Big Publishing House Affilation', 'Advance', 'Competition', 'Awards',
       'Bestseller', 'Self Publishing', 'Debut', 'Series', 'All']
       
    columns in printbook = ['author', 'title', 'isbn13', 'datepublished']
    '''
    adv = get_advances_data()
    book = get_Ebook_title_data()
    merged_adv_ebook = pd.merge(book, adv , left_on ='author' ,right_on='Author(s)')
    
    #get only the rows whose difference between advance date and pubished date is more than 90 days
    merged_adv_ebook = merged_adv_ebook[(merged_adv_ebook['datepublished'] - merged_adv_ebook['Date']).dt.days >=90]
    return merged_adv_ebook

def get_full_merged_advances_printbook_sales():
    
    merged_printbooksales_printbooktitles = get_merged_printbook_sales_and_booktitles()
    merged_advances_printbooktitles = get_merged_advances_print_booktitles()
    full_merge = pd.merge(merged_printbooksales_printbooktitles,
                         merged_advances_printbooktitles,
                         on=['isbn13', 'datepublished'])
    return full_merge

def get_preprocessed_full_merge_advance_printbook_sales(full_merge):
    
    #Filter out on if the datepublished and advance date has more than 90 day difference
    full_merge = full_merge[(full_merge['datepublished'] - full_merge['Date']).dt.days >=90]

    
    #get rows for each isbn13
    for d in full_merge['isbn13']:
        fl = full_merge[full_merge['isbn13']==d]
        #check if there are multiple values for that isbn13
        if fl.shape[0] >1:
            #check if fiction is a rights category among those
            if (fl['Rights Category'] == 'Fiction').any():
                #Keep the fiction one and drop the other ones
                full_merge.drop(fl[fl['Rights Category'] != 'Fiction'].index,inplace=True)
    
    #Taking the max of right's category and keep that
    max_advance_idx = full_merge.groupby('isbn13')['Advance'].idxmax()
    full_merge = full_merge.loc[max_advance_idx]    
    
    return full_merge

def perform_Dr_Samsun_Strategy_of_adding_advances(full_merge):
    
    grouped_merge = full_merge.groupby('isbn13', as_index=False).agg({'Advance': 'sum'})
    full_merge = pd.merge(full_merge, grouped_merge, on='isbn13', how='left')
    full_merge = full_merge.rename(columns={'Advance_y':'advance_amount_sum', 'Advance_x': 'Advance'})
    return full_merge


def levenshtein_ratio(s1, s2):
    return fuzz.token_sort_ratio(s1, s2)

def preprocess_based_on_title_Levenshtien_distance(merge_book_adv):

    merge_book_adv['fuzz_ratio'] = merge_book_adv.apply(lambda x: levenshtein_ratio(str(x['title']).lower(), str(x['Title']).lower()), axis=1)
    #max_ratio=merge_book_adv.groupby('advance_id', as_index=False)['fuzz_ratio'].max()
    #df = pd.merge(max_ratio, merge_book_adv, on=['advance_id','fuzz_ratio'])

    return merge_book_adv

def pipeline(book_adv, sales):
    #book_adv = get_merged_advances_print_booktitles()
    #sales = pd.read_csv(sales_data_csv_path, index_col=0)
    #get only books which has date difference of greater than 90 days and less than 4 years
    book_adv = book_adv[((book_adv['datepublished'] - book_adv['Date']).dt.days >=90) & (((book_adv['datepublished'] - book_adv['Date']).dt.days <=1460))]

    #get Levenshtiens distances for title match
    book_adv['fuzz_ratio'] = book_adv.apply(lambda x: levenshtein_ratio(str(x['title']).lower(), str(x['Title']).lower()), axis=1)

    #get books with highest match on titles
    #book_adv = book_adv.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
    book_adv = book_adv.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
    #get books with least difference in publisheddate and advance date
    book_adv = book_adv.groupby('isbn13', as_index=False).apply(lambda x: x.loc[(x['datepublished'] - x['Date']).dt.days.idxmin()])
    book_adv['diff_days'] = (book_adv['datepublished'] - book_adv['Date']).dt.days
    print(f'After preprocessing of merged book_adv, the count is {book_adv.shape}')
    sales['datepublished'] = pd.to_datetime(sales['datepublished'])
    #sales['Date'] = pd.to_datetime(sales['Date'])
    #sales['diff_days'] = (sales['datepublished'] - sales['Date']).dt.days
    """
    full_merge = pd.merge(sales, book_adv , left_on=['isbn13','datepublished','Date','Title','Genre',
                                                     'Big Publishing House Affilation','Rights Category',
                                                     'Author(s)','Publishers','All'], 
                          right_on=['isbn13','datepublished','Date','Title',
                                    'Genre','Big Publishing House Affilation','Rights Category',
                                    'Author(s)','Publishers','All'])
    """
    full_merge = pd.merge(sales, book_adv, on=['isbn13','datepublished'])
    """
    full_merge = full_merge[['isbn13', 'datepublished', 'totalrevenue', 'totalunits', 'price',
        'Rights Category',
       'Genre', 'Date', 'Author(s)', 'Title', 'Publishers',
       'Big Publishing House Affilation', 'Competition_x',
       'Awards_x', 'Bestseller_x', 'Self Publishing_x', 'Debut_x', 'Series_x',
       'All', 'advance_amount_sum', 'author', 'title', 'advance_id',
       'fuzz_ratio', 'diff_days','max_ratio']]
    full_merge = full_merge.rename(columns={
    'Competition_x': 'Competition' ,
    'Awards_x' : 'Awards', 
    'Bestseller_x': 'Bestseller', 
    'Self Publishing_x':'Self Publishing', 
    'Debut_x': 'Debut', 
    'Series_x': 'Series'


    })"""
    print(f"full_merge.shape - {full_merge.shape[0]} ,full_merge['isbn13'].unique() - {full_merge['isbn13'].unique().shape[0]}, full_merge['advance_id'].unique() -{full_merge['advance_id'].unique().shape[0]}  ")
    assert(full_merge.shape[0] == full_merge['isbn13'].unique().shape[0] == full_merge['advance_id'].unique().shape[0])
    return full_merge



def Author_fuzzy_match(adv, book):
    adv_data = adv['Author(s)'].unique()
    book_data = book['Author'].unique()

    data = []

    for ad in adv_data:
        st=''
        max_ratio = 0
        for bk in book_data:
            ratio = fuzz.ratio(ad, bk)
            if ratio > max_ratio:
                max_ratio = ratio
                st = bk
        data.append([st, ad, max_ratio])
    df = pd.DataFrame(data, columns=['author', 'Author(s)', 'max_ratio'])
    
    return df


def get_author_fuzzy_match_data_between_advances_and_booktitle():
    df = pd.read_csv('author_fuzzy_matches.csv', index_col=0)
    '''
    df = df.rename(columns={
    'author':'Author(s)',
    'Author(s)': 'author'
    })
    '''
    return df

In [4]:
def author_ethnicity(GSA):
    from ethnicolr import pred_fl_reg_name
    d = GSA.copy()
    dg = preprocess_names(d, 'author')
    d1 = dg.copy()
    eth = pred_fl_reg_name(dg,'Last Name', 'First Name')
    d1 = d1.merge(eth, on=[
           'First Name', 'Last Name'])

    d1 = d1.drop(d1.filter(like='_y').columns, axis=1)
    d1.columns = [col.replace('_x', '') for col in d1.columns]
    return d1

def Google_Author_sales_advance_merge():
    adv = get_advances_data()
    adv = adv[(adv['Author(s)'] != 'NONE')]
    adv_14 = adv[(adv['Date'] >='2014')]
    
    GAuthors = pd.read_csv('AuthorCrosswalkScraper/Google_Author_crosswalk_exact.csv', index_col=0)
    rwords = ['[',']','"',"'"]
    for word in rwords:
        GAuthors.loc[:, 'Author'] = GAuthors['Author'].str.replace(word,"")
    GAuthors = GAuthors[~GAuthors['PublishedDate'].isin(['17-1', '101-', '200?', '19??', '199?'])]
    GAuthors['PublishedDate'] = pd.to_datetime(GAuthors['PublishedDate'], format='%Y')
    G17 = GAuthors.groupby('Author', as_index=False).apply(lambda x: x.loc[x['PublishedDate'].idxmin()])#[GAuthors['PublishedDate'] >='2017']
    G17 = G17.rename(columns={
    'Author':'author',
    'Title':'title',
    'PublishedDate':'datepublished',
    'ISBN13':'isbn13'
    })
    
    Gsales = pd.merge(G17,sales,on='isbn13')
    GSA = pd.merge(Gsales, adv_14, left_on='author', right_on='Author(s)')
    GSA['fuzz_ratio'] = GSA.apply(lambda x: levenshtein_ratio(str(x['title']).lower(), str(x['Title']).lower()), axis=1)
    GSA = GSA.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
    GSA = GSA.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
    
    d1 = author_ethnicity(GSA)
    d1['diff_days'] = (d1['datepublished'] - d1['Date']).dt.days
    d1 = d1[(d1['diff_days'] >=90) & (d1['diff_days'] <=1460)]
    return d1
    

In [48]:
adv = get_advances_data()
book = get_printbook_title_data()
sales = pd.read_csv('sales/Sales_aggregate_data.csv')
book_adv = pd.merge(book, adv, left_on='author', right_on='Author(s)')

  book = pd.read_csv('books/booktitle_printbooks_new.csv')


In [75]:

book_adv['datepublished'] = pd.to_datetime(book_adv['datepublished'])
book_adv['diff_days'] = (book_adv['datepublished']-book_adv['Date']).dt.days
book_adv= book_adv[~book_adv['datepublished'].isna()]
book_adv = book_adv[(book_adv['diff_days'] >=90) & (book_adv['diff_days'] <=1460) ]
book_adv['fuzz_ratio'] = book_adv.apply(lambda x: levenshtein_ratio(str(x['title']).lower(), str(x['Title']).lower()), axis=1)
sales['datepublished'] = pd.to_datetime(sales['datepublished'])

In [76]:
print(book_adv.shape, book_adv['advance_id'].unique().shape, book_adv['isbn13'].unique().shape)

(11719, 22) (3600,) (8204,)


In [97]:
book_adv[book_adv['author'] == 'Josh Malerman']

Unnamed: 0,isbn13,author,title,datepublished,advance_id,Rights Category,Genre,Date,Author(s),Title,...,Advance,Competition,Awards,Bestseller,Self Publishing,Debut,Series,All,diff_days,fuzz_ratio
9,9780008000000.0,Josh Malerman,Black Mad Wheel,2017-07-27,80059,Fiction,Sci-Fi/ Fantasy,2017-03-06,Josh Malerman,UNBURY CAROL,...,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...,143,22
13,9780399000000.0,Josh Malerman,Unbury Carol: A Novel,2018-04-10,80059,Fiction,Sci-Fi/ Fantasy,2017-03-06,Josh Malerman,UNBURY CAROL,...,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...,400,75
17,9780399000000.0,Josh Malerman,Unbury Carol: A Novel,2019-02-05,80059,Fiction,Sci-Fi/ Fantasy,2017-03-06,Josh Malerman,UNBURY CAROL,...,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...,701,75
19,9780593000000.0,Josh Malerman,Malorie: A Novel,2020-07-21,45936,Fiction,General/ Other,2019-11-11,Josh Malerman,"['SPIN A BLACK YARN', 'GOBLIN', 'A HOUSE AT TH...",...,175000.0,,,YES,,,,NYT-bestselling author of BIRD BOX Josh Malerm...,253,28
20,9780593000000.0,Josh Malerman,Malorie: A Novel,2020-07-21,54187,Fiction,Thriller,2019-03-18,Josh Malerman,"['MALORIE', 'MUSICAL CHAIRS']",...,625000.0,,,YES,,,YES,NYT-bestselling author of BIRD BOX Josh Malerm...,491,59
21,9780593000000.0,Josh Malerman,Malorie: A Novel,2020-07-21,80059,Fiction,Sci-Fi/ Fantasy,2017-03-06,Josh Malerman,UNBURY CAROL,...,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...,1233,30
23,9780593000000.0,Josh Malerman,A House at the Bottom of a Lake,2021-01-19,45936,Fiction,General/ Other,2019-11-11,Josh Malerman,"['SPIN A BLACK YARN', 'GOBLIN', 'A HOUSE AT TH...",...,175000.0,,,YES,,,,NYT-bestselling author of BIRD BOX Josh Malerm...,435,71
24,9780593000000.0,Josh Malerman,A House at the Bottom of a Lake,2021-01-19,54187,Fiction,Thriller,2019-03-18,Josh Malerman,"['MALORIE', 'MUSICAL CHAIRS']",...,625000.0,,,YES,,,YES,NYT-bestselling author of BIRD BOX Josh Malerm...,673,30
25,9780593000000.0,Josh Malerman,A House at the Bottom of a Lake,2021-01-19,80059,Fiction,Sci-Fi/ Fantasy,2017-03-06,Josh Malerman,UNBURY CAROL,...,175000.0,YES,YES,,,,,Bram Stoker Award-nominee Josh Malerman's UNBU...,1415,19
27,9780593000000.0,Josh Malerman,Goblin: A Novel in Six Novellas,2021-05-18,45936,Fiction,General/ Other,2019-11-11,Josh Malerman,"['SPIN A BLACK YARN', 'GOBLIN', 'A HOUSE AT TH...",...,175000.0,,,YES,,,,NYT-bestselling author of BIRD BOX Josh Malerm...,554,40


In [98]:
final = book_adv.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
final = final.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['datepublished'].idxmin()])

In [99]:
print(final.shape, final['advance_id'].unique().shape, final['isbn13'].unique().shape)

(3141, 22) (3141,) (3141,)


In [116]:
ff = pd.merge(final[final['fuzz_ratio']>=1], sales, on=['isbn13','datepublished'])

In [117]:
print(ff.shape, ff['isbn13'].unique().shape)

(3127, 26) (3127,)


In [120]:
sb = pd.merge(sales, book, on=['isbn13','datepublished'])

In [121]:
print(sb.shape, sb['isbn13'].unique().shape)

(187320, 8) (187320,)


In [124]:
ab = pd.merge(book, adv, left_on='author', right_on='Author(s)')

In [133]:
sales.columns

Index(['Unnamed: 0', 'isbn13', 'datepublished', 'totalrevenue', 'totalunits',
       'price'],
      dtype='object')

In [138]:
print(ab.shape, ab['isbn13'].unique().shape, ab['advance_id'].unique().shape, ab['Author(s)'].unique().shape)

(37133, 20) (17275,) (7921,) (4765,)


In [136]:
sab =pd.merge(sales,ab,  on=['isbn13','datepublished'] )

In [140]:
print(sab.shape, sab['isbn13'].unique().shape, sab['advance_id'].unique().shape, sab['Author(s)'].unique().shape)

(36750, 24) (17066,) (7880,) (4743,)


In [142]:
sba = pd.merge(sb, adv, left_on='author', right_on='Author(s)')

In [143]:
print(sba.shape, sba['isbn13'].unique().shape, sba['advance_id'].unique().shape, sba['Author(s)'].unique().shape)

(36750, 24) (17066,) (7880,) (4743,)


In [144]:
sba['diff_days'] = (sba['datepublished']-sba['Date']).dt.days

In [148]:
fsba = sba[(sba['diff_days'] >=90) & (sba['diff_days'] <=1460)]

In [149]:
print(fsba.shape, fsba['isbn13'].unique().shape, fsba['advance_id'].unique().shape, fsba['Author(s)'].unique().shape)

(11685, 25) (8184,) (3590,) (2615,)


In [151]:
filtered_df = sba[sba[['isbn13', 'datepublished', 'advance_id']].apply(tuple, axis=1).duplicated(keep=False)]


In [154]:
print(book.shape, book['isbn13'].unique().shape, book['author'].unique().shape)

(189724, 4) (189724,) (93740,)


In [162]:
adv= adv[adv['Date'] >='2014']

In [163]:
print(adv.shape, adv['advance_id'].unique().shape, adv['Author(s)'].unique().shape)

(22839, 16) (22839,) (12076,)


In [158]:
print(sales.shape, sales['isbn13'].unique().shape)

(242495, 6) (194029,)


In [159]:
for s in sales['isbn13']:
    if sales[sales['isbn13']==s].shape[0] >1:
        print(s)
        break

9780007245161.0


In [160]:
sales[sales['isbn13']==9780007245161]

Unnamed: 0.1,Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price
3,3,9780007000000.0,2015-05-21,62910,112,555.664921
4,4,9780007000000.0,2015-09-15,11184,16,699.0
5,5,9780007000000.0,2020-08-20,20271,29,699.0


In [111]:
mbb = pd.merge(book_adv, sales, on=['isbn13','datepublished'])

In [20]:
dfg = pipeline(book_adv=book_adv, sales=sales)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_adv['fuzz_ratio'] = book_adv.apply(lambda x: levenshtein_ratio(str(x['title']).lower(), str(x['Title']).lower()), axis=1)


After preprocessing of merged book_adv, the count is (242, 22)
full_merge.shape - 240 ,full_merge['isbn13'].unique() - 240, full_merge['advance_id'].unique() -240  


In [47]:
#dfg[(dfg['fuzz_ratio']==100)]
(dfg['diff_days'].median())

657.5

In [11]:
xc[xc['Debut']=="YES"][['author','title','Title']mean

Unnamed: 0,author,title,Title
11,Joanna Glen,The Flight of Augusta Hope,THE FLIGHT OF AUGUSTA HOPE
32,Rebecca Podos,Like Water,THE MYSTERY OF HOLLOW PLACES
48,Kathryn Purdie,Crystal Blade (Burning Glass),AURASEER
49,Elizabeth Bonesteel,Breach of Containment: A Central Corps Novel,THE COLD BETWEEN
51,Elaine Vickers,Paper Chains,LOST AND FOUND
...,...,...,...
3042,J.D. Barker,"The Fourth Monkey - Geboren, um zu töten: Thri...",THE FOURTH MONKEY
3050,Ali Land,"Niña buena, niña mala / Good Me Bad Me (Spanis...","GOOD ME, BAD ME"
3087,AJ Vanderhorst,The Mostly Invisible Boy (Casey Grimes),THE MOSTLY INVISIBLE BOY
3088,Tom Deady,Of Men and Monsters,THE CLEARING


In [154]:
xg = Google_Author_sales_advance_merge()

In [166]:
xg.to_csv('gmerge.csv')

In [142]:
GAuthors = pd.read_csv('AuthorCrosswalkScraper/Google_Author_crosswalk_exact.csv', index_col=0)
rwords = ['[',']','"',"'"]
for word in rwords:
    GAuthors.loc[:, 'Author'] = GAuthors['Author'].str.replace(word,"")
GAuthors = GAuthors[~GAuthors['PublishedDate'].isin(['17-1', '101-', '200?', '19??', '199?'])]
GAuthors['PublishedDate'] = pd.to_datetime(GAuthors['PublishedDate'], format='%Y')

In [144]:
debut_authors = GAuthors.groupby('Author', as_index=False).apply(lambda x: x.loc[x['PublishedDate'].idxmin()])

In [161]:
debut_authors.iloc[150:170,:]

Unnamed: 0,Author,Title,PublishedDate,ISBN13
150,Abraham Lee,The Yellow Footprints,2020-01-01,9781645303091
151,Abraham Lee Pease,Winter Wanderings; Being an Account of Travels...,2011-01-01,9781446088890
152,Abraham Lincoln,Lincoln's Gettysburg Address,2013-01-01,9780807545515
153,"Abraham Lincoln, Simpson Matthew 1811-1884, Os...",The Lincoln Memorial,2015-01-01,9781345665161
154,Abrams,The Essential,1999-01-01,9780810958029
155,"Accu-Weather, Inc. Staff, Anc Staff Accu-Weath...",Meteorology,1994-01-01,9780801672279
156,Ace Collins,Stories Behind the Greatest Hits of Christmas,2010-01-01,9781458724571
157,"Achor, Shawn",Por Trás da Felicidade,2020-01-01,9788557170186
158,Ada Hoffmann,Monsters in My Mind,2017-01-01,9781945955082
159,Ada Palmer,Reading Lucretius in the Renaissance,2014-01-01,9780674967083


In [139]:
fgh.groupby('Author', as_index=False).apply

Index(['Author', 'Title', 'PublishedDate', 'ISBN13'], dtype='object')

In [131]:
sales = get_printbook_sales_data()

In [133]:
sales[sales['isbn13']=='9780147512284']

Unnamed: 0.1,Unnamed: 0,isbn13,datepublished,totalrevenue,totalunits,price


In [130]:
fgh.to_csv('agoogle.csv')

In [3]:
adv = get_advances_data()

In [4]:
adv = adv[(adv['Author(s)'] != 'NONE') & (adv['Date'] >= '2014-01-01')]

In [5]:
print(adv.shape, adv['Author(s)'].unique().shape)

(14913, 16) (12075,)


In [6]:
GAuthors = pd.read_csv('AuthorCrosswalkScraper/Google_Author_crosswalk_exact.csv', index_col=0)

In [7]:
rwords = ['[',']','"',"'"]
for word in rwords:
        GAuthors.loc[:, 'Author'] = GAuthors['Author'].str.replace(word,"")

In [8]:
GAuthors = GAuthors[~GAuthors['PublishedDate'].isin(['17-1', '101-', '200?', '19??', '199?'])]


In [9]:
GAuthors['PublishedDate'] = pd.to_datetime(GAuthors['PublishedDate'], format='%Y')

In [10]:
print(GAuthors.shape, GAuthors['Author'].unique().shape)

(59174, 4) (16849,)


In [11]:
G17 = GAuthors[GAuthors['PublishedDate'] >='2017']

In [12]:
print(G17.shape, G17['Author'].unique().shape)

(30487, 4) (9862,)


In [13]:
adv_14 = adv[(adv['Date'] >='2014')]

In [14]:
adv_14['Author(s)'].unique().shape

(12075,)

In [15]:
adv_14.shape

(14913, 16)

In [24]:
G17 = G17.rename(columns={
    'Author':'author',
    'Title':'title',
    'PublishedDate':'datepublished',
    'ISBN13':'isbn13'
})

In [74]:
Gsales = pd.merge(G17,sales,on='isbn13')

In [79]:
GSA = pd.merge(Gsales, adv_14, left_on='author', right_on='Author(s)')

In [82]:
GSA['fuzz_ratio'] = GSA.apply(lambda x: levenshtein_ratio(str(x['title']).lower(), str(x['Title']).lower()), axis=1)

In [89]:
GSA = GSA.groupby('isbn13', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])
GSA = GSA.groupby('advance_id', as_index=False).apply(lambda x : x.loc[x['fuzz_ratio'].idxmax()])

In [92]:
#GSA.to_csv('GAuthors_merge_sales.csv')

In [111]:
from ethnicolr import pred_fl_reg_name
d = GSA.copy()
dg = preprocess_names(d, 'author')
d1 = dg.copy()
eth = pred_fl_reg_name(dg,'Last Name', 'First Name')
d1 = d1.merge(eth, on=[
       'First Name', 'Last Name'])

d1 = d1.drop(d1.filter(like='_y').columns, axis=1)
d1.columns = [col.replace('_x', '') for col in d1.columns]

In [107]:
d1.to_csv('GAuthors_merge_sales.csv')

In [100]:
d1.columns

Index(['author', 'title', 'datepublished', 'isbn13', 'Unnamed: 0',
       'totalrevenue', 'totalunits', 'price', 'advance_id', 'Rights Category',
       'Genre', 'Date', 'Author(s)', 'Title', 'Publishers',
       'Big Publishing House Affilation', 'Advance', 'Competition', 'Awards',
       'Bestseller', 'Self Publishing', 'Debut', 'Series', 'All', 'fuzz_ratio',
       'First Name', 'Last Name', 'asian', 'hispanic', 'nh_black', 'nh_white',
       'race'],
      dtype='object')

In [112]:
d1['diff_days'] = (d1['datepublished'] - d1['Date']).dt.days

In [113]:
d1 = d1[(d1['diff_days'] >=90)]

In [114]:
d1.shape

(324, 33)

In [94]:
import pandas as pd
import re

def preprocess_names(df, Column_Name):
    #Remove space if the first letter of the word is a space
    
    df['temp_Column_Name'] = df[Column_Name]
    df['temp_Column_Name'] = df['temp_Column_Name'].str.lstrip()
    
    #Remove single letter in the Names
    #df[Column_Name] = df[Column_Name].str.replace(r'\b[A-Za-z]\b', '')
    df['temp_Column_Name'] = df['temp_Column_Name'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 1]))
    #Remove names in brackets
    #df[Column_Name] = df[Column_Name].str.replace(r'\s?\([^()]*\)', '')
    df['temp_Column_Name'] = df['temp_Column_Name'].apply(lambda x: re.sub("[\(\[].*?[\)\]]", "", x))
    
    rwords = ['Doctor','Professor','Captain','Jr','Sr','III','II', 'IV']
    for word in rwords:
        df.loc[:, 'temp_Column_Name'] = df['temp_Column_Name'].str.replace(word,"")
        
    df['temp_Column_Name'] = df['temp_Column_Name'].str.lstrip()
    
    df['First Name'] = df['temp_Column_Name'].str.split(" ", expand=True)[0]
    df['Last Name'] = df['temp_Column_Name'].str.split().str[-1]
    df.drop(columns=['temp_Column_Name'], inplace=True)
    return df

## Trying to match authors after removing punctuations

In [164]:
import string

def remove_punctuation(text):
    # Create a translation table with punctuation characters mapped to None
    translator = str.maketrans('', '', string.punctuation)
    # Remove punctuation using the translation table
    text = text.translate(translator)
    return text
'''
df['Author(s)'] = df['Author(s)'].apply(remove_punctuation)
df['author'] = df['author'].apply(remove_punctuation)

df['Author(s)'] = df['Author(s)'].str.replace(r'\b\w{1,2}\b', '', regex=True)
df['author'] = df['author'].str.replace(r'\b\w{1,2}\b', '', regex=True)

df['max_ratio'] = df.apply(lambda x: levenshtein_ratio(str(x['Author(s)']).lower(), str(x['author']).lower()), axis=1)
'''


"\ndf['Author(s)'] = df['Author(s)'].apply(remove_punctuation)\ndf['author'] = df['author'].apply(remove_punctuation)\n\ndf['Author(s)'] = df['Author(s)'].str.replace(r'\x08\\w{1,2}\x08', '', regex=True)\ndf['author'] = df['author'].str.replace(r'\x08\\w{1,2}\x08', '', regex=True)\n\ndf['max_ratio'] = df.apply(lambda x: levenshtein_ratio(str(x['Author(s)']).lower(), str(x['author']).lower()), axis=1)\n"

In [169]:
book = book[~book['author'].isna()]

In [175]:
adv['Author_punc'] = adv['Author(s)'].apply(remove_punctuation)
adv['Author_punc'] = adv['Author_punc'].str.replace(r'\b\w{1,2}\b', '', regex=True)
adv['Author_punc'] = adv['Author_punc'].apply(lambda x: x.lower())
book['Author_punc'] = book['author'].apply(remove_punctuation)
book['Author_punc'] = book['Author_punc'].str.replace(r'\b\w{1,2}\b', '', regex=True)
book['Author_punc'] = book['Author_punc'].apply(lambda x: x.lower())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book['Author_punc'] = book['author'].apply(remove_punctuation)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book['Author_punc'] = book['Author_punc'].str.replace(r'\b\w{1,2}\b', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book['Author_punc'] = book['Author_punc'].apply(lambda x

In [176]:
pbook_adv = pd.merge(adv, book, on ='Author_punc')

In [177]:
print(pbook_adv.shape, pbook_adv['Author_punc'].unique().shape)

(41935, 21) (3769,)
