In [1]:
import json
import random

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from pandas import json_normalize

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


## Data Cleaning and Cosine Similarity Calculations

In [2]:
#reading in the dataframe
file = 'best_selling_books_2.csv'
df = pd.read_csv(file)
df

#changing volume sales to a numeric type
df['Volume Sales'] = df["Volume Sales"].str.replace(",", "").astype(int)

In [3]:
author_features = {}

In [4]:
#iterate through the books, and accumulate the features per author, including the total number of sales, publishers, and genres
for _, row in df.iterrows():
    
    author = row['Author'] 
    
    #adding the author if they have not been added yet 
    if author not in author_features:
        author_features[author] = {'volume_sales': 0, 'publisher': set(), 'genre': set()}
        
    author_features[author]['volume_sales'] += row['Volume Sales']
    author_features[author]['publisher'].add(row['Publisher'])
    author_features[author]['genre'].add(row['Genre'])
    
    
#converting each author's features into the dataframe
author_data = []
for author, features in author_features.items():
    author_data.append({
        'Author': author,
        'Volume Sales': features['volume_sales'],
        'Publisher Count': len(features['publisher']),
        'Genre Count': len(features['genre']),
        'Publishers': list(features['publisher']),
        'Genres': list(features['genre'])
    })
author_df = pd.DataFrame(author_data)

In [53]:
author_df = author_df.sort_values(by='Volume Sales', ascending = False)
author_df

Unnamed: 0,Author,Volume Sales,Publisher Count,Genre Count,Publishers,Genres,Publisher_Genre
1,"Rowling, J.K.",28494745,1,2,[Bloomsbury],"[Children's Fiction, Science Fiction & Fantasy]",Bloomsbury Children's Fiction Science Fiction ...
0,"Brown, Dan",14462827,1,1,[Transworld],"[Crime, Thriller & Adventure]","Transworld Crime, Thriller & Adventure"
2,"James, E. L.",8432648,1,1,[Random House],[Romance & Sagas],Random House Romance & Sagas
3,"Meyer, Stephenie",8308136,1,1,"[Little, Brown Book]",[Young Adult Fiction],"Little, Brown Book Young Adult Fiction"
4,"Larsson, Stieg",5603489,1,1,[Quercus],"[Crime, Thriller & Adventure]","Quercus Crime, Thriller & Adventure"
...,...,...,...,...,...,...,...
62,"Morton, Kate",814370,1,1,[Pan Macmillan],[General & Literary Fiction],Pan Macmillan General & Literary Fiction
63,"Zusak, Markus",809641,1,1,[Transworld],[General & Literary Fiction],Transworld General & Literary Fiction
64,"Binchy, Maeve",808900,1,1,[Orion],[General & Literary Fiction],Orion General & Literary Fiction
65,"Harris, Robert",807311,1,1,[Random House],[General & Literary Fiction],Random House General & Literary Fiction


In [42]:
#combining genre and publisher into one line
author_df['Publisher_Genre'] = author_df['Publishers'].apply(lambda x: ' '.join(x)) + ' ' + author_df['Genres'].apply(lambda x: ' '.join(x))

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(author_df['Publisher_Genre'])

#calculating cosine similarity matrix
cos_sim = cosine_similarity(X, X)
#remove self comparisons
np.fill_diagonal(cos_sim, 0)


## Making the Comparisons

In [55]:
#comparing J.K. Rowling to other authors
jk_idx = author_df[author_df['Author'] == 'Rowling, J.K.'].index[0] #index of J.K. Rowling
jk_similarity_scores = cos_sim[jk_idx] #similarity scores for J.K. Rowling

#sort indices according to similarity to J.K. Rowling
sorted_indices = np.argsort(jk_similarity_scores)[::-1]

#show the most similar authors
jk_similars = author_df.iloc[sorted_indices][0:3]
jk_similars

Unnamed: 0,Author,Volume Sales,Publisher Count,Genre Count,Publishers,Genres,Publisher_Genre
46,"Tolkien, J. R. R.",967466,1,1,[HarperCollins],[Science Fiction & Fantasy],HarperCollins Science Fiction & Fantasy
21,"Pelzer, Dave",2009219,1,2,[Orion],"[Autobiography: General, Biography: General]",Orion Autobiography: General Biography: General
32,"McKeith, Gillian",1104403,1,1,[Penguin],[Fitness & Diet],Penguin Fitness & Diet


In [56]:
#comparing Dan Brown to other authors
db_idx = author_df[author_df['Author'] == 'Brown, Dan'].index[0] #index of Dan Brown
db_similarity_scores = cos_sim[db_idx] #similarity scores for Dan Brown

#sort indices according to similarity to Dan Brown
db_sorted_indices = np.argsort(db_similarity_scores)[::-1]

#show the most similar authors
db_similars = author_df.iloc[db_sorted_indices][0:3]
db_similars

Unnamed: 0,Author,Volume Sales,Publisher Count,Genre Count,Publishers,Genres,Publisher_Genre
48,"Rubenfeld, Jed",962515,1,1,[Headline],"[Crime, Thriller & Adventure]","Headline Crime, Thriller & Adventure"
4,"Larsson, Stieg",5603489,1,1,[Quercus],"[Crime, Thriller & Adventure]","Quercus Crime, Thriller & Adventure"
33,"Zafon, Carlos Ruiz",1092349,1,1,[Orion],[General & Literary Fiction],Orion General & Literary Fiction


In [57]:
#comparing E.L. James to other authors
el_idx = author_df[author_df['Author'] == 'James, E. L.'].index[0] #index of E.L. James
el_similarity_scores = cos_sim[el_idx] #similarity scores for E.L. James

#sort indices according to similarity to E.L. James
el_sorted_indices = np.argsort(el_similarity_scores)[::-1]

#show the most similar authors
el_similars = author_df.iloc[el_sorted_indices][0:3]
el_similars

Unnamed: 0,Author,Volume Sales,Publisher Count,Genre Count,Publishers,Genres,Publisher_Genre
14,"McEwan, Ian",1539428,1,1,[Random House],[General & Literary Fiction],Random House General & Literary Fiction
35,"Atkins, Robert C.",1054196,1,1,[Random House],[Fitness & Diet],Random House Fitness & Diet
13,"Niffenegger, Audrey",1546886,1,1,[Random House],[General & Literary Fiction],Random House General & Literary Fiction
