# Cleaning and optimizing the data frames

In [14]:
data1 = data1.drop("Image-URL-L", axis=1)

In [47]:
# Function to drop columns
def drop_columns(df, columns):
    df = df.drop(columns, axis=1)
    return df

data1 = drop_columns(data1, ["Image-URL-L", "Image-URL-S"])

Empty DataFrame
Columns: [User-ID, ISBN, Book-Rating]
Index: []


In [48]:
#Function to split col_1(existing column) in col_2,col_3 and col_4 ( new columns)


def split(df, col_1, col_2, col_3, col_4):
    split_data = df[col_1].str.split(', ', expand=True)
    df[col_2] = split_data[0]
    df[col_3] = split_data[1]
    df[col_4] = split_data[2]
    return df

data2 = split(data2, "Location", "City", "Region", "Country")
data2 = data2.drop("Location", axis=1)

In [49]:
# Making a column based on Age to have a reference for feature selections
conditions = [
    data2['Age'] < 18,
    (data2['Age'] >= 18) & (data2['Age'] <= 25),
    data2['Age'] > 25
]
values = ['Teen', 'Young Adult', 'Adult']

data2['Age Category'] = np.select(conditions, values)

In [43]:
display(data1)

In [80]:
display(data2)

Unnamed: 0,User-ID,Age,City,Region,Country,Age Category
0,1,,nyc,new york,usa,0
1,2,18.0,stockton,california,usa,Young Adult
2,3,,moscow,yukon territory,russia,0
3,4,17.0,porto,v.n.gaia,portugal,Teen
4,5,,farnborough,hants,united kingdom,0
...,...,...,...,...,...,...
278853,278854,,portland,oregon,usa,0
278854,278855,50.0,tacoma,washington,united kingdom,Adult
278855,278856,,brampton,ontario,canada,0
278856,278857,,knoxville,tennessee,usa,0


# Working on books 

In [134]:
# TOP BOOKS selection ( selecting based on Average Rating and Number of Ratings)

merged_data = data1.merge(data3, on='ISBN')

book_stats = merged_data.groupby(['ISBN', 'Book-Title', 'Book-Author']).agg({'Book-Rating': ['mean', 'count']}).reset_index()

book_stats = book_stats[(book_stats[('Book-Rating', 'mean')] > 7.5) & (book_stats[('Book-Rating', 'count')] > 1)]

book_stats.columns = ['ISBN', 'Book-Title', 'Book-Author', 'Average Rating', 'Number of Ratings']

book_stats = book_stats.sort_values(by='Number of Ratings', ascending=False)

top_books = book_stats[['ISBN', 'Book-Title', 'Book-Author', 'Average Rating', 'Number of Ratings']]

display(top_books)

Unnamed: 0,ISBN,Book-Title,Book-Author,Average Rating,Number of Ratings
236927,1844262553,Free,Paul Vincent,7.962963,54
189249,0836213122,There's Treasure Everywhere--A Calvin and Hobb...,Bill Watterson,7.882353,17
266855,8478886451,Harry Potter y el cÃ¡liz de fuego,J. K. Rowling,7.875000,16
231902,1577780728,Jesus Freaks: DC Talk and The Voice of the Mar...,DC Talk,7.533333,15
253248,3423071516,Der Kleine Hobbit,J. R. R. Tolkien,7.800000,15
...,...,...,...,...,...
121610,0553578227,All the Dead Lie Down,Mary Willis Walker,9.000000,2
121962,0553583859,Fields of Fire,James H. Webb,8.500000,2
122042,055358636X,Guinness World Records 2003 (Guinness World Re...,CLAIRE FOLKARD,8.000000,2
122183,0553801813,The Widow's Kiss,Jane Feather,9.000000,2


In [182]:
#Making a ta frame with just 50 rows so we can run next cell
test_data = top_books.head(50).copy()

# Display the test_data DataFrame
display(test_data)

Unnamed: 0,ISBN,Book-Title,Book-Author,Average Rating,Number of Ratings,Price
236927,1844262553,Free,Paul Vincent,7.962963,54,20.33
189249,0836213122,There's Treasure Everywhere--A Calvin and Hobb...,Bill Watterson,7.882353,17,ToBeInserted
266855,8478886451,Harry Potter y el cÃ¡liz de fuego,J. K. Rowling,7.875000,16,ToBeInserted
231902,1577780728,Jesus Freaks: DC Talk and The Voice of the Mar...,DC Talk,7.533333,15,ToBeInserted
253248,3423071516,Der Kleine Hobbit,J. R. R. Tolkien,7.800000,15,ToBeInserted
...,...,...,...,...,...,...
23859,0192816209,"Alice's Adventures in Wonderland ; And, Throug...",Lewis Carroll,8.000000,7,ToBeInserted
101820,0451450647,Robot Visions,Isaac Asimov,7.571429,7,ToBeInserted
157120,0743219333,Baby Catcher : Chronicles of a Modern Midwife,Peggy Vincent,8.142857,7,ToBeInserted
103882,0452279615,"The Drawing of the Three (The Dark Tower, Book 2)",Stephen King,8.857143,7,ToBeInserted


In [183]:
# Going trough the API for book descriptions 
def get_book_description(isbn):
    url = f'https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}'
    response = requests.get(url)
    data = response.json()

    if 'items' in data and len(data['items']) > 0:
        book_info = data['items'][0]['volumeInfo']
        description = book_info.get('description', 'No description available')
        return description
    
    return 'No description available'

def add_description_to_dataframe(df):
    df['Description'] = ''

    for index, row in df.iterrows():
        isbn = row['ISBN']
        description = get_book_description(isbn)
        df.at[index, 'Description'] = description

    return df

test_data = add_description_to_dataframe(test_data)

# Display the updated test_data DataFrame with the 'Description' column
display(test_data[['ISBN', 'Book-Title', 'Book-Author', 'Average Rating', 'Number of Ratings', 'Description']])

Unnamed: 0,ISBN,Book-Title,Book-Author,Average Rating,Number of Ratings,Description
236927,1844262553,Free,Paul Vincent,7.962963,54,A noir comedy featuring Sal whose new life in ...
189249,0836213122,There's Treasure Everywhere--A Calvin and Hobb...,Bill Watterson,7.882353,17,In the world that Calvin and his tiger Hobbes ...
266855,8478886451,Harry Potter y el cÃ¡liz de fuego,J. K. Rowling,7.875000,16,"Tras otro abominable verano con los Dursley, H..."
231902,1577780728,Jesus Freaks: DC Talk and The Voice of the Mar...,DC Talk,7.533333,15,Remember the Lord's people who are in jail and...
253248,3423071516,Der Kleine Hobbit,J. R. R. Tolkien,7.800000,15,No description available
...,...,...,...,...,...,...
23859,0192816209,"Alice's Adventures in Wonderland ; And, Throug...",Lewis Carroll,8.000000,7,No description available
101820,0451450647,Robot Visions,Isaac Asimov,7.571429,7,"From Isaac Asimov, the Hugo Award-winning Gran..."
157120,0743219333,Baby Catcher : Chronicles of a Modern Midwife,Peggy Vincent,8.142857,7,A beloved California midwife with more than th...
103882,0452279615,"The Drawing of the Three (The Dark Tower, Book 2)",Stephen King,8.857143,7,No description available


In [185]:
#Going trough the API for genre 
def get_book_genre(isbn):
    url = f'https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}'
    response = requests.get(url)
    data = response.json()

    if 'items' in data and len(data['items']) > 0:
        book_info = data['items'][0]['volumeInfo']
        genre = book_info.get('categories', 'Genre not available')
        return genre
    
    return 'Genre not available'

def add_genre_to_dataframe(df):
    df['Genre'] = ''

    for index, row in df.iterrows():
        isbn = row['ISBN']
        genre = get_book_genre(isbn)
        df.at[index, 'Genre'] = genre

    return df

test_data = add_genre_to_dataframe(test_data)


display(test_data[['ISBN', 'Book-Title', 'Book-Author', 'Average Rating', 'Number of Ratings', 'Description', 'Genre']])

Unnamed: 0,ISBN,Book-Title,Book-Author,Average Rating,Number of Ratings,Description,Genre
236927,1844262553,Free,Paul Vincent,7.962963,54,A noir comedy featuring Sal whose new life in ...,[Fiction]
189249,0836213122,There's Treasure Everywhere--A Calvin and Hobb...,Bill Watterson,7.882353,17,In the world that Calvin and his tiger Hobbes ...,[Humor]
266855,8478886451,Harry Potter y el cÃ¡liz de fuego,J. K. Rowling,7.875000,16,"Tras otro abominable verano con los Dursley, H...",[Juvenile Fiction]
231902,1577780728,Jesus Freaks: DC Talk and The Voice of the Mar...,DC Talk,7.533333,15,Remember the Lord's people who are in jail and...,[Christian martyrs]
253248,3423071516,Der Kleine Hobbit,J. R. R. Tolkien,7.800000,15,No description available,Genre not available
...,...,...,...,...,...,...,...
23859,0192816209,"Alice's Adventures in Wonderland ; And, Throug...",Lewis Carroll,8.000000,7,No description available,Genre not available
101820,0451450647,Robot Visions,Isaac Asimov,7.571429,7,"From Isaac Asimov, the Hugo Award-winning Gran...",[Fiction]
157120,0743219333,Baby Catcher : Chronicles of a Modern Midwife,Peggy Vincent,8.142857,7,A beloved California midwife with more than th...,[Childbirth at home]
103882,0452279615,"The Drawing of the Three (The Dark Tower, Book 2)",Stephen King,8.857143,7,No description available,Genre not available


In [138]:
# Adding Column Price
top_books['Price'] = 'ToBeInserted'

In [140]:

# I made you a function that allows you to add prices to books based on ISBN



def update_price(top_books, isbn, price):
    # Check if the provided ISBN exists in the top_books DataFrame
    if isbn in top_books['ISBN'].values:
        # Update the price for the specified ISBN
        top_books.loc[top_books['ISBN'] == isbn, 'Price'] = price
        print(f"Price updated for ISBN {isbn}")
    else:
        print(f"ISBN {isbn} not found in top_books DataFrame")

# Example usage
isbn = "1844262553"  
price = 20.33 


update_price(top_books, isbn, price)



Price updated for ISBN 1844262553


Unnamed: 0,ISBN,Book-Title,Book-Author,Average Rating,Number of Ratings,Price
236927,1844262553,Free,Paul Vincent,7.962963,54,20.33
189249,0836213122,There's Treasure Everywhere--A Calvin and Hobb...,Bill Watterson,7.882353,17,ToBeInserted
266855,8478886451,Harry Potter y el cÃ¡liz de fuego,J. K. Rowling,7.875000,16,ToBeInserted
231902,1577780728,Jesus Freaks: DC Talk and The Voice of the Mar...,DC Talk,7.533333,15,ToBeInserted
253248,3423071516,Der Kleine Hobbit,J. R. R. Tolkien,7.800000,15,ToBeInserted
...,...,...,...,...,...,...
121610,0553578227,All the Dead Lie Down,Mary Willis Walker,9.000000,2,ToBeInserted
121962,0553583859,Fields of Fire,James H. Webb,8.500000,2,ToBeInserted
122042,055358636X,Guinness World Records 2003 (Guinness World Re...,CLAIRE FOLKARD,8.000000,2,ToBeInserted
122183,0553801813,The Widow's Kiss,Jane Feather,9.000000,2,ToBeInserted


In [133]:
# Filter on author
def filter_books_by_author(top_books, author):
    filtered_books = top_books[top_books['Book-Author'] == author]
    return filtered_books
author_name = input("Enter the name of the author: ")
filtered_books = filter_books_by_author(top_books, author_name)
display(filtered_books)

Enter the name of the author: David Rakoff


Unnamed: 0,ISBN,Book-Title,Book-Author,Average Rating,Number of Ratings
270144,B000234N3A,Fraud,David Rakoff,9.0,1


In [103]:
# This is a function to filter the books based on Age Category and Country, both or just one. Insert data or leve it like this('') if you whant to skip one of the filters. 
def get_category_books(df1, df2, df3, filter_country=None, filter_age_category=None, num_books=20):
    
    if filter_country:
        filtered_data2 = data2[data2['Country'] == filter_country]
    else:
        filtered_data2 = data2
    
    if filter_age_category:
        filtered_data2 = filtered_data2[filtered_data2['Age Category'] == filter_age_category]
    
    category_user_ids = filtered_data2['User-ID']
    
    
    filtered_data3 = data3[data3['User-ID'].isin(category_user_ids)]  # Filter data3 based on the filtered user IDs
    
   
    merged_data = data1.merge(filtered_data3, on='ISBN')
    
    
    book_stats = merged_data.groupby(['ISBN', 'Book-Title']).agg({'Book-Rating': ['mean', 'count']}).reset_index()
    book_stats = book_stats[(book_stats[('Book-Rating', 'mean')] > 6) & (book_stats[('Book-Rating', 'count')] > 0)]
    book_stats = book_stats.sort_values(by=[('Book-Rating', 'count')], ascending=False)
    book_stats.columns = ['ISBN', 'Book-Title', 'Average Rating', 'Number of Ratings']
    
    # Display the top books
    top_books = book_stats.head(num_books)
    display(top_books[['ISBN', 'Book-Title', 'Average Rating', 'Number of Ratings']])

In [112]:
#Call the function
get_category_books(data1, data2, data3, filter_country='portugal', filter_age_category='', num_books=5)

Unnamed: 0,ISBN,Book-Title,Average Rating,Number of Ratings
1471,0439139600,Harry Potter and the Goblet of Fire (Book 4),7.3,10
1229,038082101X,Daughter of Fortune: A Novel,6.7,10
1466,0439064864,Harry Potter and the Chamber of Secrets (Book 2),7.125,8
2101,0553573403,"A Game of Thrones (A Song of Ice and Fire, Boo...",6.333333,6
2006,0553213695,The Metamorphosis (Bantam Classics),6.166667,6


# Making a data frame with authors from top_books and description for each

In [137]:
# I made a new data base for authors
authors = []
descriptions = []

# Iterate over the first 50 books in top_books DataFrame
for index, row in top_books.head(50).iterrows():
    author = row['Book-Author']
    try:
        
        response = requests.get(f"https://en.wikipedia.org/api/rest_v1/page/summary/{author}")
        
       
        if response.status_code == 200:
            data = response.json()
            summary = data.get('extract', 'Description not available')
            
            authors.append(author)
            descriptions.append(summary)
        else:
            
            authors.append(author)
            descriptions.append('Description not available')
    except requests.exceptions.RequestException:
        
        authors.append(author)
        descriptions.append('Description not available')


author_df = pd.DataFrame({'Author': authors, 'Description': descriptions})

# Display the DataFrame
display(author_df)

Unnamed: 0,Author,Description
0,Paul Vincent,Description not available
1,Bill Watterson,William Boyd Watterson II is an American carto...
2,J. K. Rowling,"Joanne Rowling, best known by her pen name J. ..."
3,DC Talk,DC Talk is a Christian rap and rock trio. The ...
4,J. R. R. Tolkien,John Ronald Reuel Tolkien was an English write...
...,...,...
45,Lewis Carroll,"Charles Lutwidge Dodgson, better known by his ..."
46,Isaac Asimov,Isaac Asimov was an American writer and profes...
47,Peggy Vincent,Description not available
48,Stephen King,Stephen Edwin King is an American author of ho...
