In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [17]:
# Location of the raw book catalogue, relative to this notebook.
csv_file = '../data/books.csv'
# Read the catalogue into a DataFrame; every later cell works on this frame.
books_data = pd.read_csv(csv_file)

In [18]:
# Preview the first rows to check that the CSV parsed into the expected columns.
books_data.head()

Unnamed: 0,Title,Author,Genre,Height,Publisher
0,Fundamentals of Wavelets,Jaideva Goswami,signal_processing,228,Wiley
1,Data Smart,John Foreman,data_science,235,Wiley
2,God Created the Integers,Stephen Hawking,mathematics,197,Penguin
3,Superfreakonomics,Stephen Dubner,economics,179,HarperCollins
4,Orientalism,Edward Said,history,197,Penguin


In [19]:
# Summarise column dtypes and non-null counts to see where data is missing.
books_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      211 non-null    object
 1   Author     187 non-null    object
 2   Genre      211 non-null    object
 3   Height     211 non-null    int64 
 4   Publisher  115 non-null    object
dtypes: int64(1), object(4)
memory usage: 8.4+ KB


In [20]:
# Remove the Height column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after Height is already gone is a no-op rather
# than a KeyError.
books_data = books_data.drop(columns=['Height'], errors='ignore')

In [21]:
# Preview again to confirm the Height column is no longer present.
books_data.head()

Unnamed: 0,Title,Author,Genre,Publisher
0,Fundamentals of Wavelets,Jaideva Goswami,signal_processing,Wiley
1,Data Smart,John Foreman,data_science,Wiley
2,God Created the Integers,Stephen Hawking,mathematics,Penguin
3,Superfreakonomics,Stephen Dubner,economics,HarperCollins
4,Orientalism,Edward Said,history,Penguin


In [22]:
# Count missing values per column.
books_data.isnull().sum()

Title         0
Author       24
Genre         0
Publisher    96
dtype: int64

In [24]:
# Number of books per author, most frequent first.
books_data['Author'].value_counts()

Author
John Steinbeck      8
P L Deshpande       7
Alex Rutherford     5
Amartya Sen         4
Ayn Rand            4
                   ..
John Grisham        1
Will Durant         1
Edgar Allen Poe     1
Michael Crichton    1
Charles Dickens     1
Name: count, Length: 129, dtype: int64

In [25]:
# Number of books per publisher, most frequent first.
books_data['Publisher'].value_counts()

Publisher
Penguin             34
Random House        17
Mauj                 7
HarperCollins        6
O'Reilly             4
Vintage              3
Springer             3
Pearson              3
vikas                3
MIT Press            3
Rupa                 3
Jaico                3
Pan                  2
Routledge            2
Wiley                2
Apress               2
Prentice Hall        2
FreePress            1
Picador              1
HighStakes           1
Simon&Schuster       1
Fontana              1
Pocket               1
TMH                  1
Elsevier             1
BBC                  1
Hyperion             1
Transworld           1
HBA                  1
Bodley Head          1
CRC                  1
Orient Blackswan     1
Dell                 1
Name: count, dtype: int64

In [26]:
# Replace every cell that is an empty or whitespace-only string (regex ^\s*$)
# with the check-mark character U+2713.
# NOTE(review): a check mark is an unusual placeholder for blank text - confirm
# this is intended rather than, e.g., np.nan or an empty string.
books_data = books_data.replace(r'^\s*$', '\u2713', regex=True)

In [27]:
# Preview the frame after the blank-string replacement.
books_data.head()

Unnamed: 0,Title,Author,Genre,Publisher
0,Fundamentals of Wavelets,Jaideva Goswami,signal_processing,Wiley
1,Data Smart,John Foreman,data_science,Wiley
2,God Created the Integers,Stephen Hawking,mathematics,Penguin
3,Superfreakonomics,Stephen Dubner,economics,HarperCollins
4,Orientalism,Edward Said,history,Penguin


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [29]:
# Reorder a "Last, First" style name into "First Last"
def convert_author_name(author):
    """Swap the two halves of a string of the form ``"X, Y"`` into ``"Y X"``.

    Parameters
    ----------
    author : object
        Value to convert. Only ``str`` values are inspected.

    Returns
    -------
    object
        ``"Y X"`` when ``author`` is a string that splits on ``", "`` into
        exactly two parts; otherwise ``author`` itself, unchanged (this
        includes non-strings such as NaN).
    """
    # Non-string values (e.g. NaN for a missing author) pass straight through.
    if not isinstance(author, str):
        return author

    pieces = author.split(", ")
    # Anything other than exactly one ", " separator is left untouched.
    if len(pieces) != 2:
        return author

    surname, given = pieces
    return f"{given} {surname}"

# Apply the conversion function to the "Author" column
books_data['Author'] = books_data['Author'].apply(convert_author_name)

# Apply the same conversion function to the "Title" column (not "Publisher").
# NOTE(review): presumably this is meant to turn titles written as "X, Y" into
# "Y X" - confirm that rewriting two-part comma-separated titles is intended.
books_data['Title'] = books_data['Title'].apply(convert_author_name)




In [30]:
# Preview the frame after the name conversion.
books_data.head()

Unnamed: 0,Title,Author,Genre,Publisher
0,Fundamentals of Wavelets,Jaideva Goswami,signal_processing,Wiley
1,Data Smart,John Foreman,data_science,Wiley
2,God Created the Integers,Stephen Hawking,mathematics,Penguin
3,Superfreakonomics,Stephen Dubner,economics,HarperCollins
4,Orientalism,Edward Said,history,Penguin


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Build one text "description" per book from Author, Genre and Publisher.
# Missing values are filled with '' BEFORE concatenating: str + NaN is NaN, so
# filling only afterwards would blank the entire description (Genre included)
# for every book that lacks an Author or a Publisher, and those books could
# never be matched to anything.
books_data['description'] = (
    books_data['Author'].fillna('')
    + ' ' + books_data['Genre'].fillna('')
    + ' ' + books_data['Publisher'].fillna('')
).str.strip()

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Apply the vectorizer to the book descriptions
tfidf_matrix = vectorizer.fit_transform(books_data['description'])

# Compute the cosine similarity matrix. TfidfVectorizer L2-normalises each row
# by default, so the plain dot product from linear_kernel is the cosine
# similarity.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Function to get book recommendations based on book title
def get_book_recommendations(title, cosine_similarities, books_data, top_n=5):
    """Return the titles of the ``top_n`` books most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``books_data['Title']``. If several rows
        share the title, the first one is used.
    cosine_similarities : array-like, shape (n_books, n_books)
        Pairwise similarity matrix whose row/column order matches the row
        order of ``books_data``.
    books_data : pandas.DataFrame
        Frame with a ``'Title'`` column. Its index may be anything; rows are
        addressed by position.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the most similar books, best match first, never including
        the queried book itself. Ties keep the original row order.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``books_data['Title']``.
    """
    # Locate the book by POSITION, because the similarity matrix and .iloc are
    # positional; using index labels would break on a non-default index.
    matches = np.flatnonzero((books_data['Title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No book titled {title!r} found in books_data")
    book_pos = int(matches[0])

    # Similarity of every book to the queried one.
    scores = np.asarray(cosine_similarities[book_pos], dtype=float)

    # Descending, stable sort so that ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried book explicitly instead of assuming it sorts first;
    # with ties or an all-zero row it is not guaranteed to be at position 0.
    ranked = ranked[ranked != book_pos]

    top_books_indices = ranked[:max(top_n, 0)]

    # Return the top N book titles
    return books_data['Title'].iloc[top_books_indices]

# Title of the book to look up similar books for
book_title = 'Orientalism'

# Ask the recommender for the closest matches to that title
recommendations = get_book_recommendations(book_title, cosine_similarities, books_data)

# Show a header line followed by the recommended titles
print(f"Recommendations for '{book_title}':", recommendations, sep='\n')



Recommendations for 'Orientalism':
82    India from Midnight to Milennium
74                     The Last Mughal
14                    The Age of Wrath
85                        O Jerusalem!
87                 Freedom at Midnight
Name: Title, dtype: object
