In [53]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
df = pd.read_csv("/content/books.csv")
df.head()

Unnamed: 0,Title,Author,Genre,Height,Publisher,book_id
0,Fundamentals of Wavelets,"Goswami, Jaideva",signal_processing,228,Wiley,1
1,Data Smart,"Foreman, John",data_science,235,Wiley,2
2,God Created the Integers,"Hawking, Stephen",mathematics,197,Penguin,3
3,Superfreakonomics,"Dubner, Stephen",economics,179,HarperCollins,4
4,Orientalism,"Said, Edward",history,197,Penguin,5


In [55]:
columns = ['Title', 'Author', 'Publisher']
df.shape

(211, 6)

In [56]:
def combined_features(data):
  features = []
  for i in range(0, df.shape[0]):
    features.append(df['Title'][i] + ' ' + df['Author'][i] + ' ' + df['Publisher'][i])

  return features

In [57]:
df.dtypes

Title        object
Author       object
Genre        object
Height        int64
Publisher    object
book_id       int64
dtype: object

In [58]:
df.head()

Unnamed: 0,Title,Author,Genre,Height,Publisher,book_id
0,Fundamentals of Wavelets,"Goswami, Jaideva",signal_processing,228,Wiley,1
1,Data Smart,"Foreman, John",data_science,235,Wiley,2
2,God Created the Integers,"Hawking, Stephen",mathematics,197,Penguin,3
3,Superfreakonomics,"Dubner, Stephen",economics,179,HarperCollins,4
4,Orientalism,"Said, Edward",history,197,Penguin,5


In [59]:
df = df.drop("Height",axis = 'columns')

In [60]:
df.isnull().sum()

Title         0
Author       24
Genre         0
Publisher    96
book_id       0
dtype: int64

In [61]:
df['Author'] = df['Author'].fillna('Unknown')

In [62]:
df['Publisher'] = df['Publisher'].fillna('Unknown')

In [63]:
df.isnull().sum()

Title        0
Author       0
Genre        0
Publisher    0
book_id      0
dtype: int64

In [64]:
df

Unnamed: 0,Title,Author,Genre,Publisher,book_id
0,Fundamentals of Wavelets,"Goswami, Jaideva",signal_processing,Wiley,1
1,Data Smart,"Foreman, John",data_science,Wiley,2
2,God Created the Integers,"Hawking, Stephen",mathematics,Penguin,3
3,Superfreakonomics,"Dubner, Stephen",economics,HarperCollins,4
4,Orientalism,"Said, Edward",history,Penguin,5
...,...,...,...,...,...
206,Structure and Randomness,"Tao, Terence",mathematics,Unknown,207
207,Image Processing with MATLAB,"Eddins, Steve",signal_processing,Unknown,208
208,Animal Farm,"Orwell, George",fiction,Unknown,209
209,"Idiot, The","Dostoevsky, Fyodor",fiction,Unknown,210


In [65]:
df['combined_features'] = combined_features(df)

In [66]:
df.head()

Unnamed: 0,Title,Author,Genre,Publisher,book_id,combined_features
0,Fundamentals of Wavelets,"Goswami, Jaideva",signal_processing,Wiley,1,"Fundamentals of Wavelets Goswami, Jaideva Wiley"
1,Data Smart,"Foreman, John",data_science,Wiley,2,"Data Smart Foreman, John Wiley"
2,God Created the Integers,"Hawking, Stephen",mathematics,Penguin,3,"God Created the Integers Hawking, Stephen Penguin"
3,Superfreakonomics,"Dubner, Stephen",economics,HarperCollins,4,"Superfreakonomics Dubner, Stephen HarperCollins"
4,Orientalism,"Said, Edward",history,Penguin,5,"Orientalism Said, Edward Penguin"


In [67]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [68]:
cm = vectorizer.fit_transform(df['combined_features'])

In [69]:
cm

<211x649 sparse matrix of type '<class 'numpy.float64'>'
	with 1287 stored elements in Compressed Sparse Row format>

In [70]:
cs = cosine_similarity(cm)

In [71]:
cs

array([[1.        , 0.19348354, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.19348354, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.02969564,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.02857643,
        0.02417985],
       [0.        , 0.        , 0.02969564, ..., 0.02857643, 1.        ,
        0.02759633],
       [0.        , 0.        , 0.        , ..., 0.02417985, 0.02759633,
        1.        ]])

In [117]:
Title = df['Title'][1]
Title

'Data Smart'

In [118]:
#print(df.Title, '\n')
z = df[df.Title == Title]
z

Unnamed: 0,Title,Author,Genre,Publisher,book_id,combined_features
1,Data Smart,"Foreman, John",data_science,Wiley,2,"Data Smart Foreman, John Wiley"


In [119]:
book_id = df[df.Title == Title]['book_id'].values[0]
book_id

2

In [120]:
scores = list(enumerate(cs[book_id]))
#scores

In [121]:
sorted_scores = sorted(scores, key = lambda x:x[1], reverse = True)
sorted_scores = sorted_scores[1:]
#sorted_scores

In [122]:
j = 0
print("The recommended books are:")
for item in sorted_scores:
  book_title = df[df.book_id == item[0]]['Title'].values[0]
  print(j+1, book_title)
  j = j+1
  if j >= 6:
    break  

The recommended books are:
1 Case of the Lame Canary, The
2 Tales of Mystery and Imagination
3 God Created the Integers
4 Data Mining Handbook
5 Machine Learning for Hackers
6 Introduction to Algorithms
