## Import necessary libraries

In [1]:
!pip install surprise



In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import joblib

# For better readability, let's display all columns in the dataframe outputs
pd.set_option('display.max_columns', None)

## Data Preprocessing

### Books dataset

In [3]:
books = pd.read_csv('books.csv', error_bad_lines=False)



  books = pd.read_csv('books.csv', error_bad_lines=False)
Skipping line 3350: expected 12 fields, saw 13
Skipping line 4704: expected 12 fields, saw 13
Skipping line 5879: expected 12 fields, saw 13
Skipping line 8981: expected 12 fields, saw 13



In [4]:
books.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [5]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11123 entries, 0 to 11122
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11123 non-null  int64  
 1   title               11123 non-null  object 
 2   authors             11123 non-null  object 
 3   average_rating      11123 non-null  float64
 4   isbn                11123 non-null  object 
 5   isbn13              11123 non-null  int64  
 6   language_code       11123 non-null  object 
 7     num_pages         11123 non-null  int64  
 8   ratings_count       11123 non-null  int64  
 9   text_reviews_count  11123 non-null  int64  
 10  publication_date    11123 non-null  object 
 11  publisher           11123 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB


In [6]:
# Remove leading whitespace from column names
books.columns = books.columns.str.strip()

In [7]:
books['publication_date'] = pd.to_datetime(books['publication_date'], errors='coerce')

In [8]:
books['years_since_publication'] = pd.Timestamp.now().year - books['publication_date'].dt.year

In [9]:
books.drop(['isbn', 'isbn13', 'publisher', 'publication_date'], axis=1, inplace=True)

In [10]:
books.head()

Unnamed: 0,bookID,title,authors,average_rating,language_code,num_pages,ratings_count,text_reviews_count,years_since_publication
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,eng,652,2095690,27591,18.0
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,eng,870,2153167,29221,20.0
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,eng,352,6333,244,21.0
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,eng,435,2339585,36325,20.0
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,eng,2690,41428,164,20.0


In [11]:
books.shape

(11123, 9)

### Music dataset

In [12]:
music = pd.read_csv('dataset.csv')
music.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [13]:
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [14]:
music.drop('Unnamed: 0', axis=1, inplace=True)

In [15]:
# Convert explicit column to int (1 for True, 0 for False)
music['explicit'] = music['explicit'].astype(int)

In [16]:
music['duration_minutes'] = music['duration_ms'] / 60000

In [17]:
music['genres'] = music['track_genre']

In [18]:
music.drop(['duration_ms', 'album_name', 'track_genre'], axis=1, inplace=True)

In [19]:
music.head()

Unnamed: 0,track_id,artists,track_name,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_minutes,genres
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,73,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,3.844433,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost - Acoustic,55,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,2.4935,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,57,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,3.513767,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Can't Help Falling In Love,71,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,3.36555,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,82,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,3.314217,acoustic


In [20]:
music.shape

(114000, 19)

### Movies dataset

In [21]:
movies = pd.read_csv('movies.csv')
movies.head()

  movies = pd.read_csv('movies.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [22]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [23]:
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')

In [24]:
movies['years_since_release'] = pd.Timestamp.now().year - movies['release_date'].dt.year

In [25]:
movies.drop(['adult', 'belongs_to_collection', 'homepage', 'imdb_id', 'poster_path', 'production_companies', 'production_countries', 'status', 'tagline', 'video'], axis=1, inplace=True)

In [26]:
movies.head()

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,release_date,revenue,runtime,spoken_languages,title,vote_average,vote_count,years_since_release
0,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Toy Story,7.7,5415.0,29.0
1,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Jumanji,6.9,2413.0,29.0
2,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Grumpier Old Men,6.5,92.0,29.0
3,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Waiting to Exhale,6.1,34.0,29.0
4,0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Father of the Bride Part II,5.7,173.0,29.0


In [27]:
movies.shape

(45466, 15)

### Ted-talks dataset

In [28]:
ted_talks = pd.read_csv('ted_talks.csv')
ted_talks.head()

Unnamed: 0,title,author,date,views,likes,link
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,https://ted.com/talks/ozawa_bineshi_albert_cli...
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,https://ted.com/talks/sydney_iaukea_the_dark_h...
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,https://ted.com/talks/martin_reeves_how_play_c...
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,https://ted.com/talks/james_k_thornton_why_is_...
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,https://ted.com/talks/mahendra_singhi_cement_s...


In [29]:
ted_talks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5440 entries, 0 to 5439
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5440 non-null   object
 1   author  5439 non-null   object
 2   date    5440 non-null   object
 3   views   5440 non-null   int64 
 4   likes   5440 non-null   int64 
 5   link    5440 non-null   object
dtypes: int64(2), object(4)
memory usage: 255.1+ KB


In [30]:
ted_talks['date'] = pd.to_datetime(ted_talks['date'], errors='coerce')

In [31]:
ted_talks['years_since_talk'] = pd.Timestamp.now().year - ted_talks['date'].dt.year

In [32]:
ted_talks.drop('link', axis=1, inplace=True)

In [33]:
ted_talks = ted_talks.dropna(subset=['title', 'author'])

In [34]:
ted_talks.head()

Unnamed: 0,title,author,date,views,likes,years_since_talk
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,2021-12-01,404000,12000,3
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,2022-02-01,214000,6400,2
2,How play can spark new ideas for your business,Martin Reeves,2021-09-01,412000,12000,3
3,Why is China appointing judges to combat clima...,James K. Thornton,2021-10-01,427000,12000,3
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,2021-10-01,2400,72,3


In [35]:
ted_talks.shape

(5439, 6)

## Feature Engineering and Model Training

### Recommendation System (Content-Based Filtering)

<b>Vectorize data</b>

In [36]:
tfidf_books = TfidfVectorizer(stop_words='english')
tfidf_matrix_books = tfidf_books.fit_transform(books['title'])

In [37]:
tfidf_music = TfidfVectorizer(stop_words='english')
tfidf_matrix_music = tfidf_music.fit_transform(music['genres'])

In [38]:
tfidf_movies = TfidfVectorizer(stop_words='english')
tfidf_matrix_movies = tfidf_movies.fit_transform(movies['genres'])

In [39]:
tfidf_ted_talks = TfidfVectorizer(stop_words='english')
tfidf_matrix_ted_talks = tfidf_ted_talks.fit_transform(ted_talks['title'] + ' ' + ted_talks['author'])

<b> Fit Nearest Neighbors Model </b>

In [40]:
# Nearest neighbors for books
knn_books = NearestNeighbors(metric='cosine', algorithm='brute')
knn_books.fit(tfidf_matrix_books)

In [41]:
# Nearest neighbors for music
knn_music = NearestNeighbors(metric='cosine', algorithm='brute')
knn_music.fit(tfidf_matrix_music)

In [42]:
# Nearest neighbors for movies
knn_movies = NearestNeighbors(metric='cosine', algorithm='brute')
knn_movies.fit(tfidf_matrix_movies)

In [43]:
# Nearest neighbors for TED talks
knn_ted_talks = NearestNeighbors(metric='cosine', algorithm='brute')
knn_ted_talks.fit(tfidf_matrix_ted_talks)

### Create Recommendation System

In [44]:
def recommend(user_input, category):
    if category == 'book':
        query = tfidf_books.transform([user_input])
        distances, indices = knn_books.kneighbors(query, n_neighbors=5)
        return books.iloc[indices[0]]['title'].tolist()

    elif category == 'music':
        query = tfidf_music.transform([user_input])
        distances, indices = knn_music.kneighbors(query, n_neighbors=5)
        return music.iloc[indices[0]]['track_name'].tolist()

    elif category == 'movie':
        query = tfidf_movies.transform([user_input])
        distances, indices = knn_movies.kneighbors(query, n_neighbors=5)
        return movies.iloc[indices[0]]['title'].tolist()

    elif category == 'ted_talk':
        query = tfidf_ted_talks.transform([user_input])
        distances, indices = knn_ted_talks.kneighbors(query, n_neighbors=5)
        return ted_talks.iloc[indices[0]]['title'].tolist()

    else:
        return "Invalid category. Choose from 'book', 'music', 'movie', or 'ted_talk'."

In [45]:
recommend('harry potter', 'book')

['Harry Potter Collection (Harry Potter  #1-6)',
 'Harry Potter and the Goblet of Fire (Harry Potter  #4)',
 'Harry Potter and the Chamber of Secrets (Harry Potter  #2)',
 'Harry Potter and the Chamber of Secrets (Harry Potter  #2)',
 'Harry Potter and the Order of the Phoenix (Harry Potter  #5)']

In [46]:
recommend('acoustic', 'music')

['Easy on Me',
 'Closer',
 'Young And In Love',
 'Tonight You Belong To Me',
 'I Wanna Be Your Ghost (feat. Ghosts)']

In [47]:
recommend('inception', 'movie')

['The Colour Out of Space',
 'Festival',
 'Ghost Graduation',
 'Foxtrot',
 '5 Flights Up']

In [48]:
recommend('technology', 'ted_talk')

['How technology evolves',
 'On technology and faith',
 'The technology of storytelling',
 'The technology of the heart',
 "Technology's epic story"]