In [8]:
import string
from ast import literal_eval

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split

In [2]:
# Location of the talks CSV that is passed to pd.read_csv.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on the original author's machine as written — consider a path relative
# to the repository (confirm the repo layout before changing it).
data = '/Users/patrickokwir/Desktop/Git_Projects/Ted-Talks-Recommender-System/Data_output/talks.csv'

In [21]:
# Load the talks table; the first CSV column is used as the row index.
df = pd.read_csv(data, index_col=0)

In [22]:
# Preview the first rows of the table as loaded.
df.head()

Unnamed: 0,author,talk,description,likes,views
0,Conor Russomanno,a powerful new neurotech tool for augmenting y...,in an astonishing talk and tech demo neurotech...,4700,157930
1,Peter Singer,a modern argument for the rights of animals,why do we prioritize human rights over those o...,7600,254482
2,Sahar Zand,why iranians are cutting their hair for woman ...,filmmaker sahar zand vividly explores the ongo...,1100,393882
3,Shannon Odell,are solar panels worth it,today in many countries solar is the cheapest ...,3700,126251
4,Angus Hervey,why are we so bad at reporting good news,why is good news so rare in a special broadcas...,1200,415329


In [23]:
# Clean the free-text columns (talk title and description) in a single pass.
#
# Each value is stripped, split on whitespace, and a token is dropped when it
#   * is an English stop word (NLTK list),
#   * consists only of punctuation characters, or
#   * consists only of ASCII digits.
#
# The previous version tested `word not in string.punctuation` and
# `word not in string.digits`, which are *substring* checks against one long
# string: "12" was removed but "21", "2023" or "--" were kept. Membership is
# now tested per character against sets, so every punctuation-only or
# digit-only token is removed consistently.
stop_words = set(stopwords.words('english'))
punctuation_chars = set(string.punctuation)
digit_chars = set(string.digits)


def _clean_text(text):
    """Return `text` without stop words, punctuation-only and digit-only tokens.

    Parameters
    ----------
    text : str or any
        Raw cell value. Non-string values (e.g. NaN for a missing cell) are
        treated as empty text instead of raising AttributeError.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    if not isinstance(text, str):
        return ''
    kept = []
    for word in text.strip().split():
        if word in stop_words:
            continue
        if all(ch in punctuation_chars for ch in word):
            continue
        if all(ch in digit_chars for ch in word):
            continue
        kept.append(word)
    return ' '.join(kept)


# The cleaning is idempotent, so re-running this cell leaves the columns unchanged.
df['talk'] = df['talk'].apply(_clean_text)
df['description'] = df['description'].apply(_clean_text)

In [24]:
# Preview the table again after the talk/description text cleaning.
df.head()

Unnamed: 0,author,talk,description,likes,views
0,Conor Russomanno,powerful new neurotech tool augmenting mind,astonishing talk tech demo neurotechnologist c...,4700,157930
1,Peter Singer,modern argument rights animals,prioritize human rights species philosopher pe...,7600,254482
2,Sahar Zand,iranians cutting hair woman life freedom,filmmaker sahar zand vividly explores ongoing ...,1100,393882
3,Shannon Odell,solar panels worth,today many countries solar cheapest form energ...,3700,126251
4,Angus Hervey,bad reporting good news,good news rare special broadcast ted stage jou...,1200,415329


In [25]:
# Build the `metadata` text column: the author name, a single space, then the
# cleaned talk title.
df['metadata'] = df['author'].str.cat(df['talk'], sep=' ')

In [26]:
# Preview the table with the new `metadata` column.
df.head()

Unnamed: 0,author,talk,description,likes,views,metadata
0,Conor Russomanno,powerful new neurotech tool augmenting mind,astonishing talk tech demo neurotechnologist c...,4700,157930,Conor Russomanno powerful new neurotech tool a...
1,Peter Singer,modern argument rights animals,prioritize human rights species philosopher pe...,7600,254482,Peter Singer modern argument rights animals
2,Sahar Zand,iranians cutting hair woman life freedom,filmmaker sahar zand vividly explores ongoing ...,1100,393882,Sahar Zand iranians cutting hair woman life fr...
3,Shannon Odell,solar panels worth,today many countries solar cheapest form energ...,3700,126251,Shannon Odell solar panels worth
4,Angus Hervey,bad reporting good news,good news rare special broadcast ted stage jou...,1200,415329,Angus Hervey bad reporting good news


In [27]:
# Bag-of-words counts over the metadata text (scikit-learn's built-in English
# stop-word list is applied by the vectorizer), followed by the pairwise
# cosine similarity between every pair of talks.
# Missing metadata is replaced by an empty document so a NaN row cannot make
# fit_transform fail.
count_vec = CountVectorizer(stop_words='english')
count_vec_matrix = count_vec.fit_transform(df['metadata'].fillna(''))
cosine_sim_matrix = cosine_similarity(count_vec_matrix, count_vec_matrix)

# Lookup from cleaned talk title -> row position in `df` / `cosine_sim_matrix`.
# Positions (0..n-1) are stored rather than `df.index` labels, because the
# similarity matrix and `.iloc` are both addressed by position; the two only
# coincide when the index happens to be a clean 0..n-1 range.
mapping = pd.Series(np.arange(len(df)), index=df['talk'])
# Cleaned titles can collide; keep the first occurrence so that a title lookup
# always yields a single integer instead of a Series.
mapping = mapping[~mapping.index.duplicated(keep='first')]


In [28]:
# Display the talk-title -> index lookup Series.
mapping

talk
powerful new neurotech tool augmenting mind       0
modern argument rights animals                    1
iranians cutting hair woman life freedom          2
solar panels worth                                3
bad reporting good news                           4
                                               ... 
best stats youve ever seen                     5793
schools kill creativity                        5794
greening ghetto                                5795
simplicity sells                               5796
averting climate crisis                        5797
Length: 5798, dtype: int64

In [32]:
# Recommender function: suggest talks whose metadata is most similar to a given talk.
def reccomend_talks_based_on_metadata(talk_input, top_n=4):
    """Return the talks most similar to `talk_input` by metadata cosine similarity.

    Relies on the notebook-level objects `mapping` (title -> row position),
    `cosine_sim_matrix` (pairwise similarities) and `df`.

    Parameters
    ----------
    talk_input : str
        Talk title exactly as stored in `df['talk']`, i.e. in its cleaned form
        (for example 'solar panels worth').
    top_n : int, default 4
        Maximum number of recommendations to return. The default matches the
        number of results the function has always returned.

    Returns
    -------
    pandas.Series
        Titles of the recommended talks, most similar first. The queried talk
        itself is never included.

    Raises
    ------
    KeyError
        If `talk_input` is not a key of `mapping`.
    """
    try:
        talk_index = mapping[talk_input]
    except KeyError:
        raise KeyError(
            f"talk {talk_input!r} not found; pass the cleaned title as stored in df['talk']"
        ) from None

    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence so a single row is selected.
    if isinstance(talk_index, pd.Series):
        talk_index = talk_index.iloc[0]
    talk_index = int(talk_index)

    # Similarity of the requested talk to every talk (itself included).
    scores = np.asarray(cosine_sim_matrix[talk_index]).ravel()

    # Highest score first; the stable sort keeps row order among ties, which is
    # the same ordering the previous sorted(..., reverse=True) produced.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the query by index instead of assuming it sorts first: another
    # talk with identical metadata (or an all-zero vector for the query) can
    # tie with or outrank it, which used to return the query itself.
    ranked = ranked[ranked != talk_index]

    talk_indices = ranked[:max(int(top_n), 0)]
    return df['talk'].iloc[talk_indices]

In [36]:
# Example query. The title is given in its cleaned form (stop words already
# removed), because `mapping` is keyed by the cleaned `df['talk']` values.
reccomend_talks_based_on_metadata('solar panels worth')

155           actually adult
177    puberty changes brain
189     science falling love
252       whats smartest age
Name: talk, dtype: object