In [1]:
#!pip install sentence-transformers

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

In [3]:
# Location of the cleaned, merged TED-talk CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists. Consider a configurable data directory.
data = '/Users/patrickokwir/Desktop/Git_Projects/Ted-Talks-Recommender-System/Data_output/ted_talk_clean_merged_bert.csv'
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(data, index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,author,title,description,likes,views,transcript,date,tags
0,Machine Dazzle,how to unleash your inner maximalist through c...,tapping into the transformational power of cos...,8100,270192,"Hello, I am Machine Dazzle, and I am an emotio...",Jun 2023,"art, creativity, design, fashion, performance"
1,Jioji Ravulo,a liberating vision of identity that transcend...,how can we move past societys inclination to b...,9200,309952,Can you paint with all the colors of the wind?...,Jun 2023,"diversity, identity, inclusion, indigenous_peo..."
2,Rebecca Darwent,how to fund real change in your community,is there a way to give back that benefits ever...,1000,341218,I spent my whole career in the nonprofit secto...,Jun 2023,"business, community, equality, humanity, money..."
3,Susanne Buckley-Zistel,what caused the rwandan genocide,for one hundred days in 1994 the african count...,3700,126376,"For 100 days in 1994, the African country of R...",Jun 2023,"africa, animation, education, history, identit..."
4,Conor Russomanno,a powerful new neurotech tool for augmenting y...,in an astonishing talk and tech demo neurotech...,1100,374259,I became obsessed with the relationship betwee...,Jun 2023,"biotech, brain, disability, health, invention,..."


In [4]:
# Index labels of the first 500 rows; process_data embeds only these talks.
df_rows = df.index[:500].tolist()

In [5]:
# Transform function that the pipeline below wraps as its single step
def _mean_pool(embeddings, attention_mask):
    """Average token embeddings over the sequence axis, ignoring padded positions."""
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = torch.sum(embeddings * mask, dim=1)
    # Clamp so a row with an all-zero mask cannot divide by zero.
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return summed / counts


def process_data(df, rows=None, batch_size=32):
    """Embed talk transcripts and return their pairwise cosine similarity.

    Each transcript is tokenized (truncated/padded to 512 tokens), passed
    through 'sentence-transformers/paraphrase-MiniLM-L6-v2', and mean-pooled
    over its non-padding tokens. The pooled vectors are then compared with
    cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'transcript' column.
    rows : sequence of index labels, optional
        Labels of the rows to embed, in output order. Defaults to the
        notebook-level ``df_rows`` list, which is what this function has
        always iterated over.
    batch_size : int, default 32
        Number of transcripts sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row/column per entry of ``rows``; element
        [i, j] is the cosine similarity between rows[i] and rows[j].
        An empty (0, 0) array is returned when there are no rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if rows is None:
        rows = df_rows
    rows = list(rows)
    if not rows:
        return np.empty((0, 0))

    model_name = 'sentence-transformers/paraphrase-MiniLM-L6-v2'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    transcripts = [df['transcript'][row] for row in rows]

    # Inference only: without no_grad, autograd keeps every layer's
    # activations alive, and a single pass over all rows at 512 tokens each
    # needs far more memory than processing a few rows at a time.
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(transcripts), batch_size):
            tokens = tokenizer(transcripts[start:start + batch_size],
                               max_length=512,
                               truncation=True,
                               padding='max_length',
                               return_tensors='pt')
            outputs = model(input_ids=tokens['input_ids'],
                            attention_mask=tokens['attention_mask'])
            pooled_batches.append(
                _mean_pool(outputs.last_hidden_state, tokens['attention_mask'])
            )

    mean_pooled = torch.cat(pooled_batches).numpy()

    # Pairwise cosine similarity between every pair of pooled embeddings.
    cosine_sim = cosine_similarity(mean_pooled, mean_pooled)
    return cosine_sim

In [6]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
# Wrap process_data as a scikit-learn transformer (input validation disabled)
# and make it the only step of the pipeline.
transformer = FunctionTransformer(func=process_data, validate=False)
pipe = Pipeline(steps=[('transformer', transformer)])

In [7]:
# Run the pipeline on df; its only step wraps process_data, so the result is
# that function's cosine-similarity matrix.
cosine_sim = pipe.fit_transform(df)

: 

: 

In [None]:
import pickle

# Save the cosine-similarity matrix to disk. The context manager ensures the
# file is flushed and closed even if pickling fails.
with open("test.pkl", "wb") as pkl_file:
    pickle.dump(cosine_sim, pkl_file)

In [None]:
# Reload the saved cosine-similarity matrix; the context manager closes the
# file handle once it has been read.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load pickles that this notebook produced.
with open("test.pkl", "rb") as pkl_file:
    cosine_sim = pickle.load(pkl_file)

In [None]:
# Build a lookup Series keyed by transcript text whose values are the
# corresponding DataFrame index labels.
indices = pd.Series(df.index, index=df['transcript'])

In [None]:
# # Earlier draft: a function `recomender` that recommends similar talks; it takes the talk title to search for and the number of results as parameters

# # talk_to_search = 'are solar panels worth it'
# # top_n_results = 3

# def recomender(talk_to_search, top_n_results):
   
#     count = top_n_results

#     id = df.loc[df['title'] == talk_to_search].index.values[0]
#        # Get the index of the item that matches the title
#     idx = indices[id]

#     # Get the pairwsie similarity scores of all movies with that talk
#     sim_scores = list(enumerate(cosine_sim[idx]))

#     # Sort the movies based on the similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=False)

#     # Get the scores of the 10 most similar talks
#     sim_scores = sim_scores[1:count+1]

#     # Get the talk indices
#     item_indicies = [i[0] for i in sim_scores]

#     # Return the top 10 most similar talks
#     top_talks_idx = df['transcript'].iloc[item_indicies].index[:count]
#     # get author, talk using top_talks_idx
#     top_talks_author = df['author'].iloc[item_indicies].values[:count]
#     top_talks_talk = df['title'].iloc[item_indicies].values[:count]
#     # get similarity scores using top_talks_idx
#     top_n_results_sim_scores = [list(enumerate(cosine_sim[i]))[1][1] for i in top_talks_idx]

   

#     # create a result df 
#     result_df = pd.DataFrame({'author': top_talks_author, 'title': top_talks_talk, 'sim score': top_n_results_sim_scores})
#     result_df = result_df.sort_values(by='sim score', ascending=False)

#     # rename columns
#     result_df.columns = ['author', 'title', 'sim score']

#     return result_df



#    # use indices to get 'author' and 'talk' columns from the dataframe

In [None]:
def recommender(talk_to_search, top_n_results):
    """Return the talks most similar to the talk with the given title.

    Relies on the notebook-level ``df`` (columns 'title' and 'author') and
    ``cosine_sim``, where row/column i of ``cosine_sim`` corresponds to the
    i-th row of ``df`` (the matrix may cover only the leading rows of ``df``).

    Parameters
    ----------
    talk_to_search : str
        Exact title of the talk to look up; the first match is used.
    top_n_results : int
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'author', 'title' and 'sim score', ordered from most to least
        similar and excluding the searched talk itself. ``None`` (after
        printing a message) when the title is not found or the talk has no
        row in ``cosine_sim``.
    """
    # Positional row numbers of the matching titles; cosine_sim is indexed by
    # position, not by index label.
    matches = np.flatnonzero((df['title'] == talk_to_search).to_numpy())
    if len(matches) == 0:
        print("Talk not found in DataFrame")
        return None

    idx = int(matches[0])
    if idx >= len(cosine_sim):
        # The talk exists in df but was not part of the embedded subset.
        print("Invalid talk index")
        return None

    scores = np.asarray(cosine_sim[idx])

    # Highest similarity first. Drop the query talk explicitly rather than
    # assuming it sorts first, and ignore positions with no row in df.
    ranked = np.argsort(-scores, kind='stable')
    item_indices = [int(i) for i in ranked if i != idx and i < len(df)]
    item_indices = item_indices[:max(top_n_results, 0)]

    result_df = pd.DataFrame({
        'author': df['author'].iloc[item_indices].values,
        'title': df['title'].iloc[item_indices].values,
        # Similarity of each recommended talk to the searched talk.
        'sim score': scores[item_indices],
    })

    return result_df


In [None]:
# Title of the first talk whose author is 'Conor Russomanno'.
df.loc[df['author'] == 'Conor Russomanno', 'title'].iloc[0]

In [None]:
# Ask for five recommendations for the talk with this exact title.
recommender(
    talk_to_search='a powerful new neurotech tool for augmenting your mind',
    top_n_results=5,
)