In [None]:
#!pip install sentence-transformers

In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import os
import sys
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
data = '/Users/patrickokwir/Desktop/Git_Projects/Ted-Talks-Recommender-System/Data_output/ted_talk_clean_merged_bert.csv'
df = pd.read_csv(data, index_col=0)

In [3]:
df = df.head(300)

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def process_data(df, batch_size=20):
    sum_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    summarizer = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')

    num_records = len(df)
    num_batches = (num_records + batch_size - 1) // batch_size

    embeddings_list = []
    with tqdm(total=num_batches, desc="Processing", colour='green') as pbar:
        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = min((batch_idx + 1) * batch_size, num_records)
            batch_df = df.iloc[batch_start:batch_end]

            tokens = {'input_ids': [], 'attention_mask': []}
            for _, row in batch_df.iterrows():

                input_transcript = sum_tokenizer(row['transcript'], max_length=1024, return_tensors="pt")
                summary_ids = summarizer.generate(input_transcript["input_ids"], num_beams=2, min_length=10, max_length=300)
                summary = sum_tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
                new_tokens = tokenizer.encode_plus(summary,
                    max_length=512,
                    truncation=True,
                    padding='max_length',
                    return_tensors='pt'
                )
                tokens['input_ids'].append(new_tokens['input_ids'][0])
                tokens['attention_mask'].append(new_tokens['attention_mask'][0])
            tokens['input_ids'] = torch.stack(tokens['input_ids'])
            tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

            with torch.no_grad():
                outputs = model(**tokens)
                embeddings = outputs.last_hidden_state

                mask = tokens['attention_mask'].unsqueeze(-1).expand(embeddings.size()).float()
                mask_embeddings = embeddings * mask
                summed = torch.sum(mask_embeddings, dim=1)
                counts = torch.clamp(mask.sum(dim=1), min=1e-9)
                mean_pooled = summed / counts

            embeddings_list.append(mean_pooled.detach().cpu().numpy())
            pbar.update(1)

    embeddings = np.concatenate(embeddings_list, axis=0)
    cosine_sim = cosine_similarity(embeddings, embeddings)
    return cosine_sim

In [None]:
#instantiate pipeline 
def process_data(df):
    #instantiate a pipeline step
    
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')

    #instantiate a pipeline step
    model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')

    #instantiate a pipeline step
    tokens = {'input_ids': [], 'attention_mask': []}
    for row in df_rows:
        new_tokens = tokenizer.encode_plus(df['transcript'][row],
                                    max_length=512,
                                    truncation=True,
                                    padding='max_length',
                                    return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

    outputs = model(**tokens)
    outputs.keys()

    embeddings = outputs.last_hidden_state
    embeddings.shape

    #mean pooling
    mask = tokens['attention_mask'].unsqueeze(-1).expand(embeddings.size()).float()

    #mask embeddings
    mask_embeddings = embeddings * mask

    #summed embeddings
    summed = torch.sum(mask_embeddings, dim=1)

    #counts
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)

    #mean pooled 
    mean_pooled = summed/counts

    mean_pooled = mean_pooled.detach().numpy()

    # calculate cosine similarity for all rows using mean_pooled and cosine similarity
    cosine_sim = cosine_similarity(mean_pooled, mean_pooled)
    return cosine_sim

In [5]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
transformer = FunctionTransformer(process_data, validate=False)
pipe = Pipeline([('transformer', transformer)])

In [6]:
cosine_sim = pipe.fit_transform(df)

Processing:   0%|[32m          [0m| 0/15 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Processing: 100%|[32m██████████[0m| 15/15 [31:39<00:00, 126.65s/it]


In [7]:
import pickle

# save the model to disk
pickle.dump(cosine_sim, open("test.pkl", "wb"))

In [8]:
# open saved model
cosine_sim = pickle.load(open("test.pkl", "rb"))

In [9]:
# second, create a reverse map of indices and descriptions
indices = pd.Series(df.index, index=df['transcript'])

In [None]:
# # now we create a function recomender that will recommend simmilar products, function must take item_id and count as parameters

# # talk_to_search = 'are solar panels worth it'
# # top_n_results = 3

# def recomender(talk_to_search, top_n_results):
   
#     count = top_n_results

#     id = df.loc[df['title'] == talk_to_search].index.values[0]
#        # Get the index of the item that matches the title
#     idx = indices[id]

#     # Get the pairwsie similarity scores of all movies with that talk
#     sim_scores = list(enumerate(cosine_sim[idx]))

#     # Sort the movies based on the similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=False)

#     # Get the scores of the 10 most similar talks
#     sim_scores = sim_scores[1:count+1]

#     # Get the talk indices
#     item_indicies = [i[0] for i in sim_scores]

#     # Return the top 10 most similar talks
#     top_talks_idx = df['transcript'].iloc[item_indicies].index[:count]
#     # get author, talk using top_talks_idx
#     top_talks_author = df['author'].iloc[item_indicies].values[:count]
#     top_talks_talk = df['title'].iloc[item_indicies].values[:count]
#     # get similarity scores using top_talks_idx
#     top_n_results_sim_scores = [list(enumerate(cosine_sim[i]))[1][1] for i in top_talks_idx]

   

#     # create a result df 
#     result_df = pd.DataFrame({'author': top_talks_author, 'title': top_talks_talk, 'sim score': top_n_results_sim_scores})
#     result_df = result_df.sort_values(by='sim score', ascending=False)

#     # rename columns
#     result_df.columns = ['author', 'title', 'sim score']

#     return result_df



#    # use indices to get 'author' and 'talk' columns from the dataframe

In [10]:
def recommender(talk_to_search, top_n_results):
    count = top_n_results

    talk_indices = df[df['title'] == talk_to_search].index.values
    if len(talk_indices) == 0:
        print("Talk not found in DataFrame")
        return None

    id = talk_indices[0]
    if id >= len(indices):
        print("Invalid talk index")
        return None

    idx = indices[id]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=False)
    sim_scores = sim_scores[1:count+1]
    item_indices = [i[0] for i in sim_scores]

    # Filter out invalid indices
    item_indices = [i for i in item_indices if i < len(df)]

    top_talks_author = df['author'].iloc[item_indices].values[:count]
    top_talks_talk = df['title'].iloc[item_indices].values[:count]
    top_n_results_sim_scores = [list(enumerate(cosine_sim[i]))[1][1] for i in item_indices]

    result_df = pd.DataFrame({'author': top_talks_author, 'title': top_talks_talk, 'sim score': top_n_results_sim_scores})
    result_df = result_df.sort_values(by='sim score', ascending=False)
    result_df.columns = ['author', 'title', 'sim score']

    return result_df


In [13]:
# get title of talk at index 25
df[df['author'] == 'Angus Hervey']['title'].iloc[0]

'why are we so bad at reporting good news'

In [18]:
recommender('why are we so bad at reporting good news', 5)

Unnamed: 0,author,title,sim score
4,Iseult Gillespie,the myth of zeus test,0.209797
3,Will Guidara,the secret ingredients of great hospitality,0.119979
2,Jerome Hunter,3 skills every middle school boy needs,0.108223
0,Ryan Heffington,how dance can unleash your inner joy,0.0426
1,Shannon Odell,how friendship affects your brain,-0.011371
