In [1]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained 'all-MiniLM-L6-v2' SentenceTransformer used to embed the essays.
model = SentenceTransformer('all-MiniLM-L6-v2')

  return self.fget.__get__(instance, owner)()


In [3]:
# Read the profiles CSV (path is relative to the notebook's working directory).
df = pd.read_csv("data/okcupid_profiles.csv")
# Uncomment the next line (and comment out the copy below) to work on a
# 1000-row random sample instead of the full data set:
#df_sample = df.sample(1000).reset_index(drop = True)
# Work on a copy so the raw frame `df` stays untouched.
df_sample = df.copy()


In [4]:
# List the available columns (demographic fields plus essay0-essay9).
df_sample.columns

Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9'],
      dtype='object')

In [5]:
# Build one text per user from essay0-essay8; essay9 ("you should message me if...") is left out.

# Demographics = every column whose name does not match "essay".
df_demographics = df_sample.drop(columns=df_sample.filter(regex="essay").columns)

# Missing essays become a single space so the join below only ever sees strings.
essays_df = df_sample[[f"essay{i}" for i in range(9)]].fillna(" ").astype(str)
essays_df["all_essays"] = essays_df.apply(lambda row: " ".join(row), axis = 1)

# Final frame: demographic columns followed by the combined essay text.
df_all = pd.concat([df_demographics, essays_df[["all_essays"]]], axis = 1)

In [30]:
# Print the first five combined bios, each followed by a "NEW BIO" separator line.
for bio in df_all["all_essays"].iloc[:5]:
    print(bio, "NEW BIO", sep="\n")

about me:  i would love to think that i was some some kind of intellectual: either the dumbest smart guy, or the smartest dumb guy. can't say i can tell the difference. i love to talk about ideas and concepts. i forge odd metaphors instead of reciting cliches. like the simularities between a friend of mine's house and an underwater salt mine. my favorite word is salt by the way (weird choice i know). to me most things in life are better as metaphors. i seek to make myself a little better everyday, in some productively lazy way. got tired of tying my shoes. considered hiring a five year old, but would probably have to tie both of our shoes... decided to only wear leather shoes dress shoes.  about you:  you love to have really serious, really deep conversations about really silly stuff. you have to be willing to snap me out of a light hearted rant with a kiss. you don't have to be funny, but you have to be able to make me laugh. you should be able to bend spoons with your mind, and telep

In [6]:
# Encode every user's combined essay text with the SentenceTransformer model loaded earlier.
# rank_matches later indexes `embedding_array` by row position, so the order here must
# stay aligned with the rows of df_all.
embedding_array = model.encode(df_all.loc[:, "all_essays"])

In [7]:
def compute_cosine_similarity(target_vector, vectors):
    '''
    Score each vector in `vectors` against `target_vector`.

    target_vector: the embedding to compare against.
    vectors: an iterable of embeddings belonging to other users.

    Returns a list of cosine similarities, one per entry of `vectors`, in the same order.
    '''
    # scipy's `cosine` is a distance, so similarity is 1 minus that distance.
    return [1 - cosine(target_vector, candidate) for candidate in vectors]

In [25]:
def rank_matches(input_row, pref_gender=False, pref_age_lower=False, pref_age_higher=False):
    """
    Rank candidate users by essay similarity to one target user.

    Inputs:
    input_row: Row of `embedding_array` to use as the target for the cosine similarity calculation (integer).
    pref_gender: If the user prefers a gender in their match, the value of the 'sex' column to keep (string).
        A falsy value (the default) disables the filter.
    pref_age_lower: The user's minimum preferred age, inclusive (integer). Falsy disables the filter.
    pref_age_higher: The user's maximum preferred age, inclusive (integer). Falsy disables the filter.

    Outputs:
    A list of tuples where the first value is the user's index label in `df_all` (usable with
    `df_all.loc[...]`) and the second value is the cosine similarity to the target user.
    The list is sorted by similarity in descending order. The target user is included in the
    list only if they pass the gender/age filters themselves.
    """
    # Boolean filtering returns new frames, so df_all itself is never modified.
    df_possible = df_all
    if pref_gender:
        df_possible = df_possible.loc[df_possible["sex"] == pref_gender]
    if pref_age_higher:
        df_possible = df_possible.loc[df_possible["age"] <= pref_age_higher]
    if pref_age_lower:
        df_possible = df_possible.loc[df_possible["age"] >= pref_age_lower]

    candidate_index = df_possible.index

    user_embedding = embedding_array[input_row]
    # Index labels are used as row positions in embedding_array; this relies on df_all
    # keeping the default 0..n-1 index it had when the embeddings were computed.
    candidate_embeddings = [embedding_array[i] for i in candidate_index]

    # Compute the cosine similarity between the user's embedding and all possible matches
    cosine_similarities = compute_cosine_similarity(user_embedding, candidate_embeddings)

    # Pair each score with the candidate's original df_all index label. Enumerating the
    # scores instead would give positions inside the *filtered* list, which point at the
    # wrong profiles whenever a gender or age filter is active.
    similarity_scores = list(zip(candidate_index, cosine_similarities))

    # Sort by similarity, most similar first
    return sorted(similarity_scores, key=lambda pair: pair[1], reverse=True)

In [26]:
# Rank all profiles against the user in row 1100, with no gender or age filters.
test = rank_matches(1100)

In [27]:
# Display the ranked (index, similarity) pairs, most similar first.
test

[(340, 1),
 (17707, 0.7433153016363178),
 (6077, 0.7307175833414558),
 (10030, 0.7244494721621603),
 (1047, 0.7076818943023682),
 (16678, 0.70697464665695),
 (961, 0.7058058977127075),
 (10342, 0.7052122178800249),
 (16870, 0.7027291655540466),
 (2832, 0.6998217701911926),
 (16741, 0.6966491522402088),
 (1545, 0.6949503836006167),
 (10091, 0.69339919090271),
 (7355, 0.6932535171508789),
 (14255, 0.6928743949978872),
 (1293, 0.6919721777422546),
 (6270, 0.6895400285720825),
 (8892, 0.6872410364603391),
 (12738, 0.6840872968461307),
 (1370, 0.6839981863499781),
 (14795, 0.6838037371635437),
 (17486, 0.6836331675271304),
 (942, 0.6806976795196533),
 (15140, 0.6793311845658315),
 (4942, 0.6777790188789368),
 (16873, 0.6765313551742658),
 (7492, 0.6738390922546387),
 (8978, 0.6728231505857032),
 (15655, 0.6705691814422607),
 (8707, 0.669306755065918),
 (10637, 0.6681664187808991),
 (9180, 0.6679528951644897),
 (7532, 0.6665170192718506),
 (12319, 0.6664845744660255),
 (1687, 0.6659700075370

In [29]:
# Read the essays of the profile at index 17707, which appears near the top of the ranking output.
df_all.loc[17707, "all_essays"]

"i moved to sf from the midwest about three years ago. i really love it here and i think i could stay here forever. i think i've learned a lot about myself and about people over the past few years. i wouldn't mind sharing these experiences with someone else. by which i mean experiences in general, not lecturing you about my own. get a grip.  i am clever, self-depricating, and a sock in the gut i'm teaching. i'm learning. i'm watching too much netflix. wasting time. making my ideas sound really, really good, even if they aren't.     a bike. fresh produce. words. a good friend. the internet. my glasses. neck bolts. telling people what's what.  "

In [23]:
# Read the essays at index 1100 (the row passed to rank_matches) for comparison with the match.
df_all.loc[1100, "all_essays"]

'thanks for taking a moment to check me out. nice guy here. own a good business. like to go to the gym. versatile in and out of the bedroom. great kisser. smart with a nice sense of humor when you get to know me. friend and family approved. looking for someone who is serious about dating and who knows the value of dating and relationships. just your nice "average joe" working hard and doing the best i can. i have a busy business and life...always have room for someone special. kissing is one thing....oh, and i am a "creative" type as well. killer legs and calves....i do alot of spin classes. want to come with? lots to fill out. my little dog. coffee in the morning. friends. some family (some i could do without...lol). my couch (i like a good power nap).   i am either out to dinner with friends or just relaxing at home.  '