In [83]:
import pandas as pd
import numpy as np
import torch
from torchtext.vocab import GloVe
from collections import defaultdict
from scipy.spatial.distance import cosine


In [35]:
#read in the output from classifier and rename columns for clarity
probabilities = pd.read_csv('classifier/classifier_outputs.csv', index_col=0).rename(columns = {'drugs': 'drugs_uses',
                                                                                                'drugs.1': 'drugs'})

In [84]:
# Let's have a look
probabilities.columns

Index(['index', 'age', 'status', 'sex', 'orientation', 'body_type', 'diet',
       'drinks', 'drugs_uses', 'education', 'ethnicity', 'height', 'income',
       'job', 'location', 'offspring', 'pets', 'religion', 'sign', 'smokes',
       'speaks', 'essay0', 'travel', 'drama', 'kids', 'TV', 'music',
       'comedies', 'drinking', 'movies', 'books', 'food', 'drugs', 'sequence',
       'sentiment_label', 'sentiment_score'],
      dtype='object')

In [85]:
# Read in glove embeddings. The reason we're using glove embeddings is because not all interests are equally different.
# For example, drinking and food are more similar than drama and food, so we want to account for that semantic similarity.
glove = GloVe(name = "6B", dim = 50)

In [86]:
names = list(probabilities.columns[22:33])

In [87]:
# Create vocab for each possible interest
vocab = defaultdict(lambda: glove['unknown'])  # Default to 'unknown' vector for missing words
for name in names:
    words = name.split()
    vectors = [glove[word.lower()] for word in words if word.lower() in glove.stoi]
    
    # Average the vectors for words in the name if it's a compound, else just use the vector
    if vectors:
        vocab[name] = torch.mean(torch.stack(vectors), dim=0)
    else:
        vocab[name] = glove['unknown']  # Use 'unknown' vector if none of the words in the name are in GloVe

In [89]:
# Function for weighting the embedding vectors based on the input's probability of falling into that interest
def get_weighted_vectors(row, vocab=vocab):
    weighted_vectors = []
    for col in names:
        if col in vocab:
            weighted_vector = row[col] * vocab[col]
            weighted_vectors.append(weighted_vector)
    
    return sum(weighted_vectors)

In [93]:
# Function for returning a list of cosine similarities between user interests and all possible matches' interests
def compute_cosine_similarity(target_vector, vectors):
    similarities = []
    for vector in vectors:
        similarity = 1 - cosine(target_vector, vector)  # 1 - cosine distance to get cosine similarity
        similarities.append(similarity)
    return similarities
    

In [100]:
# Putting it all together!
def rank_matches(input_probabilities, pref_gender=False, pref_age_lower=False, pref_age_higher=False):
    # First, let's filter the dataframe so we're only dealing with possible matches. For now, we're only looking at
    # the user's gender preference and age preference, but that can be expanded.
    df_possible = probabilities.copy()
    if pref_gender:
        df_possible = df_possible.loc[df_possible.loc[:,'sex'] == pref_gender, :]
    if pref_age_higher:
        df_possible = df_possible[df_possible.loc[:, "age"] <= pref_age_higher]
    if pref_age_lower:
        df_possible = df_possible[df_possible.loc[:, "age"] >= pref_age_lower]

    
    # Create weighted vectors for all observations in the dataset
    representation_vectors = df_possible.apply(lambda row: get_weighted_vectors(row), axis = 1)
    weighted_user = get_weighted_vectors(input_probabilities)

    # Compute the cosine similarity between the user's weighted embedding vector and all possible matches
    cosine_similarities = compute_cosine_similarity(weighted_user, representation_vectors)

    # Recover index to match back to original dataframe
    similarity_scores = [(index, score) for index, score in enumerate(cosine_similarities)]
    # Sort by similarity
    ranked_similarity = sorted(similarity_scores, key = lambda x: x[1], reverse = True)

    return ranked_similarity


In [101]:
probabilities

Unnamed: 0,index,age,status,sex,orientation,body_type,diet,drinks,drugs_uses,education,...,music,comedies,drinking,movies,books,food,drugs,sequence,sentiment_label,sentiment_score
0,5868,26,single,f,straight,,mostly anything,socially,never,,...,0.077048,0.073351,0.066264,0.046497,0.046491,0.0464,0.020955,"i'm just a sweet, caring girl looking for what...",POSITIVE,0.997532
1,628,30,single,f,straight,curvy,,socially,,working on masters program,...,0.069811,0.003712,0.004391,0.046642,0.027142,0.030537,0.002358,my attempt at nutshell-ing myself: i'm califo...,POSITIVE,0.998791
2,49231,25,single,f,straight,curvy,mostly anything,socially,,working on college/university,...,0.057827,0.037907,0.028851,0.026004,0.034496,0.017207,0.012556,"my name is katie - i grew up in loomis, ca - l...",POSITIVE,0.999646
3,44964,56,single,f,straight,average,anything,socially,never,,...,0.027218,0.015264,0.010547,0.21872,0.008975,0.007405,0.006818,"i'm independent, confident and self-sufficient...",POSITIVE,0.999783
4,41515,25,single,f,straight,athletic,mostly other,socially,never,working on college/university,...,0.252955,0.000711,0.001258,0.143121,0.119392,0.004099,0.000282,"art, family, music, honest, friends, hate dram...",POSITIVE,0.99957
5,39287,27,single,m,straight,athletic,,socially,never,graduated from college/university,...,0.025607,0.013645,0.009025,0.013625,0.740993,0.012322,0.011627,"i'm a scientist, a financier, and an athlete. ...",POSITIVE,0.999583
6,41044,30,single,f,gay,curvy,,socially,sometimes,graduated from college/university,...,0.025175,0.031794,0.03354,0.012664,0.011132,0.657713,0.01532,"i like this quote from devilicia:""she is the c...",POSITIVE,0.996552
7,37621,62,single,f,straight,fit,,socially,never,college/university,...,0.033626,0.037356,0.027265,0.031049,0.028208,0.524672,0.019277,"i have been distracted by the economy, recentl...",NEGATIVE,0.923569
8,14971,40,single,m,straight,average,,socially,never,,...,0.081742,0.06615,0.046778,0.071214,0.080555,0.051302,0.028376,"i try not to sweat the small stuff, life is to...",POSITIVE,0.99843
9,30813,28,single,m,gay,average,mostly anything,socially,never,graduated from college/university,...,0.071772,0.096163,0.044873,0.050686,0.061501,0.035486,0.030176,i don't think i can summarize myself. there ar...,NEGATIVE,0.893676


In [102]:
# Test it out!

rank_matches(probabilities.loc[8,:], pref_gender = "f")

[(0, 0.9957047022899698),
 (4, 0.9528308092531714),
 (3, 0.9100180614878163),
 (2, 0.8981551340086066),
 (6, 0.856761621389742),
 (1, 0.8331191346904884),
 (5, 0.7973132590420293)]