In [1]:
import pandas as pd
import numpy as np
import torch
from torchtext.vocab import GloVe
from collections import defaultdict
from scipy.spatial.distance import cosine


In [3]:
#read in the output from classifier and rename columns for clarity
probabilities = pd.read_csv('classifier/classifier_outputs.csv', index_col=0).rename(columns = {'drugs': 'drugs_uses',
                                                                                                'drugs.1': 'drugs'})

In [4]:
# Let's have a look
probabilities.columns

Index(['index', 'age', 'status', 'sex', 'orientation', 'body_type', 'diet',
       'drinks', 'drugs_uses', 'education', 'ethnicity', 'height', 'income',
       'job', 'location', 'offspring', 'pets', 'religion', 'sign', 'smokes',
       'speaks', 'essay0', 'food', 'kids', 'travel', 'drama', 'music', 'TV',
       'comedies', 'movies', 'drinking', 'books', 'drugs', 'sentiment_label',
       'sentiment_score'],
      dtype='object')

In [5]:
# Read in glove embeddings. The reason we're using glove embeddings is because not all interests are equally different.
# For example, drinking and food are more similar than drama and food, so we want to account for that semantic similarity.
glove = GloVe(name = "6B", dim = 50)

In [6]:
names = list(probabilities.columns[22:33])

In [7]:
# Create vocab for each possible interest
vocab = defaultdict(lambda: glove['unknown'])  # Default to 'unknown' vector for missing words
for name in names:
    words = name.split()
    vectors = [glove[word.lower()] for word in words if word.lower() in glove.stoi]
    
    # Average the vectors for words in the name if it's a compound, else just use the vector
    if vectors:
        vocab[name] = torch.mean(torch.stack(vectors), dim=0)
    else:
        vocab[name] = glove['unknown']  # Use 'unknown' vector if none of the words in the name are in GloVe

In [8]:
# Function for weighting the embedding vectors based on the input's probability of falling into that interest
def get_weighted_vectors(row, vocab=vocab):
    weighted_vectors = []
    for col in names:
        if col in vocab:
            weighted_vector = row[col] * vocab[col]
            weighted_vectors.append(weighted_vector)
    
    return sum(weighted_vectors)

In [9]:
# Function for returning a list of cosine similarities between user interests and all possible matches' interests
def compute_cosine_similarity(target_vector, vectors):
    similarities = []
    for vector in vectors:
        similarity = 1 - cosine(target_vector, vector)  # 1 - cosine distance to get cosine similarity
        similarities.append(similarity)
    return similarities
    

In [30]:
# Putting it all together!
def rank_matches(input_probabilities, pref_gender=False, pref_age_lower=False, pref_age_higher=False):
    # First, let's filter the dataframe so we're only dealing with possible matches. For now, we're only looking at
    # the user's gender preference and age preference, but that can be expanded.
    #print(input_probabilities)
    df_possible = probabilities.copy()
    if pref_gender:
        df_possible = df_possible.loc[df_possible.loc[:,'sex'] == pref_gender, :]
    if pref_age_higher:
        df_possible = df_possible[df_possible.loc[:, "age"] <= pref_age_higher]
    if pref_age_lower:
        df_possible = df_possible[df_possible.loc[:, "age"] >= pref_age_lower]

    
    # Create weighted vectors for all observations in the dataset
    representation_vectors = df_possible.apply(lambda row: get_weighted_vectors(row), axis = 1)
    weighted_user = get_weighted_vectors(input_probabilities)
    # Compute the cosine similarity between the user's weighted embedding vector and all possible matches
    cosine_similarities = compute_cosine_similarity(weighted_user, representation_vectors)
    # Recover index to match back to original dataframe
    similarity_scores = [(index, score) for index, score in enumerate(cosine_similarities)]
    # Sort by similarity
    ranked_similarity = sorted(similarity_scores, key = lambda x: x[1], reverse = True)

    return ranked_similarity

In [11]:
probabilities

Unnamed: 0,index,age,status,sex,orientation,body_type,diet,drinks,drugs_uses,education,...,drama,music,TV,comedies,movies,drinking,books,drugs,sentiment_label,sentiment_score
0,32627,25,single,m,straight,fit,anything,very often,never,,...,0.063269,0.042128,0.039525,0.037574,0.029011,0.025359,0.023555,0.017707,POSITIVE,0.94839
1,33884,34,single,m,straight,average,mostly anything,socially,sometimes,graduated from masters program,...,0.016887,0.074945,0.012192,0.050511,0.013237,0.027108,0.03273,0.005736,POSITIVE,0.997833
2,50509,26,single,m,straight,athletic,mostly other,socially,never,graduated from two-year college,...,0.076248,0.136479,0.087189,0.10301,0.04671,0.081643,0.037309,0.022735,POSITIVE,0.983977
3,48305,41,single,m,gay,average,,socially,,graduated from ph.d program,...,0.168858,0.06926,0.089926,0.074984,0.047286,0.029727,0.121727,0.036615,POSITIVE,0.992873
4,28049,29,single,m,straight,athletic,mostly anything,often,,graduated from masters program,...,,,,,,,,,POSITIVE,0.999867
5,6440,29,single,f,straight,,strictly anything,rarely,,graduated from college/university,...,0.072007,0.072581,0.163226,0.079852,0.057335,0.047112,0.111011,0.030546,NEGATIVE,0.995419
6,42908,34,single,m,straight,athletic,mostly anything,socially,sometimes,graduated from ph.d program,...,0.221974,0.0322,0.031314,0.036303,0.01947,0.018199,0.059223,0.022158,POSITIVE,0.999723
7,7571,29,single,m,straight,a little extra,anything,socially,,graduated from college/university,...,0.003763,0.155575,0.003796,0.0022,0.001876,0.001531,0.003603,0.000765,POSITIVE,0.997934
8,37848,29,single,m,straight,athletic,,socially,never,graduated from masters program,...,0.027033,0.329929,0.013222,0.069344,0.109557,0.029009,0.00938,0.007743,,
9,49365,32,single,m,straight,average,mostly anything,socially,never,graduated from masters program,...,,,,,,,,,,


In [31]:
# Test it out!

ranked = rank_matches(probabilities.loc[8,:])#, pref_gender = "f")

In [35]:
top5_indices = pd.DataFrame([tup[0] for tup in ranked[:5]])

In [38]:
top5_indices.to_csv('top5_indices.csv')

In [36]:
top5_indices

Unnamed: 0,0
0,4
1,8
2,9
3,2
4,5
