In [2]:
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Category labels that get embedded and compared with each other below.
categories = [
    "Computer Science", "Electrical Engineering", "Mechanical Engineering",
    "Civil Engineering", "Biomedical Engineering", "Environmental Science",
    "Mathematics", "Chemistry", "Physics", "Aerospace Engineering",
    "Data Science", "Information Technology", "Robotics", "Geology",
    "Biochemistry", "Education", "Environment", "Health and Wellness",
    "Social Justice", "Poverty Alleviation", "Community Development",
    "Humanitarian Aid", "Arts and Culture", "Elderly Care", "Animal Welfare",
    "LGBTQ+ Support", "Food Security", "Youth Empowerment",
    "Clean Water and Sanitation", "Disabilities Support"
]

# Load the pre-trained sentence-embedding model and its tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # inference only: disable dropout

# Tokenize all categories as one padded batch
encoded_categories = tokenizer(categories, padding=True, truncation=True, return_tensors='pt')

# Run the model without building an autograd graph (we only need the outputs)
with torch.no_grad():
    output = model(**encoded_categories)

# Mean-pool the token embeddings, weighting by the attention mask so padding
# tokens do not contribute. This is the pooling the model card prescribes for
# this checkpoint; taking only the first ([CLS]) token is not what it was
# trained to produce sentence embeddings with.
token_embeddings = output.last_hidden_state
attention_mask = encoded_categories["attention_mask"].unsqueeze(-1).type_as(token_embeddings)
summed_embeddings = (token_embeddings * attention_mask).sum(dim=1)
token_counts = attention_mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
category_embeddings = summed_embeddings / token_counts

# Pairwise cosine similarity between all categories: row i / column j compares
# categories[i] with categories[j]
cosine_similarities = cosine_similarity(category_embeddings.numpy())


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# Save the top 5 most similar categories for each category (excluding the
# category itself) to json
import json

TOP_K = 5

similar_categories = {}

for i, category in enumerate(categories):

    # Indices ordered from most to least similar
    ranked = cosine_similarities[i].argsort()[::-1]

    # Drop the category itself by index rather than assuming it is always the
    # single highest-scoring entry (ties or rounding could put it elsewhere)
    neighbours = [j for j in ranked if j != i][:TOP_K]

    similar_categories[category] = [categories[j] for j in neighbours]

# Save to json
with open('similar_categories.json', 'w') as fp:
    json.dump(similar_categories, fp)







In [1]:
%pip install faker

Collecting faker
  Downloading Faker-19.11.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-19.11.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faker
Successfully installed faker-19.11.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Load card_data and generate 100 synthetic mentors, each with a random name,
# two random categories, and three random skills and interests per category
import faker
import random
import json

# Fixed seed so re-running the cell regenerates the same mentors
MENTOR_SEED = 42

with open('card_data.json') as json_file:

    card_data = json.load(json_file)

# One generator for the whole loop instead of a new Faker() per mentor
fake = faker.Faker()
fake.seed_instance(MENTOR_SEED)

# Private RNG so seeding here does not disturb the global `random` state
rng = random.Random(MENTOR_SEED)

random_mentors = {}

for _ in range(100):

    # `unique` guarantees distinct names; a repeated name would silently
    # overwrite an earlier mentor because the dict is keyed by name
    name = fake.unique.name()

    # Pick two categories. Named `mentor_categories` so the notebook-level
    # `categories` list used by the embedding cells is not overwritten.
    mentor_categories = rng.sample(list(card_data.keys()), 2)

    # Pick three skills and three interests from each chosen category
    skills = []
    interests = []

    for category in mentor_categories:
        skills += rng.sample(card_data[category]["skills"], 3)
        interests += rng.sample(card_data[category]["interests"], 3)

    # Add to random_mentors
    random_mentors[name] = {
        "categories": mentor_categories,
        "skills": skills,
        "interests": interests
    }

# Save to json

with open('random_mentors.json', 'w') as fp:

    json.dump(random_mentors, fp)




In [35]:
# Inputs for mentor matching: the generated mentors read back from disk, and
# an example mentee described as a flat list of categories/skills/interests.

with open('random_mentors.json') as mentors_file:
    random_mentors = json.load(mentors_file)

# Order and repeated entries are kept exactly as provided.
sample_mentee = [
    "Computer Science", "Artificial Intelligence", "Cybersecurity", "Machine Learning",
    "Community Development", "Housing Development", "Infrastructure Improvement", "Affordable Housing",
    "Data Science", "Data Visualization", "Machine Learning", "Big Data Technologies",
    "Arts and Culture", "Art Therapy",
]



In [36]:


# Read the card data again and flatten it into one list: for every category,
# the category name followed by its skills and then its interests.
with open('card_data.json') as card_file:
    card_data = json.load(card_file)

everything = []

for category, details in card_data.items():
    everything.append(category)
    everything.extend(details["skills"])
    everything.extend(details["interests"])







In [37]:
def create_vector(sample, vocabulary=None):
    """Encode `sample` as a 0/1 vector over `vocabulary`.

    Args:
        sample: iterable of (hashable) items; repeats are allowed and have no
            extra effect.
        vocabulary: ordered list of all known items. Defaults to the
            notebook-level `everything` list when omitted.

    Returns:
        A list of ints with the same length as `vocabulary`, holding 1 at the
        position of each item in `sample` and 0 elsewhere. If an item occurs
        more than once in `vocabulary`, its first position is used.

    Raises:
        ValueError: if an item of `sample` is not present in `vocabulary`.
    """
    if vocabulary is None:
        vocabulary = everything

    # Map each item to its first position once, instead of a linear
    # `list.index` scan per sample element.
    first_index = {}
    for position, item in enumerate(vocabulary):
        first_index.setdefault(item, position)

    sample_vector = [0] * len(vocabulary)

    for element in sample:
        try:
            sample_vector[first_index[element]] = 1
        except KeyError:
            # Same exception type as `list.index`, with a clearer message
            raise ValueError(f"{element!r} is not in the vocabulary") from None

    return sample_vector




In [41]:
# Store each mentor's 0/1 encoding of their categories, skills and interests
# under the "vector" key of their profile.
for profile in random_mentors.values():
    profile["vector"] = create_vector(
        profile["categories"] + profile["skills"] + profile["interests"]
    )

In [42]:
# Score every mentor by the cosine similarity between their vector and the
# sample mentee's vector.

# The mentee vector does not depend on the mentor, so build it once instead of
# once per loop iteration.
mentee_vector = create_vector(sample_mentee)

scores = [
    {
        "mentor": mentor,
        "score": cosine_similarity([mentee_vector], [profile["vector"]])[0][0],
    }
    for mentor, profile in random_mentors.items()
]

    

In [1]:
# Print the categories, skills and interests of the best-matching mentors.
# Requires `scores` and `random_mentors` from the cells above to have been run
# in this kernel.
TOP_N = 3

top_mentors = sorted(scores, key=lambda x: x["score"], reverse=True)[:TOP_N]

# Old name kept as an alias in case other cells still refer to it; it never
# held five entries.
top5 = top_mentors

for entry in top_mentors:
    # Separate names for the score record and the mentor profile, rather than
    # rebinding the loop variable mid-iteration
    profile = random_mentors[entry["mentor"]]
    print(profile["categories"] + profile["skills"] + profile["interests"])

NameError: name 'scores' is not defined