In [None]:
import pandas as pd
import numpy as np
import string
import os
from pprint import pprint
import itertools
import math 

# Configuration Parameters

In [None]:
# Project root that contains the data/ folder. Leave empty to keep the
# current working directory.
HOME_DIR = ''
if HOME_DIR:
    # os.chdir('') raises FileNotFoundError, so only switch when a path is set
    os.chdir(HOME_DIR)

DATE_TAG = '12_12'     # responses for this week should be named "responses_{DATE_TAG}"
CUTOFF = '2020/12/01'  # a date after last week's responses and before this week's responses

# Load this week's data

In [None]:
# widen text columns so free-text answers stay readable when frames are displayed
pd.set_option('max_colwidth', 100)

# read this week's form export; Timestamp is parsed so it can be compared with CUTOFF
df = pd.read_csv(f'data/responses/responses_{DATE_TAG}.csv',
                 parse_dates=['Timestamp'])
# keep only the responses submitted after the cutoff date
df = df[df['Timestamp'] > CUTOFF]

In [None]:
# peek at the first five responses
df[:5]

In [None]:
# list the raw form column names
df.columns

In [None]:
# Map the long form questions to short column names used by the rest of the notebook.
# The '.1', '.2', '.3' suffixes are on the repeated "Anything else..." question;
# presumably pandas adds them to de-duplicate the header on read -- TODO confirm.
df = df.rename({
    'Timestamp': 'timestamp',
    'Username': 'email',
    'Name': 'name',
    'Pronouns': 'pronouns',
    'SCS Department': 'department',
    'What year are you in?': 'yr',
    'What times would you be able to meet? (Pittsburgh time) [Morning]': 'morning_times',
    'What times would you be able to meet? (Pittsburgh time) [Afternoon]': 'afternoon_times',
    'What times would you be able to meet? (Pittsburgh time) [Evening]': 'evening_times',
    'If none of the above times work for you, what time zone are you in? (ET, PT, etc.)': 'time_zone',
    'Where would you want to be able to meet?': 'where_to_meet',
    'How would you like to be matched?': 'group_size',
    'What kind of interaction are you after this week?': 'interaction_type',
    'Hobbies/Interests': 'hobbies',
    'Hobby Categories': 'hobby_categories',
    'Anything else you want us to know for matching purposes?': 'other_friends',
    'Research topics/interests': 'research_interests', 
    'Topics': 'research_categories',
    'Anything else you want us to know for matching purposes?.1': 'other_research',
    'Would you like to be a mentor and/or mentee?': 'mentor_vs_mentee', 
    'Background': 'background',
    'Cultural background and Identity': 'culture',
    'Anything else you want us to know for matching purposes?.2': 'other_mentor',
    'Anything else you want us to know for matching purposes?.3': 'other_random',
    'Can we include your answers to *this form* in aggregate statistics that we publish? Regardless of your answer, we will never share your individual form answers with anyone.': 'consent',
    # NOTE(review): 'conset_prev' looks like a typo for 'consent_prev'; it is not
    # read anywhere in this notebook -- confirm no other consumer before renaming.
    'Can we include your answers to *previous coffee chat forms* in aggregate statistics that we publish? Regardless of your answer, we will never share your individual form answers with anyone.': 'conset_prev',
}, axis=1)

# sanity-check the renamed columns on the first three rows
df[:3]

In [None]:
# breakdown of interaction types this week
print(f'Total responses: {len(df)}')
# count() tallies non-null values, so rows with a missing name are not counted here
df.groupby('interaction_type').count()['name']

# Load in previous matches

In [None]:
# Collect every pair of people who already shared a group in an earlier week.
# The export cell of this notebook writes up to four email columns, so all four
# are read back here (files that lack some of them are still accepted).
MATCH_EMAIL_COLS = [
    'Email 1',
    'Email 2',
    'Email 3 (if applicable)',
    'Email 4 (if applicable)',
]

prev_pairs = []

# sorted() makes the processing order (and the printed log) reproducible
for f in sorted(os.listdir('data/matches/')):
    # skip this week's own output so a re-run does not penalise the new matches
    if f.startswith('matched_pairs_') and not f.endswith(f'_{DATE_TAG}.csv'):
        previous_data_file = 'data/matches/' + f
        print(previous_data_file)
        prev_matches = pd.read_csv(previous_data_file)
        print(len(prev_matches))

        email_cols = [c for c in MATCH_EMAIL_COLS if c in prev_matches.columns]
        for m in prev_matches[email_cols].values.tolist():
            # empty slots of smaller groups are read back as NaN
            members = [email for email in m if not pd.isna(email)]
            prev_pairs.extend(itertools.combinations(members, 2))

# Helper functions

In [None]:
# pre-processing

def clean_val(val, none_val=''):
    """Replace a missing value with a placeholder.

    Args:
        val: any cell value taken from the responses frame.
        none_val: what to return when `val` is missing (default: empty string).

    Returns:
        `none_val` if `val` is None or a float NaN, otherwise `val` unchanged.
    """
    if val is None:
        return none_val
    # isinstance (not an exact type comparison) so numpy float NaNs are caught too
    if isinstance(val, (float, np.floating)) and np.isnan(val):
        return none_val
    return val


def combine_times(d):
    """Flatten the three availability lists of a person into one list of slots.

    Each entry of `d['morning_times']`, `d['afternoon_times']` and
    `d['evening_times']` is suffixed with the part of day, and the results are
    returned in morning, afternoon, evening order.
    """
    morning = ['{} Morning'.format(slot) for slot in d['morning_times']]
    afternoon = ['{} Afternoon'.format(slot) for slot in d['afternoon_times']]
    evening = ['{} Evening'.format(slot) for slot in d['evening_times']]
    return morning + afternoon + evening


# Columns that df_to_dicts splits on ';' into a list of individual choices.
list_fields = [
    'department',
    'morning_times',
    'afternoon_times',
    'evening_times',
    'where_to_meet',
    'hobby_categories',
    'research_categories',
    'culture'
]


def df_to_dicts(cohort_df, relevant_fields, custom_cleaners=None):
    """Convert the rows of a cohort frame into a list of cleaned dicts.

    Args:
        cohort_df: DataFrame of responses for one cohort.
        relevant_fields: column names to keep. Must include 'morning_times',
            'afternoon_times' and 'evening_times', which combine_times reads.
        custom_cleaners: optional mapping of field name -> callable applied to
            the cleaned value. Fields listed in `list_fields` are always split
            on ';' and never reach a custom cleaner.

    Returns:
        One dict per row with the relevant fields plus a combined 'times' list.
    """
    # None instead of a shared mutable {} default
    cleaners = custom_cleaners if custom_cleaners is not None else {}

    people = []
    for _, row in cohort_df[relevant_fields].iterrows():
        d = {}
        for k, raw in zip(relevant_fields, row.tolist()):
            val = clean_val(raw)
            if k in list_fields:
                # multi-select answers are ';'-separated; drop empty pieces
                d[k] = [v for v in val.split(';') if len(v) > 0]
            elif k in cleaners:
                d[k] = cleaners[k](val)
            else:
                d[k] = val
        d['times'] = combine_times(d)  # combine morning, afternoon, and evening
        people.append(d)
    return people

In [None]:
## scoring

def get_general_match_score(p1, p2):
    """Initial matching score.

    Based on previous pairs, timing, location, and group size.

    Args:
        p1, p2: person dicts with 'email', 'times', 'where_to_meet' and
            'group_size' keys.

    Returns:
        (score, reasons) where reasons is a list of [label, shared values].
    """
    score = 0
    reasons = []

    # decrease score if previously met (prev_pairs stores each pair in one order only)
    if ((p1['email'], p2['email']) in prev_pairs) or ((p2['email'], p1['email']) in prev_pairs):
        score -= 15

    # based on timing
    common_times = set(p1['times']).intersection(set(p2['times']))
    if len(common_times) > 0:
        score += 2
        reasons.append(['common time', common_times])

    # based on location
    # An unanswered question reaches us as an empty list (list fields are split
    # by df_to_dicts), not as None, so treat both as "either option is fine".
    any_place = ['Over Zoom', 'In-person in Pittsburgh (physically distant and outside)']
    wh1 = p1['where_to_meet'] or list(any_place)
    wh2 = p2['where_to_meet'] or list(any_place)
    common_place = set(wh1).intersection(set(wh2))
    if len(common_place) > 0:
        score += 1
        # extra credit when both people insist on the same single option
        if len(wh1) == 1 and len(wh2) == 1:
            score += 0.5
        reasons.append(['common place', common_place])
    else:
        score -= 3

    # based on group size
    if (p1['group_size'] == 'In a pair') and (p2['group_size'] == 'In a pair'):
        score += 2

    return score, reasons


def get_combo_scores(people, match_score_fn, topics_ct,
                     triple_bonus_fn=None, quad_bonus_fn=None,
                     compute_quads=False):
    """Score every possible pair, triple (and optionally quad) of people.

    Args:
        people: list of person dicts; each needs a 'group_size' key.
        match_score_fn: callable (p1, p2, topics_ct) -> (score, reasons).
        topics_ct: topic frequency mapping forwarded to `match_score_fn`.
        triple_bonus_fn: optional callable (p1, p2, p3) -> bonus added to triples.
        quad_bonus_fn: optional callable (p1, p2, p3, p4) -> bonus added to quads.
        compute_quads: also score groups of four when True.

    Returns:
        A list of [index_tuple, score] entries: all pairs, then all triples,
        then (if requested) all quads, each in lexicographic index order.
    """
    n = len(people)

    # Every pairwise score is reused by many triples/quads, so compute each once.
    base = {
        (i, j): match_score_fn(people[i], people[j], topics_ct)[0]
        for i, j in itertools.combinations(range(n), 2)
    }

    def apply_group_preference(s, members):
        # Groups larger than a pair: penalise if anyone asked for a pair,
        # reward if anyone asked for a group of 3-4.
        sizes = {people[m]['group_size'] for m in members}
        if 'In a pair' in sizes:
            s -= 6
        if 'In a group of 3-4' in sizes:
            s += 2
        return s

    pair_scores = []
    for (i, j), s in base.items():
        size_i = people[i]['group_size']
        size_j = people[j]['group_size']
        if size_i == 'In a group of 3-4' or size_j == 'In a group of 3-4':
            s -= 5
        if size_i == 'No preference' and size_j == 'No preference':
            s -= 1
        pair_scores.append([(i, j), s])

    triple_scores = []
    for i, j, k in itertools.combinations(range(n), 3):
        # 3 pair scores divided by 2.5 (not 3): slight boost for triples
        s = (base[i, j] + base[j, k] + base[i, k]) / 2.5
        if triple_bonus_fn is not None:
            s += triple_bonus_fn(people[i], people[j], people[k])
        triple_scores.append([(i, j, k), apply_group_preference(s, (i, j, k))])

    if not compute_quads:
        return pair_scores + triple_scores

    quad_scores = []
    for i, j, k, l in itertools.combinations(range(n), 4):
        # 6 pair scores divided by 5 (not 6): slight boost for quads
        s = (base[i, j] + base[i, k] + base[i, l]
             + base[j, k] + base[j, l] + base[k, l]) / 5
        if quad_bonus_fn is not None:
            s += quad_bonus_fn(people[i], people[j], people[k], people[l])
        quad_scores.append([(i, j, k, l), apply_group_preference(s, (i, j, k, l))])

    return pair_scores + triple_scores + quad_scores

# Mentorship Matching (supplemented w/ Random)

In [None]:
# everyone who asked for a PhD mentorship chat this week
mentorship = df[df['interaction_type'] == 'PhD mentorship']

pd.set_option('max_colwidth', 200)

# look at special requests
mentorship[['mentor_vs_mentee', 'department', 'yr', 'group_size', 'other_mentor', 'other_random']]

In [None]:
# check whether we have enough mentors. otherwise, extract mentors from random

# exact-match on the role string: any other answer lands in neither frame
mentees = mentorship[mentorship['mentor_vs_mentee'] == 'Mentee']
mentors = mentorship[mentorship['mentor_vs_mentee'] == 'Mentor']

print(f'{len(mentees)} mentees, {len(mentors)} mentors')

def get_department_count(mdf):
    """Count people per department.

    A person who listed several departments ('A;B') is counted once for each
    of them, and counts from different answers are added together.

    Args:
        mdf: DataFrame with 'department' (';'-separated string) and 'name' columns.

    Returns:
        dict mapping a single department name to its number of people
        (rows with a missing name are not counted).
    """
    counts = {}
    people_per_answer = mdf.groupby('department')['name'].count()
    for answer, ct in people_per_answer.items():
        for dep in answer.split(';'):
            if len(dep) == 0:
                continue
            # accumulate: the same department can appear in several distinct answers
            counts[dep] = counts.get(dep, 0) + int(ct)
    return counts

# get mapping of department to number of people
mentee_dep = get_department_count(mentees)
mentor_dep = get_department_count(mentors)

# number of additional mentors needed per department
scarcity = {}
for dep in mentee_dep:
    # pair off one mentor with one mentee until this department has no mentors left
    while mentor_dep.get(dep, 0) > 0:
        mentor_dep[dep] -= 1
        mentee_dep[dep] -= 1

    # mentees still left over are the shortfall for this department
    if mentee_dep[dep] > 0:
        scarcity[dep] = scarcity.get(dep, 0) + mentee_dep[dep]
        
print('Need more mentors in these departments:')
pprint(scarcity)

In [None]:
# people who asked for a random match; some of them can fill in as mentors
random = df[df['interaction_type'].isin(['Random', 'Random/Other'])]

# get potential mentors: past their first year ('6+' is treated as year 6) ...
pot_m = random
pot_m = pot_m[pot_m['yr'].replace({'6+': 6}).astype(int) > 1]
# ... and in at least one department that is short on mentors. The department
# answer is a ';'-separated string while scarcity is keyed by single department,
# so each part has to be checked (a plain isin() misses multi-department people).
in_scarce_dep = pot_m['department'].apply(
    lambda deps: isinstance(deps, str) and any(dep in scarcity for dep in deps.split(';'))
).astype(bool)  # keep a boolean mask even when pot_m is empty
pot_m = pot_m[in_scarce_dep]

# look at special requests
pot_m[['department', 'yr', 'group_size', 'other_mentor', 'other_random']]

In [None]:
# show (up to 500) columns instead of eliding the middle ones
pd.set_option('display.max_columns', 500)

In [None]:
# fields needed to score mentorship matches
relevant_fields = [
    'email', 'name', 
    'department', 'yr', 'background',
    'morning_times', 'afternoon_times', 'evening_times', 
    'where_to_meet', 
    'group_size',
    'mentor_vs_mentee',
    'culture',
    'other_mentor',
    'other_random',
]
# mentorship sign-ups plus the potential mentors borrowed from the random pool
mentorship_combined = pd.concat([mentorship, pot_m], axis=0)
mentorship_people = df_to_dicts(mentorship_combined, relevant_fields)  # list of dicts

# how many people mention each culture/background topic (used to spot rare topics)
# NOTE: topics_ct is a notebook-wide name that the research and friendship
# sections reassign later, so re-run this cell before re-scoring mentorship.
topics_ct = {}
for p in mentorship_people:
    for cat in p['culture'] + [p['background']]:
        if len(cat) == 0:
            continue
        topics_ct[cat] = topics_ct.get(cat, 0) + 1
    
topics_ct

In [None]:
def get_mentorship_match_score(p1, p2, topics_ct):
    """Score a pair for mentorship matching.

    Starts from the general score and adds points for shared culture/background
    topics, for a mentor+mentee combination, for a shared department and for
    the mentor being further along than the mentee.

    Args:
        p1, p2: person dicts with 'culture', 'background', 'mentor_vs_mentee',
            'department' and 'yr' keys (plus those of get_general_match_score).
        topics_ct: mapping topic -> number of people who mention it.

    Returns:
        (score, reasons) as produced by get_general_match_score, extended with
        a ['common topics', set] entry.
    """
    score, reasons = get_general_match_score(p1, p2)

    # based on shared culture/background topics
    topic1 = set(p1['culture'] + [p1['background']])
    topic2 = set(p2['culture'] + [p2['background']])
    # a missing background is stored as '' and must not count as something in common
    common_topics = {t for t in topic1.intersection(topic2) if len(t) > 0}
    reasons.append(['common topics', common_topics])
    for ci in common_topics:
        if topics_ct[ci] <= 4:  # if rare topic, increase score
            score += 4
        else:
            score += 2
    score += 2 * len(common_topics)

    # the whole point: one mentor together with one mentee
    if set([p1['mentor_vs_mentee'], p2['mentor_vs_mentee']]) == set(['Mentee', 'Mentor']):
        score += 15

    if not (p1['mentor_vs_mentee'] == 'Mentee' and p2['mentor_vs_mentee'] == 'Mentee'):
        dep1 = set(p1['department'])
        dep2 = set(p2['department'])
        if len(dep1.intersection(dep2)) > 0:
            score += 7

        mentee_p = p1 if p1['mentor_vs_mentee'] == 'Mentee' else p2
        mentor_p = p2 if p1['mentor_vs_mentee'] == 'Mentee' else p1  # mentor is one that's not mentee

        # NOTE(review): 'yr' is compared as loaded; if the column holds strings
        # such as '6+' this is a lexicographic comparison -- confirm intended.
        if mentor_p['yr'] > mentee_p['yr']:
            score += 5

    return score, reasons

In [None]:
# show the topic counts that the scoring below will use
print(topics_ct)

def mentorship_bonus(*args):
    """Bonus for the role mix of a mentorship group of any size.

    Returns 3 when the group holds at least two mentees, 4 when it additionally
    holds at least two mentors, and 0 otherwise.
    """
    roles = [member['mentor_vs_mentee'] for member in args]

    # design choice to prefer 2 mentees in same group
    if roles.count('Mentee') <= 1:
        return 0
    return 4 if roles.count('Mentor') > 1 else 3

# score every pair, triple and quad of mentorship participants
all_mentorship_scores = get_combo_scores(mentorship_people, 
                                         get_mentorship_match_score, 
                                         topics_ct, 
                                         triple_bonus_fn=mentorship_bonus,
                                         quad_bonus_fn=mentorship_bonus,
                                         compute_quads=True)
# best-scoring combos first
all_mentorship_scores = list(reversed(sorted(all_mentorship_scores, key=lambda x: x[1])))
print(len(all_mentorship_scores))
all_mentorship_scores[:10]

In [None]:
# Greedy assignment: walk the combos from best to worst score and accept a
# combo only if none of its members has been placed yet.
mentorship_matches = []
mentorship_matched_people = set([])
for idx, s in all_mentorship_scores:
    assert(len(idx) in [2, 3, 4])
    
    # skip combos that reuse someone who is already in an accepted group
    if len(mentorship_matched_people.intersection(set(list(idx)))) > 0:
        continue
        
    # build a human-readable summary of the accepted group for manual review
    st = '{}\nscore: {}\n\n'.format(idx, s)
    for (_, i) in enumerate(list(idx)):
        mentorship_matched_people.add(i)
        p = mentorship_people[i]
        st += f"""
          P{_}: {p['name']}
          role: {p['mentor_vs_mentee']}
          department, year: {p['department']}, {p['yr']}
          topic: {p['culture'] + [p['background']]}
          how: {p['group_size']}
          other: {p['other_mentor'], p['other_random']}\n\n"""            
    print(st)
    mentorship_matches.append(idx)
    # for pairs, also show the reasons behind the score
    if len(idx) == 2:
        pprint(get_mentorship_match_score(mentorship_people[idx[0]], mentorship_people[idx[1]], topics_ct)[1])
    print('------------------------')

In [None]:
# anyone the greedy pass could not place in a mentorship group
print('Remaining unmatched mentors/mentees: ', list([mentorship_people[i] for i in range(len(mentorship_people)) if i not in mentorship_matched_people]))

In [None]:
# translate each group of list indices into the members' email addresses
mentorship_group_emails = []
for grp in mentorship_matches:
    emails = list([mentorship_people[i]['email'] for i in grp])
    mentorship_group_emails.append(emails)
mentorship_group_emails

# Research

In [None]:
# everyone who asked for a research-topic chat this week
researchers = df[df['interaction_type'] == 'Research topic']
researchers[:5]

In [None]:
# fields needed to score research matches
relevant_fields = [
    'email', 'name', 
    'morning_times', 'afternoon_times', 'evening_times', 
    'where_to_meet', 
    'group_size',
    'research_interests', 'research_categories',
    'other_research'
]
research_people = df_to_dicts(researchers, relevant_fields)  # list of dicts

# how many researchers picked each research category (used to spot rare topics);
# this overwrites the topics_ct built for the mentorship section
topics_ct = {}
for p in research_people:
    for cat in p['research_categories']:
        if len(cat) == 0:
            continue
        topics_ct[cat] = topics_ct.get(cat, 0) + 1

topics_ct

In [None]:
def get_research_match_score(p1, p2, topics_ct):
    """Score a pair for research matching.

    Adds to the general score, for every research category both people picked,
    2 points plus a further 4 if the category is rare (at most 5 people in
    `topics_ct`) or a further 2 otherwise.

    Returns:
        (score, reasons) with a ['common topics', set] entry appended.
    """
    score, reasons = get_general_match_score(p1, p2)

    # based on shared research categories
    shared = set(p1['research_categories']) & set(p2['research_categories'])
    reasons.append(['common topics', shared])

    rarity_points = sum(4 if topics_ct[topic] <= 5 else 2 for topic in shared)
    score = score + rarity_points + 2 * len(shared)

    return score, reasons

In [None]:
# score every pair and triple of researchers (no quads), best first
all_researchers_scores = get_combo_scores(research_people, get_research_match_score, topics_ct)
all_researchers_scores = list(reversed(sorted(all_researchers_scores, key=lambda x: x[1])))
print(len(all_researchers_scores))
all_researchers_scores[:10]

In [None]:
# Greedy assignment: accept combos from best to worst score, skipping any
# combo that contains someone already placed.
research_matches = []
research_matched_people = set([])
for idx, s in all_researchers_scores:
    assert(len(idx) in [2, 3])
    
    if len(research_matched_people.intersection(set(list(idx)))) > 0:
        continue
        
    # human-readable summary of the accepted group for manual review
    st = '{}\nscore: {}\n\n'.format(idx, s)
    for (_, i) in enumerate(list(idx)):
        research_matched_people.add(i)
        p = research_people[i]
        st += 'P{}: {}\ntopic: {}\nhow: {}\nother: {}\n\n'.format(
            _, p['name'], p['research_interests'], p['group_size'], p['other_research'])
    print(st)
    research_matches.append(idx)
    # for pairs, also show the reasons behind the score
    if len(idx) == 2:
        pprint(get_research_match_score(research_people[idx[0]], research_people[idx[1]], topics_ct)[1])
    print('------------------------')

In [None]:
# anyone the greedy pass could not place in a research group
print('Remaining unmatched researchers: ', list([research_people[i] for i in range(len(research_people)) if i not in research_matched_people]))

In [None]:
# translate each group of list indices into the members' email addresses
research_group_emails = []
for grp in research_matches:
    emails = list([research_people[i]['email'] for i in grp])
    research_group_emails.append(emails)
research_group_emails

# Friendship Matching

In [None]:
# everyone who asked for a friendship chat this week
friends = df[df['interaction_type'] == 'Friendship outside of work']
friends[:5]

### Extract cleaner hobby tokens from freetext

In [None]:
# Filler words dropped from a hobby phrase before it is looked up.
_HOBBY_STOPWORDS = {
    'academic',
    'currently',
    'esp',
    'learning',
    'etc',
}

# Hand-curated mapping from a (lower-cased) free-text phrase to one or more
# canonical hobby tokens. Keys must be lower case: phrases are lower-cased
# before the lookup.
_HOBBY_LOOKUP = {
    'geopolitics': 'politics',
    'foodie': 'food',
    'movie buff': 'movies',
    'painting': 'art',
    'exploring cities': 'traveling',
    'exploring': 'traveling',
    'transit': 'traveling',
    'geography': 'history',
    'japanese language': 'languages',
    'gaming': 'video games',
    'playing music': 'music',
    'physical fitness': 'fitness',
    'exercise': 'fitness',
    'swimming': 'fitness',
    'super smash bros melee': 'video games',
    'chess': 'board games',
    'want to get involved in volunteering': 'volunteering',
    'wine': 'alcohol',
    'travel': 'traveling',
    'first': ['stem', 'robotics'],
    'musicviolin': 'music',
    'entrepreneurship': 'startups',
    'philosophy': 'humanities',
    'coffee': 'beverages',
    'math': 'stem',
    'roller skating': 'playing sports',
    'reptiles': 'animals',
    'visual art': 'art',
    'economics': 'humanities',
    'tech': 'stem',
    'piano': 'music',
    'card game': 'board games',
    'algorithm': 'stem',
    'learning languages': 'languages',
    'classical music': 'music',
    'reading history': 'history',
    'ice skating': 'playing sports',
    'practicing italian': 'languages',
    'watching tv': 'tv',
    'tv shows': 'tv',
    'breweries': 'alcohol',
    'wineries': 'alcohol',
    'occasional visits adventure sports and theme parks': 'playing sports',
    'watching movies': 'movies',
    'singing': 'music',
    'listening to podcasts and music': 'podcasts/audiobooks',
    'diy handyman': 'diy',
    'backpacking': 'outdoors',
    'outdoor adventures': 'outdoors',
    'guitar': 'music',
    'surfing': 'playing sports',
    'general banter': 'conversation',
    'procrastinating by reading about procrastination': 'productivity',
    'productivity hacks': 'productivity',
    'cocktail making': 'alcohol',
    'watching avatar on netflix': 'tv',
    'action-adventure': 'movies',
    'moviesmystery': 'movies',
    'lazy cooking': 'cooking',
    'violin': 'music',
    'performance modelling': 'stem',
    'cycling': 'biking',
    'resource management': 'productivity',
    'scheduling': 'productivity',
    'networks': 'stem',
    'and animal facts': 'animals',
    'memes': 'comedy',
    'musical instruments': 'music',
    'social justice': 'volunteering',
    'playing violin': 'music',
    'vegetables': 'food',
    'bread': 'food',
    'tea': 'beverages',
    'fermenting things': 'cooking',
    'policy': 'politics',
    'verification': 'stem',
    'emacs': 'stem',
    'systems': 'stem',
    'photography': 'art',
    'i play drums': 'music',
    'climbing': 'climbing',
    'drawing': 'art',
    'running': 'running',
    'jogging': 'running',
    'learning about pittsburgh': 'pittsburgh',
    'watching sports in quarantine': 'watching sports',
    'ultimate frisbee': 'playing sports',
    'want to start biking': 'biking',
    'politics': 'politics',
    'whiskey': 'alcohol',
    'adventure sports': 'playing sports',
    'stand-up': 'comedy',
    'tv': 'tv',
    'films': 'movies',
    'improv': ['improv', 'comedy'],
    'pets': 'animals',
    'video games': 'video games',
    'sports': 'watching sports',
    'gbbo': ['baking', 'cooking'],
    'npr one': 'podcasts/audiobooks',
    'audiobooks': 'podcasts/audiobooks',
    'rock climbing': ['climbing', 'playing sports'],
    'badminton': 'playing sports',  # week 8/28
    'mystery': 'mystery',
    'currently watching legend of korra on netflix': 'tv',
    'formula1': 'watching sports',
    'playing tennis': ['tennis', 'playing sports'],
    'things to do in pittsburgh': 'pittsburgh',
    'podcasts': 'podcasts/audiobooks',
    'watching musicals': ['musicals', 'music'],
    'reading books': 'reading',
    'fiction stories and things about grammar': 'reading',
    'eating ice cream': 'food',
    'writing': 'reading',
    'scrolling through twitter': 'twitter',
    'space': 'stem',
    'rocks': 'stem',
    'roofs': 'diy',
    'the great british bake off': ['baking', 'cooking'],
    'marvel': 'tv',
    'agents of shield': 'tv',
    'gym': 'fitness',
    'bouldering': 'climbing',
    'finding creative ways to exercise in quarantine': 'fitness',
    'sci-fi': ['sci-fi', 'stem'],
    'basketball': 'playing sports',
    'jazz rnb and foreign music': 'music',
    'books': 'reading',
    'learning about pittsburgh area': 'pittsburgh',
    'dinosaurs': 'animals',
    'cooking and experimenting with different cuisines': 'cooking',
    'taking walks': 'outdoors',
    'general banter and discussions': 'conversation',
    'textiles': 'diy',
    'knitting': 'diy',
    'crocheting': 'diy',
    'sewing': 'diy',
    'mechanical engineering': 'stem',
    'experimental music': 'music',
    'listening to music and podcasts': 'podcasts/audiobooks',
    'white water rafting': 'outdoors',
    'mentoring others': 'volunteering',
    'advocating for social justice': 'volunteering',
    'skiing': 'playing sports',
    'snowboarding': 'playing sports',
    'outdoor activities': 'outdoors',
    'language learning': 'languages',
    'digital art': 'art',
    'christian faith': 'christian',
    'board game': 'board games',
    'go': 'board games',
    'about pittsburgh area': 'pittsburgh',
    'watching legend of korra on netflix': 'tv',
    'tennis': ['tennis', 'playing sports'],
    'soccer': ['soccer', 'playing sports', 'watching sports'],
    'walking': ['pittsburgh', 'hiking'],
    'hiking': ['hiking', 'outdoors'],
    'camping': ['hiking', 'outdoors'],
    'videogames': 'video games',
    'outer space': 'stem',
    'robotics': ['stem', 'robotics'],
    'grammar': ['languages', 'reading'],
    'musicals': ['musicals', 'music'],
    'science fiction': 'sci-fi',
    'vinyl records': 'music',
    'fitness': ['fitness', 'playing sports'],
    'web serial fiction': ['reading', 'fiction', 'anime'],
    'anime': 'anime',
    'walking in parks': ['walking', 'pittsburgh', 'outdoors'],
    'bunny watching': ['outdoors', 'pittsburgh'],
    'recreational programming': 'stem',
    'spanish': ['spanish', 'languages'],
    'queer cinema': ['movies', 'lgbtq'],
    'fantasy books': ['fantasy', 'reading'],
    'dei': ['volunteering', 'lgbtq'],
    'comedy improv': ['improv', 'comedy'],
    'origami': ['origami', 'art'],
    'ping-pong': ['ping-pong', 'playing sports'],
    'general discussions': 'conversation',
    'dance': ['dance', 'music'],
    'story books': ['reading'],
    'light-hearted discussions': 'conversation',
    'trivia': ['trivia', 'board games'],
    'exploration of pittsburgh': ['pittsburgh'],
    'frisbee': ['frisbee', 'watching sports'],
}

# every punctuation character except '-' acts as a separator between hobbies
_HOBBY_SEPARATORS = str.maketrans({c: ',' for c in string.punctuation if c != '-'})


def get_clean_hobbies(hobby_freetext):
    """Turn a free-text hobbies answer into a list of canonical hobby tokens.

    The text is split on newlines and on punctuation (hyphens are kept), each
    phrase is lower-cased and mapped through the curated lookup table. A phrase
    that has no entry is retried with filler words removed; if it still has no
    entry it is kept as-is.

    Args:
        hobby_freetext: the raw answer (may be an empty string).

    Returns:
        Sorted list of unique, non-empty hobby tokens.
    """
    text = hobby_freetext.replace('\n', ',').translate(_HOBBY_SEPARATORS)

    clean = set()
    for phrase in text.split(','):
        phrase = phrase.strip().lower()
        if phrase not in _HOBBY_LOOKUP:
            # Drop filler words as whole words only: substring removal would
            # mangle e.g. 'sketching' (contains 'etc') or 'especially' ('esp').
            phrase = ' '.join(w for w in phrase.split() if w not in _HOBBY_STOPWORDS)
        if len(phrase) == 0:
            continue

        mapped = _HOBBY_LOOKUP.get(phrase, phrase)
        if isinstance(mapped, str):
            clean.add(mapped)
        else:
            clean.update(mapped)

    clean.discard('')
    # sorted so the result is reproducible across kernel restarts
    return sorted(clean)


### Compute friendship scores

In [None]:
# fields needed to score friendship matches
relevant_fields = [
    'email', 'name', 
    'morning_times', 'afternoon_times', 'evening_times', 
    'where_to_meet', 
    'group_size',
    'hobbies', 
    'hobby_categories',
    'other_friends'
]
# 'hobbies' is free text, so it goes through the hobby cleaner instead of the ';' split
friend_people = df_to_dicts(friends, relevant_fields, custom_cleaners={'hobbies': get_clean_hobbies})  # list of dicts
# not the last expression of the cell, so this displays nothing
# (it does raise IndexError when nobody signed up for friendship)
friend_people[0]
# how many people mention each hobby/category (used to spot rare topics);
# this overwrites the topics_ct built for the research section
topics_ct = {}
for p in friend_people:
    for cat in (p['hobbies'] + p['hobby_categories']):
        if len(cat) == 0:
            continue
        topics_ct[cat] = topics_ct.get(cat, 0) + 1

topics_ct

In [None]:
def get_friendship_match_score(p1, p2, topics_ct):
    """Score a pair for friendship matching.

    Adds to the general score, for every hobby or hobby category both people
    share, 2 points plus a further 4 if it is rare (at most 5 people in
    `topics_ct`) or a further 2 otherwise.

    Returns:
        (score, reasons) with a ['common topics', set] entry appended.
    """
    score, reasons = get_general_match_score(p1, p2)

    # based on hobbies
    mine = set(p1['hobbies'] + p1['hobby_categories'])
    theirs = set(p2['hobbies'] + p2['hobby_categories'])
    shared = mine & theirs
    reasons.append(['common topics', shared])

    rarity_points = sum(4 if topics_ct[topic] <= 5 else 2 for topic in shared)
    score = score + rarity_points + 2 * len(shared)

    return score, reasons

In [None]:
# score every pair and triple of friendship participants (no quads), best first
all_friend_scores = get_combo_scores(friend_people, get_friendship_match_score, topics_ct)
all_friend_scores = list(reversed(sorted(all_friend_scores, key=lambda x: x[1])))

In [None]:
# Greedy assignment: accept combos from best to worst score, skipping any
# combo that contains someone already placed.
friend_matches = []
friend_matched_people = set([])
for idx, s in all_friend_scores:
    assert(len(idx) in [2, 3])
    
    if len(friend_matched_people.intersection(set(list(idx)))) > 0:
        continue
        
    # human-readable summary of the accepted group for manual review
    st = '{}\nscore: {}\n\n'.format(idx, s)
    for (_, i) in enumerate(list(idx)):
        friend_matched_people.add(i)
        p = friend_people[i]
        st += 'P{}: {}\ntopic: {}\nhow: {}\nother: {}\n\n'.format(
            _, p['name'], p['hobbies'] + p['hobby_categories'], p['group_size'], p['other_friends'])
    print(st)
    friend_matches.append(idx)
    # for pairs, also show the reasons behind the score
    if len(idx) == 2:
        pprint(get_friendship_match_score(friend_people[idx[0]], friend_people[idx[1]], topics_ct)[1])
    print('------------------------')

In [None]:
# anyone the greedy pass could not place in a friendship group
print('Remaining unmatched friends: ', list([friend_people[i] for i in range(len(friend_people)) if i not in friend_matched_people]))

In [None]:
# translate each group of list indices into the members' email addresses
friend_group_emails = []
for grp in friend_matches:
    emails = list([friend_people[i]['email'] for i in grp])
    friend_group_emails.append(emails)
friend_group_emails

# Final matches

In [None]:
# all topic-based groups so far (mentorship, then research, then friendship)
matches = mentorship_group_emails + research_group_emails + friend_group_emails
matches

In [None]:
# review the matched groups with their free-text comments;
# widen the display so long comments are not truncated or wrapped
pd.set_option('max_colwidth', 500)
pd.set_option('display.width', 20000)


def combine_other_cols(row):
    """Merge the four free-text "anything else" answers of a row into one string.

    Missing answers are skipped and the remaining ones are joined with ' | '
    so that separate comments do not run into each other.

    Args:
        row: mapping/Series with 'other_friends', 'other_research',
            'other_mentor' and 'other_random' entries.

    Returns:
        The combined comment string ('' when every answer is missing).
    """
    fields = ['other_friends', 'other_research', 'other_mentor', 'other_random']
    # str() guards against a comment that was parsed as a number
    comments = [str(clean_val(row[field])).strip() for field in fields]
    return ' | '.join(c for c in comments if len(c) > 0)

# index responses by email so each group can be pulled out with .loc
tempdf = df.set_index('email')
for grp in matches:
    # rows of this group's members, re-keyed by name for display
    grpdf = tempdf.loc[grp, :].reset_index().set_index('name')
    grpdf['other'] = grpdf.apply(combine_other_cols, axis=1)
    grpdf = grpdf  # no-op
    print('Other comments:')
    print(grpdf['other'])
    print('')
    print(grpdf[['email', 'interaction_type', 'department', 'yr', 'pronouns', 'group_size']])
    print('------------------------------------------------------------------')

In [None]:
# manual adjustments based on "other" field
# (copy/paste the full matches list and make adjustments manually)

In [None]:
# look at remaining unmatched
# flatten the groups into one set of matched email addresses
matched_email_set = []
for grp in matches:
    matched_email_set += grp
matched_email_set = set(matched_email_set)

# everyone who responded this week but is not in any group yet
remaining = df[~df['email'].isin(matched_email_set)]
print(len(remaining))

In [None]:
# only the general-purpose fields: the leftovers are matched without any topic
relevant_fields = [
    'email', 'name', 
    'morning_times', 'afternoon_times', 'evening_times', 
    'where_to_meet', 
    'group_size',
    'other_mentor',
    'other_research',
    'other_friends',
    'other_random'
]
remaining_people = df_to_dicts(remaining, relevant_fields)  # list of dicts


In [None]:
# get_combo_scores calls its scorer with three arguments, so wrap the
# two-argument general scorer and ignore the (empty) topic counts
all_remaining_scores = get_combo_scores(
    remaining_people, lambda x, y, z: get_general_match_score(x, y), {})
# best-scoring combos first
all_remaining_scores = list(reversed(sorted(all_remaining_scores, key=lambda x: x[1])))
print(len(all_remaining_scores))
all_remaining_scores[:10]

In [None]:
# Greedy assignment: accept combos from best to worst score, skipping any
# combo that contains someone already placed.
remaining_matches = []
remaining_matched_people = set([])
for idx, s in all_remaining_scores:
    assert(len(idx) in [2, 3])
    
    if len(remaining_matched_people.intersection(set(list(idx)))) > 0:
        continue
        
    # human-readable summary of the accepted group for manual review
    st = '{}\nscore: {}\n\n'.format(idx, s)
    for (_, i) in enumerate(list(idx)):
        remaining_matched_people.add(i)
        p = remaining_people[i]
        st += 'P{}: {}\nhow: {}\nother: {}\n\n'.format(
            _, p['name'], p['group_size'], p['other_random'])
    print(st)
    remaining_matches.append(idx)
    # for pairs, also show the reasons behind the score
    if len(idx) == 2:
        pprint(get_general_match_score(remaining_people[idx[0]], remaining_people[idx[1]])[1])
    print('------------------------')



In [None]:
# anyone still without a group after the final pass
print('Remaining unmatched: ', list([remaining_people[i] for i in range(len(remaining_people)) if i not in remaining_matched_people]))

In [None]:
# translate each group of list indices into the members' email addresses
remaining_group_emails = []
for grp in remaining_matches:
    emails = list([remaining_people[i]['email'] for i in grp])
    remaining_group_emails.append(emails)
remaining_group_emails

In [None]:
# NOTE: rebuilt from the automatically generated lists; this cell does not read
# the `matches` variable, so manual edits made to `matches` are not carried over
final_matches = mentorship_group_emails + research_group_emails + friend_group_emails + remaining_group_emails
# total number of people placed in a group
print(sum([len(l) for l in final_matches]))

In [None]:
print('============================ FINAL MATCHES ============================')
# index responses by email so each group can be pulled out with .loc
tempdf = df.set_index('email')
for grp in final_matches:
    # rows of this group's members, re-keyed by name for display
    grpdf = tempdf.loc[grp, :].reset_index().set_index('name')
    grpdf['other'] = grpdf.apply(combine_other_cols, axis=1)
    grpdf = grpdf  # no-op
    print('Other comments:')
    print(grpdf['other'])
    print('')
    print(grpdf[['email', 'interaction_type', 'department', 'yr', 'pronouns', 'group_size']])
    print('------------------------------------------------------------------')

# Export matches to csv

In [None]:
# One row per group, one column per member slot; unused slots are NaN.
# These column names are what the "Load in previous matches" cell reads back.
EXPORT_EMAIL_COLS = [
    'Email 1',
    'Email 2',
    'Email 3 (if applicable)',
    'Email 4 (if applicable)',
]

# pad every group to four slots (also works when there are no groups at all)
all_groups = pd.DataFrame(
    [list(grp) + [np.nan] * (len(EXPORT_EMAIL_COLS) - len(grp)) for grp in final_matches],
    columns=EXPORT_EMAIL_COLS,
)

# every respondent must appear in exactly one exported slot
assert(len(df) == all_groups.notna().sum().sum())

all_groups

In [None]:
save_path = f'data/matches/matched_pairs_{DATE_TAG}.csv'
# never overwrite an existing export (it may contain manual adjustments)
if not os.path.exists(save_path):
    all_groups.to_csv(save_path, index=False)
    print('saved!')
else:
    # make the skipped write visible instead of finishing silently
    print(f'NOT saved: {save_path} already exists (delete it to re-export)')