In [20]:
import csv, re
from scipy.cluster.hierarchy import linkage
from UnionFind import *
from igraph import *

# Three weighting scenarios; each table below is keyed by scenario name:
#   'ideal'   - we associate based on traits that we have consciously chosen to develop
#   'unideal' - we associate based on traits that we have no choice in or that are merely incidental
#   'real'    - realistically, it's some combination of both
# NOTE(review): the origin of STD_DEV and of the 'real' multipliers is not shown in this notebook.
STD_DEV = 5.0
WEIGHT_INTERESTS = {'ideal': 1000, 'unideal': 0, 'real': STD_DEV}
WEIGHT_PRIMARY_MAJOR = {'ideal': 1000, 'unideal': 0, 'real': STD_DEV * 36.1926}
# The first key used to be a second 'real'; a dict literal keeps only the last
# duplicate, so the 1000 was dropped and the table had no 'ideal' entry at all.
WEIGHT_SECONDARY_MAJORS = {'ideal': 1000, 'unideal': 0, 'real': STD_DEV * 10.0}
WEIGHT_GENDER = {'ideal': 0, 'unideal': 1000, 'real': STD_DEV}
WEIGHT_COURSES = {'ideal': 0, 'unideal': 1000, 'real': STD_DEV * 36.1926}
WEIGHT_LOCATION = {'ideal': 0, 'unideal': 100000, 'real': STD_DEV * 6911.85} # Divided by distance in feet

# Load every row of students.csv as a list of strings. The `with` block closes
# the file as soon as the rows are read, and newline="" lets the csv module do
# its own newline handling (needed for quoted fields that contain line breaks).
with open("students.csv", newline="") as students_file:
    students = list(csv.reader(students_file))

In [21]:
def get_locaction_weight(bldg1, room1, bldg2, room2, case):
    """Return the location weight for `case` divided by an estimated distance.

    The divisor is 200 for different buildings, 100 when the buildings match
    but a room number cannot be read from one of the labels, and otherwise
    10 plus 10 for every step between the two room numbers once their last
    digit is dropped. Identical room labels in the same building get the
    undivided weight.

    Args:
        bldg1, bldg2: building labels that start with the building number;
            a trailing subdivision such as the "A" in "7A" is ignored.
        room1, room2: room labels; the first run of digits is taken as the
            room number (labels without digits, e.g. suites, have none).
        case: key into WEIGHT_LOCATION.

    Returns:
        The weight floor-divided by the divisor described above.

    Raises:
        KeyError: if `case` is not a key of WEIGHT_LOCATION.
        ValueError: if a building label does not start with a digit.
    """
    def building_number(label):
        # Take the whole leading number, so building "12" is not confused
        # with building "1" (reading only the first character did that).
        leading = re.match(r'\d+', label)
        if leading is None:
            raise ValueError("building label must start with a number: %r" % (label,))
        return int(leading.group())

    def room_number(label):
        digits = re.search(r'\d+', label) # Ignoring Suites
        return int(digits.group()) if digits else None

    weight = WEIGHT_LOCATION[case]
    if building_number(bldg1) != building_number(bldg2):
        return weight // 200
    if room1 == room2:
        return weight # same room
    number1, number2 = room_number(room1), room_number(room2)
    # Compare against None explicitly: a room numbered 0 is still a room.
    if number1 is None or number2 is None:
        # Same building, unknown room: assume a distance of 100. This must
        # scale with the scenario weight like every other branch, otherwise
        # scenarios that give location no weight would still score 100 here.
        return weight // 100
    return weight // (abs(number1 // 10 - number2 // 10) * 10 + 10) # Same floor evaluates to 10
    
def get_major_weight(major1, othermajors1, major2, othermajors2, case='real'):
    """Return the weight two students earn from the majors they share.

    Args:
        major1, major2: each student's primary major.
        othermajors1, othermajors2: each student's secondary majors as one
            string separated by ", " (empty when there are none).
        case: key into WEIGHT_PRIMARY_MAJOR and WEIGHT_SECONDARY_MAJORS.
            Defaults to 'real'.

    Returns:
        WEIGHT_PRIMARY_MAJOR[case] if the primary majors are equal (else 0),
        plus WEIGHT_SECONDARY_MAJORS[case] for every secondary major the two
        students have in common.

    Raises:
        KeyError: if `case` is missing from either weight table.
    """
    shared = set(othermajors1.split(", ")) & set(othermajors2.split(", "))
    # "".split(", ") yields [''], so two students with no secondary majors
    # would otherwise appear to share one.
    shared.discard('')
    # Index the tables by scenario; adding the dicts themselves to a number
    # raises TypeError.
    weight = WEIGHT_PRIMARY_MAJOR[case] if major1 == major2 else 0
    weight += len(shared) * WEIGHT_SECONDARY_MAJORS[case]
    return weight

def get_courses_weight(courses1, courses2, case='real'):
    """Return the weight two students earn from the courses they share.

    Args:
        courses1, courses2: iterables of course identifiers; empty strings
            are treated as blank slots and ignored.
        case: key into WEIGHT_COURSES. Defaults to 'real'.

    Returns:
        WEIGHT_COURSES[case] multiplied by the number of distinct courses
        that appear in both iterables.

    Raises:
        KeyError: if `case` is not a key of WEIGHT_COURSES.
    """
    shared = set(courses1) & set(courses2)
    shared.discard('')  # a blank slot on both sides is not a shared course
    # Index the table by scenario; multiplying by the dict itself raises TypeError.
    return len(shared) * WEIGHT_COURSES[case]

In [22]:
# One entry per unordered pair of students, ordered (0,1), (0,2), ..., (1,2), ...
# Affinities are subtracted, so a stronger tie between two students produces a
# smaller (more negative) value.
# NOTE(review): no scenario is passed to get_major_weight below, so this list only
# holds 'ideal' weights if that function selects them itself -- confirm.
idealweights = []
for v1, student in enumerate(students):
    # Pair each student only with those that come later in the list.
    for peer in students[v1 + 1:]:
        weight = 0
        weight -= get_major_weight(student[3], student[4], peer[3], peer[4])
        # TODO: this cell originally ended with a dangling `weight -=` (a syntax error).
        # The term it was meant to subtract (presumably the interests weight) is not
        # defined anywhere in this notebook, so it is left out until it exists.
        idealweights.append(weight)

In [23]:
# `weights` was never assigned in this notebook, so this cell raised NameError on a
# fresh kernel. Bind it to the pairwise list built above; this is the single place to
# switch if another scenario's list is added later.
weights = idealweights
# Complete graph with one vertex per student; every edge gets a pairwise weight.
# NOTE(review): this assumes the graph's edge order matches the pair order of
# `weights` -- confirm.
network = Graph.Full(len(students))
network.es["weight"] = weights
# Average-linkage hierarchical clustering over the same pairwise values.
clustering = linkage(weights, 'average')

In [24]:
def get_memberships(clustering, numclusters=1):
    """Label every item with the group it belongs to after cutting the merge list.

    Args:
        clustering: sequence of merge rows; the first two entries of row i are
            the ids being merged, and the merged cluster gets the id
            len(clustering) + 1 + i. (Presumably a scipy linkage matrix.)
        numclusters: number of groups to keep; the last numclusters - 1 merges
            are not applied.

    Returns:
        A list with one group index per original item. Indices start at 0 and
        are handed out in order of first appearance.
    """
    assert 0 < numclusters <= len(clustering)
    numitems = len(clustering) + 1
    if numclusters > 1:
        # Leaving out the final merges keeps that many extra groups apart.
        merges = clustering[:1 - numclusters]
    else:
        merges = clustering
    forest = UnionFind()
    for step, merge in enumerate(merges):
        # Join both children under the id assigned to the merged cluster.
        forest.union(merge[0], merge[1], numitems + step)
    labels = {}
    # setdefault gives a newly seen representative the next unused index.
    return [labels.setdefault(forest[item], len(labels)) for item in range(numitems)]

In [27]:
# Cut the hierarchy into 20 groups and attach the resulting labels to the graph.
clusteredgraph = VertexClustering(
    network,
    get_memberships(clustering, numclusters=20),
)

In [15]:
# Draw the unclustered student graph and write it to preclustered.png.
# NOTE(review): this cell's execution counter (15) is lower than the cells above it,
# so it was last run before them -- confirm it still works top-to-bottom on a fresh kernel.
preclustered = Plot()
preclustered.add(network)
preclustered.save('preclustered.png')

In [28]:
# Draw the clustered graph (`clusteredgraph`) and write it to postclustered.png,
# for side-by-side comparison with preclustered.png.
postclustered = Plot()
postclustered.add(clusteredgraph)
postclustered.save('postclustered.png')