### Binning data into 5-minute intervals

In [None]:
def get_bins(start_time, end_time, clean_pairs, bins_dict, bin_size=300):
    """Group contact pairs into fixed-width time bins.

    Args:
        start_time: Integer timestamp of the first bin (inclusive).
        end_time: Integer timestamp at which binning stops (exclusive).
        clean_pairs: Iterable of records where ``pair[0]`` and ``pair[1]``
            are the two members of the pair and ``pair[2]`` is a timestamp
            convertible with ``int()``.
        bins_dict: Dictionary that receives the bins. Every bin start in
            ``range(start_time, end_time, bin_size)`` is (re)set to an empty
            set before the pairs are added. It is modified in place.
        bin_size: Width of a bin in the same unit as the timestamps.
            Defaults to 300 (five minutes when timestamps are in seconds).

    Returns:
        ``bins_dict``, mapping each bin start to the set of pairs seen in
        that bin. Each pair is stored as a sorted tuple so that (a, b) and
        (b, a) count as the same pair.
    """
    for bin_start in range(start_time, end_time, bin_size):
        bins_dict[bin_start] = set()

    for pair in clean_pairs:
        timestamp = int(pair[2])
        if timestamp < start_time or timestamp >= end_time:
            continue
        # Floor division keeps the bin offset an integer under true division,
        # and computing the bin start directly stays correct even when
        # bins_dict already contains keys outside this time range.
        offset = (timestamp - start_time) // bin_size
        bin_start = start_time + offset * bin_size
        bins_dict[bin_start].add(tuple(sorted([pair[0], pair[1]])))

    return bins_dict

### Gender shuffling

In [None]:
%%cython
import random
def shuffle_gender(G, female_ids):
    """Randomly reassign gender labels to the nodes of ``G``.

    The number of female nodes is preserved: it equals the number of nodes
    of ``G`` that appear in ``female_ids``. Which nodes carry the label is
    chosen at random with ``random.shuffle``.

    Written as plain Python so it runs with or without the ``%%cython``
    cell magic.

    Args:
        G: Object whose ``nodes()`` method returns an iterable of nodes.
        female_ids: Container supporting ``in`` that holds the female nodes.

    Returns:
        ``[males_new, females_new]``, two lists that together contain every
        node exactly once.
    """
    # Copy into a list: random.shuffle needs a mutable sequence, and
    # G.nodes() is not guaranteed to return one.
    nodes = list(G.nodes())
    count_nodes = sum(1 for node in nodes if node in female_ids)
    random.shuffle(nodes)
    females_new = nodes[:count_nodes]
    # Slice from count_nodes rather than with a negative length: a negative
    # slice of length zero (-0) would select every node instead of none.
    males_new = nodes[count_nodes:]
    return [males_new, females_new]

### Triangles

In [None]:
def find_triangles(int_net):
    """List every triangle of an adjacency mapping exactly once.

    Args:
        int_net: Mapping from each node to an iterable of its neighbours.
            Every neighbour must itself be a key of the mapping.

    Returns:
        A list of 3-tuples ``(node, neighbour, third)``, one per triangle.
    """
    triangles = []
    visited = set()  # nodes already used as the first corner
    for node in int_net:
        visited.add(node)
        neighbours = set(int_net[node])
        seen_here = set()  # second corners already handled for this node
        for second in neighbours:
            if second in visited:
                continue
            seen_here.add(second)
            # A third corner must be adjacent to both node and second, and
            # must not have been used as an earlier first or second corner.
            triangles.extend(
                (node, second, third)
                for third in neighbours.intersection(int_net[second])
                if third not in visited and third not in seen_here
            )
    return triangles

def count_triangles(triangles, males_new, females_new):
    """Count all-male, all-female and mixed triangles.

    Args:
        triangles: Iterable of triangles; the first three items of each are
            its nodes.
        males_new: Iterable of hashable nodes labelled male.
        females_new: Iterable of hashable nodes labelled female.

    Returns:
        ``[count_MMM, count_FFF, count]`` where ``count`` is the number of
        triangles that are neither all male nor all female.
    """
    # Build the lookup sets once so each membership test is O(1) instead of
    # a scan of the whole list for every corner of every triangle.
    males = set(males_new)
    females = set(females_new)

    count_MMM = 0
    count_FFF = 0
    count = 0
    for triangle in triangles:
        corners = (triangle[0], triangle[1], triangle[2])
        if all(corner in males for corner in corners):
            count_MMM += 1
        elif all(corner in females for corner in corners):
            count_FFF += 1
        else:
            count += 1
    return [count_MMM, count_FFF, count]

### Entropies of contacts

In [None]:
def prox_entropy(user):
    """Return the Shannon entropy, in bits, of the values in ``user``.

    Args:
        user: Sequence of observations (for example contact identifiers).
            The relative frequency of each distinct value is used as its
            probability.

    Returns:
        The entropy of the frequency distribution in bits, or ``None`` when
        ``user`` is empty.
    """
    if not user:
        return None
    frequencies = Counter(np.array(user)).values()
    counts = np.array(list(frequencies), dtype=float)
    # Normalise once instead of re-summing the counts for every value.
    probs = counts / counts.sum()
    # entr gives -p*log(p) in nats; dividing by log(2) converts to bits.
    return entr(probs).sum(axis=0) / np.log(2)

### Weekly fraction of same gender friends

In [None]:
import numpy as np
import pandas as pd

def fraction_weekly(user, calls_dict, males_new, females_new):
    """Compute, per week of the year, the male fraction of a user's contacts.

    Args:
        user: Key into ``calls_dict``.
        calls_dict: Mapping from user to a sequence of records in which
            column 0 is the contacted user and column 1 is a timestamp in
            seconds since the epoch.
        males_new: Container of users labelled male.
        females_new: Container of users labelled female; it is only passed
            through to ``fraction_gender``.

    Returns:
        Dictionary mapping a week number to the value ``fraction_gender``
        returns for the distinct contacts of that week. Records are grouped
        by week number only, so the same week of different years is merged.
        An empty dictionary is returned when the user has no records.
    """
    contacts = np.array(calls_dict[user])
    if contacts.size == 0:
        return {}

    timestamps = [int(stamp) for stamp in contacts[:, 1]]
    p_index = pd.to_datetime(timestamps, unit='s')
    # DatetimeIndex.week was removed in pandas 2.0; isocalendar() is its
    # replacement. Fall back to .week on versions that predate it.
    try:
        week_numbers = p_index.isocalendar().week.to_numpy(dtype=int)
    except AttributeError:
        week_numbers = np.asarray(p_index.week, dtype=int)

    frac_week = {}
    contacted = pd.Series(contacts[:, 0])
    for week, friends in contacted.groupby(week_numbers):
        # Each contact counts once per week, however often it occurs.
        w_pairs = list(friends.unique())
        frac_week[int(week)] = fraction_gender(w_pairs, males_new, females_new)
    return frac_week

def fraction_gender(friends, males_new, females_new):
    """Return the share of ``friends`` that appear in ``males_new``.

    ``females_new`` is accepted but not used. An empty ``friends`` raises
    ``ZeroDivisionError``.
    """
    male_total = sum(1 for person in friends if person in males_new)
    return float(male_total) / len(friends)

### Entropies of locations

In [None]:
from scipy.special import entr
from __future__ import division

def location_entropy(user):
    """Return the entropy, in bits, of the time a user spends per location.

    Args:
        user: Iterable of stays, each a mapping with the keys ``"label"``
            (the location), ``"arrival"`` and ``"departure"``. The last two
            are passed to ``sec_spent`` to get the duration of the stay.

    Returns:
        The Shannon entropy in bits of the distribution of time over
        locations; 0.0 when ``user`` is empty.
    """
    # Total time spent at each location.
    time_loc = {}
    for stay in user:
        label = stay["label"]
        duration = sec_spent(stay["arrival"], stay["departure"])
        time_loc[label] = time_loc.get(label, 0) + duration

    # Probability of each location; the total is computed once.
    total = float(sum(time_loc.values()))
    probs = [seconds / total for seconds in time_loc.values()]

    # A list (not a dict view) is passed so numpy can build a numeric array.
    return entr(probs).sum(axis=0) / np.log(2)

def sec_spent(a, b):
    """Return the number of seconds elapsed from timestamp ``a`` to ``b``.

    Args:
        a: Start time as a numeric epoch timestamp in seconds.
        b: End time as a numeric epoch timestamp in seconds.

    Returns:
        ``b - a`` as a float; negative when ``b`` precedes ``a``.
    """
    # Subtract the epoch values directly. Converting each to a naive local
    # datetime first makes the difference wrong by the clock shift whenever
    # the two timestamps straddle a daylight-saving transition.
    return float(b - a)

### Cosine similarity between weeks

In [None]:
import numpy as np
import pandas as pd
from scipy import spatial

def week_explore(user):
    """Return the mean cosine similarity between consecutive weeks.

    For every week number a vector of visit counts per label is built; the
    similarity is computed between each pair of neighbouring weeks in
    ascending week-number order and the results are averaged.

    Args:
        user: Sequence of records in which column 0 is a timestamp in
            seconds since the epoch and column 1 is a label.

    Returns:
        The mean similarity, or ``None`` when there are fewer than two
        distinct week numbers (including when ``user`` is empty). Records
        are grouped by week number only, so the same week of different
        years is merged, and neighbouring rows need not be adjacent
        calendar weeks.
    """
    records = np.array(user)
    if records.size == 0:
        return None

    timestamps = [int(stamp) for stamp in records[:, 0]]
    p_index = pd.to_datetime(timestamps, unit='s')
    # DatetimeIndex.week was removed in pandas 2.0; isocalendar() is its
    # replacement. Fall back to .week on versions that predate it.
    try:
        week_numbers = p_index.isocalendar().week.to_numpy(dtype=int)
    except AttributeError:
        week_numbers = np.asarray(p_index.week, dtype=int)

    # Week-by-label count matrix. np.unique sorts the weeks, so the row
    # order does not depend on set or dict iteration order.
    weeks, week_rows = np.unique(week_numbers, return_inverse=True)
    labels, label_cols = np.unique(records[:, 1], return_inverse=True)
    counts = np.zeros((len(weeks), len(labels)))
    # add.at accumulates repeated (week, label) positions correctly.
    np.add.at(counts, (week_rows, label_cols), 1)

    similarities = [
        float(1 - spatial.distance.cosine(current, following))
        for current, following in zip(counts[:-1], counts[1:])
    ]
    if similarities:
        return np.mean(similarities)
    return None