In [13]:
import csv
import sys
import os
import re
import time
from collections import Counter
import numpy as np

import plotly.plotly as py
import plotly
from plotly import tools
from plotly.offline import init_notebook_mode
from plotly.graph_objs import *
import plotly.tools as tls

sys.path.insert(0, '../')
from meta_data_helper import EventMetaDataHelper

# Plotly credentials are read from the environment so that no API key is stored
# in the notebook (set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the
# kernel). Any key that was previously committed here should be revoked.
# NOTE(review): every plot in this notebook goes through plotly.offline.iplot,
# which should not need account credentials, so the call is skipped when the
# variables are unset - confirm nothing else relies on the credentials file.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    plotly.tools.set_credentials_file(username=plotly_username, api_key=plotly_api_key)
init_notebook_mode(connected=True)

In [14]:
# Input locations, all relative to the directory this notebook runs from.
mention_path = "../../user_network_data/aggregated_mention_networks"
user_set_path = "../../user_network_data/Event Users"

incident_metadata_path = '../incident_metadata.csv'

# Every event known to the metadata helper, as an ascending list.
# presumably these are integer event ids - verify against EventMetaDataHelper
event_helper = EventMetaDataHelper(incident_metadata_path)
events = sorted(event_helper.get_all_events())

In [15]:
def __read_user_set(file_name):
    """Read the user ids stored in the first column of a CSV file.

    Rows whose first cell is the literal header text "id" are skipped, and so
    are blank lines. Every other first cell is converted with int().

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A list of ints in file order; duplicates are kept.

    Raises:
        ValueError: If a first-column value other than "id" is not an integer.
        OSError: If the file cannot be opened.
    """
    user_set = []
    # newline='' is the mode the csv module documents for files handed to csv.reader.
    with open(file_name, newline='') as file:
        for row in csv.reader(file):
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError, so such rows are skipped.
            if not row:
                continue
            user_id = row[0]
            if user_id != "id":
                user_set.append(int(user_id))
    return user_set


In [16]:
# Maps an integer event id to the list of user ids read from that event's file.
user_set_by_event = {}
def read_user_set_for_all_events(path):
    """Load the user set of every event file found under ``path``.

    The directory tree is walked recursively. A file is treated as an event
    file when its name contains a run of digits followed by an underscore;
    those digits, converted to int, are the event id. Results are written into
    the module-level ``user_set_by_event`` dict (existing entries for the same
    event id are overwritten), and the number of stored events is printed.

    Args:
        path: Root directory to search.
    """
    for root, _dirs, files in os.walk(path):
        for filename in files:
            # Raw string: "\d" in a normal string literal is an invalid escape
            # sequence and triggers a warning on newer Python versions.
            file_match = re.search(r"(\d+)_", filename)
            if file_match is None:
                continue
            event_id = int(file_match.group(1))
            user_set_by_event[event_id] = __read_user_set(os.path.join(root, filename))
    print("user_set length: {}".format(len(user_set_by_event)))

read_user_set_for_all_events(user_set_path)
    

user_set length: 304


In [17]:
def count_unique_user2(file_name, event):
    """Tally the rows of a mention-network CSV per mentioning user.

    Each row is read as a pair of integer ids (user1, user2). For every user1
    the result holds ``[count1, count2]``:

    * count1 - number of rows in which that user1 appears in the first column;
    * count2 - number of those rows whose user2 belongs to the event's user
      set (``user_set_by_event[event]``).

    NOTE(review): rows are not de-duplicated here, so the counts are "unique"
    only if the file contains one row per (user1, user2) pair - confirm.

    Rows that cannot be parsed (fewer than two cells, non-integer ids) are
    reported with print() and skipped. A missing file is reported the same way
    and yields an empty dict.

    Args:
        file_name: Path of the mention-network CSV.
        event: Key into ``user_set_by_event``.

    Returns:
        dict mapping user1 (int) to the two-element list ``[count1, count2]``.

    Raises:
        KeyError: If ``event`` has no entry in ``user_set_by_event``.
    """
    # Membership is tested once per row; a set makes each test O(1) instead of
    # scanning the whole list every time.
    user_set = set(user_set_by_event[event])
    usr_unique_mention_ct = {}

    if not os.path.isfile(file_name):
        print("file: {} not exist".format(file_name))
        return usr_unique_mention_ct

    with open(file_name, newline='') as file:
        for pair in csv.reader(file):
            try:
                user1 = int(pair[0])
                user2 = int(pair[1])
            except (ValueError, IndexError) as e:
                # Only parse failures are expected here; anything else should
                # surface instead of being printed and ignored.
                print("file: {} error: {}".format(file_name, e))
                continue

            counts = usr_unique_mention_ct.setdefault(user1, [0, 0])
            counts[0] += 1
            if user2 in user_set:
                counts[1] += 1

    return usr_unique_mention_ct

In [18]:
def get_mention_fraction(mention_ct):
    """Distribution of the second counter (count2) across all users.

    Args:
        mention_ct: dict whose values are sequences ``[count1, count2]``, as
            produced by ``count_unique_user2``.

    Returns:
        A list ``f`` of length ``max(count2) + 1`` where ``f[i]`` is the
        fraction of keys in ``mention_ct`` whose count2 equals ``i``.
        An empty ``mention_ct`` gives an empty list.
    """
    if not mention_ct:
        # max() of an empty sequence raises ValueError; there is nothing to
        # distribute, so report an empty distribution instead.
        return []

    total_user1_num = len(mention_ct)
    # One pass over the users instead of rescanning all of them per bucket.
    count2_freq = Counter(counts[1] for counts in mention_ct.values())
    count2_max = max(count2_freq)
    # Counter returns 0 for counts that never occur, so gaps become 0.0.
    return [count2_freq[i] / total_user1_num for i in range(count2_max + 1)]
                

In [19]:
def get_user2_fraction(mention_ct):
    """Histogram of the per-user ratio count2 / count1 in ten equal-width bins.

    Bin ``k`` (k = 0..9) covers ratios in ``[k/10, (k+1)/10)``; the last bin
    also includes a ratio of exactly 1.0. This matches the "x ~ x+0.1" labels
    the plotting code attaches to the returned x values, and guarantees that
    users whose ratio is 1.0 are counted instead of being dropped, so the
    returned fractions sum to 1 for non-empty input.

    Args:
        mention_ct: dict whose values are sequences ``[count1, count2]``, as
            produced by ``count_unique_user2``.

    Returns:
        ``[x_user2_frac_seq, y_user1_frac]`` where the first list holds the
        lower bin edges 0.0, 0.1, ..., 0.9 and the second holds, per bin, the
        fraction of keys in ``mention_ct`` whose ratio falls in that bin.
        For an empty ``mention_ct`` every fraction is 0.

    Raises:
        ZeroDivisionError: If some entry has count1 == 0.
    """
    num_bins = 10
    x_user2_frac_seq = [round(x * 0.1, 1) for x in range(0, num_bins)]

    bin_counts = [0] * num_bins
    for counts in mention_ct.values():
        # Integer floor division keeps bin edges exact (no float rounding);
        # min() folds a ratio of 1.0 into the last bin.
        bin_index = min(num_bins * counts[1] // counts[0], num_bins - 1)
        bin_counts[bin_index] += 1

    total_user1_num = len(mention_ct)
    if total_user1_num == 0:
        y_user1_frac = [0] * num_bins
    else:
        y_user1_frac = [count / total_user1_num for count in bin_counts]

    return [x_user2_frac_seq, y_user1_frac]

# Ad-hoc run of the two helpers on a single event; the id is named once so the
# file name and the user-set lookup cannot drift apart.
sample_event = 44
file_name = os.path.join(mention_path, "{}_mention_network.csv".format(sample_event))
mention_ct = count_unique_user2(file_name, sample_event)
graph2_data = get_user2_fraction(mention_ct)

In [21]:
def _plot_event_mention_fractions(event, mention_frac, x_user2_frac_seq, y_user1_frac):
    """Show the two per-event bar charts side by side in the notebook."""
    trace1 = Bar(
            # One label per bar: mention_frac[i] belongs to the value i.
            x=[str(i) for i in range(len(mention_frac))],
            y=mention_frac,
            showlegend=False)

    trace2 = Bar(
            x=['{} ~ {}'.format(x, round(x+0.1, 1)) for x in x_user2_frac_seq],
            y=y_user1_frac,
            showlegend=False)

    fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Plot 1', 'Plot 2'))
    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)

    fig['layout']['xaxis1'].update(title='Num of unique user2')
    fig['layout']['xaxis2'].update(title='Fraction of unique user2')
    fig['layout']['yaxis1'].update(title='Fraction of user1')
    fig['layout']['yaxis2'].update(title='Fraction of user1')

    fig['layout'].update(title='Event {} User1 Mentioning Unique Number of User2'.format(event))
    plotly.offline.iplot(fig, filename="my plot")


def _plot_avg_mention_histogram(avg_user2_mentioned_list):
    """Show how many events fall on each 0.1-wide step of the per-event average."""
    avg_count = Counter(round(x, 1) for x in avg_user2_mentioned_list)
    print("avg_count: {}".format(avg_count))
    avg_max = max(avg_count)
    print("avg_max: {}".format(avg_max))
    # round() before int(): avg_max*10 is a float product and plain truncation
    # could lose the last step if it lands just below a whole number.
    avg_seq = [round(x*0.1, 1) for x in range(0, int(round(avg_max*10))+1)]
    print("avg_seq: {}".format(avg_seq))
    y_seq = [avg_count.get(seq, 0) for seq in avg_seq]

    trace_avg = Bar(
                x=avg_seq,
                y=y_seq,
                showlegend=False)
    layout = Layout(
                xaxis=dict(title='Avg unique user2 mentioned'),
                yaxis=dict(title='Number of events'),
                title='Avg number of unique user2 mentioned by user1')

    fig = Figure(data=[trace_avg], layout=layout)
    plotly.offline.iplot(fig, filename="my plot")


def graph_for_mention_frac(path, events, draw_graph):
    """Compute and plot mention statistics for each event, then a summary.

    For every event the file ``<path>/<event>_mention_network.csv`` is tallied
    with ``count_unique_user2``. Events that yield no counts are skipped. For
    the others the count2 distribution and the per-event average of count2
    over all user1 are printed, and, when ``draw_graph`` is true, two bar
    charts are rendered. After the loop a bar chart of the per-event averages
    (rounded to one decimal) is rendered; if no event produced data a message
    is printed instead.

    Args:
        path: Directory containing the mention-network CSV files.
        events: Iterable of event ids.
        draw_graph: Whether to render the per-event charts.
    """
    avg_user2_mentioned_list = []
    for event in events:
        file_name = os.path.join(path, "{}_mention_network.csv".format(event))
        mention_ct = count_unique_user2(file_name, event)
        if not mention_ct:
            continue

        mention_frac = get_mention_fraction(mention_ct)
        x_user2_frac_seq, y_user1_frac = get_user2_fraction(mention_ct)

        print(mention_frac)
        cnt2_sum = sum(counts[1] for counts in mention_ct.values())
        avg = cnt2_sum / len(mention_ct)
        print("event: {} avg_usr2_cnt: {}".format(event, avg))
        avg_user2_mentioned_list.append(avg)

        if draw_graph:
            _plot_event_mention_fractions(event, mention_frac, x_user2_frac_seq, y_user1_frac)

    if not avg_user2_mentioned_list:
        # max() over an empty collection would raise; there is nothing to plot.
        print("no mention data found for any event; skipping average plot")
        return

    _plot_avg_mention_histogram(avg_user2_mentioned_list)


graph_for_mention_frac(mention_path, events, True)

TypeError: '>' not supported between instances of 'dict_keys' and 'int'