**Note:** Windows keystroke for \` character is ALT+96

In [4]:
import os
import sys
import networkx as nx
import operator
import json
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [3]:
# Global Settings
dataset_dir = "../mmr_graph/"

# Data Preparation
Storing Twitter usernames without any transformation in memory would mean wasting loads of memory to store a relatively expensive data type such as strings. I could optimize this by performing some *data encoding* on the user-names and convert strings to integers by keeping a 1:1 mapping between the string representation of the username and its integer representation. This would result in huge memory optimization prior to loading the graph into memory.

## Twitter Usernames Encoding and Full Network
The below code portions process all the data files from the MMR dataset and iteratively add edges to the undirected graph. The end result of the execution are two files: `usernames.json` containing a JSON with string username-integer username as key-value pairs, and `mmr.adjlist` which contains the graph representation that can be easily loaded again in memory without processing again all the dataset files.

In [7]:
# Support functions declarations
def add_new_user(username):
    global current_user_id
    global unique_usernames_encoding
    if not username in unique_usernames_encoding:
        unique_usernames_encoding[username] = current_user_id
        current_user_id += 1
        if current_user_id%10000000 == 0:
            print("Currently processed {0} usernames".format(current_user_id))

def get_encoding(username):
    global unique_usernames_encoding
    return unique_usernames_encoding[username]

def process_line(line):
    a,b = line.strip("()\n").split(', ')
    a = a.strip("u'")
    b = b.strip("u'")
    add_new_user(a)
    add_new_user(b)
    a_enc = get_encoding(a)
    b_enc = get_encoding(b)
    return a_enc, b_enc

In [5]:
# Start processing
unique_usernames_encoding = {}
current_user_id = 0 #Start with ID = 0

# Dynamically add edges to graph
G = nx.Graph()
for folder in os.listdir(dataset_dir):
    folder_path = os.path.join(dataset_dir, folder)
    print("Processing files in folder {0}".format(folder))
    for part in os.listdir(folder_path):
        part_path = os.path.join(folder_path, part)
        if os.path.isfile(part_path) and part.startswith("part"):
            with open(part_path, encoding="utf-8") as part:
                for line in part:
                    a_enc, b_enc = process_line(line)
                    G.add_edge(a_enc, b_enc)
print("Processed all usernames. Total unique usernames: {0}".format(current_user_id-1))
           
# Save usernames to file
output_filename = "usernames.json"
print("Saving usernames to {0}...".format(output_filename))
with open(output_filename, "w") as usernames_file:
    json.dump(unique_usernames_encoding, usernames_file)
print("Done!")

# Save graph to file to easily import again
nx.write_adjlist(G, "mmr.adjlist")

Processing files in folder 2016-03
Currently processed 10000000 usernames
Currently processed 10000000 usernames
Currently processed 10000000 usernames
Currently processed 10000000 usernames
Processing files in folder 2016-09
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames
Currently processed 20000000 usernames


KeyboardInterrupt: 

## Build Main Network

In [192]:
betweenness_centrality = networkx.betweenness_centrality(g)

KeyboardInterrupt: 

In [175]:
[x for x in sorted(betweenness_centrality.items(), key=operator.itemgetter(1), reverse=True) if x[1] != 0]

[('brucevh', 2.0507457024060374e-08),
 ('eceyorenc', 2.0507457024060374e-08),
 ('bcci', 2.0507457024060374e-08),
 ('rongieprk', 2.0507457024060374e-08),
 ('tifayneh', 2.0507457024060374e-08),
 ('derbyyyy', 2.0507457024060374e-08),
 ('nesaazzahra', 2.0507457024060374e-08),
 ('kemova99', 2.0507457024060374e-08),
 ('psychdatageek', 2.0507457024060374e-08)]