In [1]:
using Printf

# 1. Adding encoded metadata to the MMR edges
We first need to encode the information of the period. We could add *strings* as metadata, but using integers would again be beneficial in terms of performance. The CSV file storing the periods encoding produced by the below cell will be named `period_encoding.csv`.
## 1.1 Encoding the period values

In [2]:
period_encoding_header = Array{String,1}(["period","encoding"])
period_encoding_filename = String("../data/period_encoding.csv")

open(period_encoding_filename, "w") do io
    # Write CSV header
    write(io, join(period_encoding_header, ",") * "\n")
    for (index, period) in enumerate(readdir(String("../data/mmr_graph")))
        write(io, join([period,index-1],",") * "\n")
    end
end

# Read period encoding as a dictionary
period_encoding_dict = Dict{String,Int8}()
open(period_encoding_filename) do io
    for line in eachline(io)
        if !occursin(line, join(period_encoding_header, ","))
            period, encoding = split(line, ",")
            period_encoding_dict[period] = parse(Int8, encoding)
        end
    end
end

## 1.2 Compute period by line number in the dataset
Instead of re-running the whole original encoding script in Python and add the period information for each line of the `mmr_encoded.csv` file, I may have a hashtable storing which are the *starting and ending lines* of each period, so that I can quickly create a duplicate of the `mmr_encoded.csv` data file that has the additional info. Of course this is possible because the merged file `mmr_encoded.csv` has been generated by respecting the chronological ordering of the period folders. The end result of the function `compute_periods_lines(...)` below is a hashtable of type `Dict{K,V}` where $K=period_{enc}$ and $V=[line_{start},line_{end})$, so that $line$ is the line number in the full MMR dataset when all parts are merged in a single file, as it happens with `mmr_encoded.csv`.

In [3]:
function compute_periods_lines(dataset_dir::String)::Dict{Int8, Array{Int64, 1}}
    periods_by_line_dict = Dict{Int8, Array{Int64, 1}}()
    current_line_count = Int64(0)
    for folder in readdir(dataset_dir)
        period_enc = period_encoding_dict[folder]
        folder_path = joinpath(dataset_dir, folder)
        if !haskey(periods_by_line_dict, period_enc)
            periods_by_line_dict[period_enc] = Array{Int64, 1}()
        end
        push!(periods_by_line_dict[period_enc], current_line_count)
        for part in readdir(folder_path)
            part_path = joinpath(folder_path, part)
            if isfile(part_path) && startswith(part, "part")
                open(part_path) do part_file
                    for line in eachline(part_file)
                        current_line_count += 1
                    end
                end
            end
        end
        push!(periods_by_line_dict[period_enc], current_line_count)
    end
    return periods_by_line_dict
end

compute_periods_lines (generic function with 1 method)

In [4]:
# Define variables
dataset_dir = String("../data/mmr_graph")

@time periods_by_line_dict = compute_periods_lines(dataset_dir)

 48.621911 seconds (1.84 G allocations: 50.416 GiB, 6.43% gc time)


Dict{Int8,Array{Int64,1}} with 13 entries:
  2  => [143085317, 201659109]
  11 => [535041137, 575743666]
  0  => [0, 75459299]
  7  => [366285189, 412736805]
  9  => [453080927, 493968535]
  10 => [493968535, 535041137]
  8  => [412736805, 453080927]
  6  => [322066905, 366285189]
  4  => [249380863, 287502444]
  3  => [201659109, 249380863]
  5  => [287502444, 322066905]
  12 => [575743666, 612497446]
  1  => [75459299, 143085317]

## 1.3 Add period information to each line
Now we can just iterate over the lines of the `mmr_encoded.csv` file and append the encoded period, by looking up the value to append in the just created `periods_by_line_dict`. The new file will be called `mmr_encoded_with_period.csv`.

In [6]:
function get_enc_period_by_line(periods_by_line_dict::Dict{Int8,Array{Int64,1}}, current_line_count::Int64)::Int8
    local period_enc
    for key in keys(periods_by_line_dict)
        if current_line_count >= periods_by_line_dict[key][1] && current_line_count < periods_by_line_dict[key][2]
            period_enc = key
            break
        end
    end
    return period_enc
end

function create_mmr_with_period(periods_by_line_dict::Dict{Int8,Array{Int64,1}}, mmr_enc_with_period_fn::String, mmr_enc_fn::String)
    current_line_count = Int64(0)
    local period_enc
    open(mmr_enc_fn) do input
        open(mmr_enc_with_period_fn, "w") do output
            for line in eachline(input)
                period_enc = get_enc_period_by_line(periods_by_line_dict, current_line_count)
                new_line = join([line, string(period_enc)], ",")
                write(output, new_line * "\n")
                current_line_count +=1
            end
        end
    end
end

create_mmr_with_period (generic function with 1 method)

In [7]:
mmr_enc_with_period_fn = String("../data/mmr_encoded_with_period.csv")
mmr_enc_fn = String("../data/mmr_encoded.csv")

@time create_mmr_with_period(periods_by_line_dict, mmr_enc_with_period_fn, mmr_enc_fn)

563.598310 seconds (8.58 G allocations: 358.149 GiB, 5.05% gc time)


## 1.4 Generating final MMR dataset file
After the period has been added as encoded information to each individual line in the dataset file, I can now generate the desired final version of the MMR datafile where each $(n_1,n_2)$ entry has the additional `periods` value as an **array of integers**, such that duplicate relationship entries are also removed.

In [43]:
function split_line(line::String)
    n1, n2, value = split(line, ",")
    key = join([n1, n2], ",")
    value = parse(Int8, value)
    return key, value
end

function compute_edge_periods(mmr_enc_with_period_fn::String)::Dict{String,AbstractArray{Int8,1}}
    edge_periods_dict = Dict{String,AbstractArray{Int8,1}}()
    lines = Int64(0)
    open(mmr_enc_with_period_fn) do input
        for line in eachline(input)
            if lines % 50000000 == 0 && lines != 0
                @printf "Processed %d lines...\n" lines
            end
            key, value = split_line(line)
            if !haskey(edge_periods_dict, key)
                edge_periods_dict[key] = Array{Int8,1}()
            end
            push!(edge_periods_dict[key], value)
            lines += 1
            flush(STDOUT)
        end
    end
    println("Done!")
    return edge_periods_dict
end

function dump_mmr_to_file(mmr_final_filename::String, edge_periods_dict::Dict{String,AbstractArray{Int8,1}})
    println("Dumping final MMR to file...")
    open(mmr_final_filename, "w") do output
        for key in keys(edge_periods_dict)
            write(output, join([key, join(edge_periods_dict[key], ",")], ",") * "\n")
        end
    end
    println("Done!")
end

dump_mmr_to_file (generic function with 1 method)

In [None]:
mmr_final_filename = String("../data/mmr_encoded_final.csv")

@time edge_periods_dict = compute_edge_periods(mmr_enc_with_period_fn)
@time dump_mmr_to_file(mmr_final_filename, edge_periods_dict)

Processed 50000000 lines...
Processed 100000000 lines...
Processed 150000000 lines...
Processed 

I've included the code from the cells above in a separate `compute_compact_mmr.jl` script and let it run overnight. Timing stats at the end of the script showed that a peak of **90GB of consumed RAM** was reached and the overall execution time was **~18 hours**. Without the hardware capabilities I've been provided with, executing this script would have required relying on a line-by-line approach, which would have contributed to significantly slow down the process even more.

Counting the number of lines with the command `wc -l mmr_encoded_final.csv` returns **434.193.958**, which represents the *total number of unique mutual interactions between two distinct users*. 10 sample lines of the file are formatted as such:
```
162551,358055,0;1;2;3;4
2522022,11479046,0
6765991,37193347,2
1925412,69668768,12
30945323,21996213,1
1929344,1758244,0;1;2;3
14179682,4572078,0
47426332,12183913,3
44640010,44517870,3;4;5
12932181,80621407,10;12
```
As it's clear from th example above, the CSV is now structured with *3 columns*:
- The first two represent the **mutual interaction between two Twitter users** as a $(source, target)$ pair;
- The third one is a list of semicolon-separated values that indicates **all the periods when the correspondent mutual interaction occurred**, with values ranging from 0 to 12.

As a final validation check, let's show that such list of periods:
- Has maximum length of 13;
- Has unique values (e.g. it's not allowed to have more than a unique digit from 0 to 12 in the same list)

In [9]:
function validate_entry(entry::String, current_line::Int64)::Int64
    issues = Int64(0)
    # Condition (1): max length 13
    periods_array = map(x -> parse(Int8, x), split(split(entry, ",")[3], ";"))
    if length(periods_array) > 13
        println("Found periods array with more than the maximum allowed number of values (13) at line $current_line: $entry")
        issues += 1
    end
    
    # Condition (2): All values should be unique
    if length(periods_array) != length(Set{Int8}(periods_array))
        println("Found periods array with non unique values at line $current_line: $entry")
        issues += 1
    end
    
    return issues
end

function find_issues()
   mmr_final_filename = String("../data/mmr_encoded_final.csv")
    open(mmr_final_filename) do io
        counter = Int64(0)
        issues = Int64(0)
        for line in eachline(io)
            issues += validate_entry(line, counter)
            counter +=1
            if counter % 50000000 == 0
                println("Processed $counter lines...")
            end
        end
        println("Done! Found $issues issues.")
    end
end

@time find_issues()

Processed 50000000 lines...
Processed 100000000 lines...
Processed 150000000 lines...
Processed 200000000 lines...
Processed 250000000 lines...
Processed 300000000 lines...
Processed 350000000 lines...
Processed 400000000 lines...
Done! Found 0 issues.
503.230951 seconds (9.77 G allocations: 453.569 GiB, 13.03% gc time)


The above results show that the final dataset file is properly formatted without unexpected errors. As a final chapter consideration, I'm now ready to re-import the dataset in Neo4j and see how the new network would look like after applying the optimizations as described in this notebook.

Hereby a **summary of the core improvements** as a result of the optimization process:

| Dataset Feature | Old | New |
|---|---|---|
| Total Relationships | 612.497.446 (~600M) | 434.193.958 (~400M) |
| File Size (GB) | 9.8 | 8.3 |