In [1]:
# Each user is a NamedTuple with a zero-based `id` and a `name`; ids follow the
# order in which the names are listed.
users = [
    (id = position - 1, name = label) for (position, label) in enumerate([
        "Hero", "Dunn", "Sue", "Chi", "Thor", "Clive", "Hicks", "Devin", "Kate", "Klein",
    ])
];

In [2]:
# Friendship edges between user ids; every pair appears once, in one direction.
friendship_pairs = [
    (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9),
];

In [3]:
# Map every user id to its (initially empty) list of friend ids. The untyped
# `[]` literal makes each list a Vector{Any}, which is why the values are
# displayed as `Any[...]`.
friendships = Dict(person.id => [] for person in users)

# A pair (a, b) is a two-way friendship, so record it on both endpoints:
# first b in a's list, then a in b's list.
for (a, b) in friendship_pairs, (owner, friend) in ((a, b), (b, a))
    push!(friendships[owner], friend)
end

friendships

Dict{Int64,Array{Any,1}} with 10 entries:
  0 => Any[1, 2]
  4 => Any[3, 5]
  7 => Any[5, 8]
  9 => Any[8]
  2 => Any[0, 1, 3]
  3 => Any[1, 2, 4]
  5 => Any[4, 6, 7]
  8 => Any[6, 7, 9]
  6 => Any[5, 8]
  1 => Any[0, 2, 3]

In [4]:
"How many friends does `user` have?"
number_of_friends(user) = length(friendships[user.id])

number_of_friends

In [5]:
# Julia vectors are 1-based, so users[2] is the second entry (id = 1, "Dunn").
number_of_friends(users[2])

3

In [6]:
# Sum of every user's friend count; each friendship is counted once per endpoint.
total_connections = sum(number_of_friends, users)

24

In [7]:
# Number of entries in `users`.
num_users = length(users)

10

In [8]:
# Mean number of friends per user. `/` on two integers yields a Float64.
avg_connections = total_connections / num_users

2.4

In [9]:
# Pair each user id with that user's friend count, in `users` order.
num_friends_by_id = map(person -> (person.id, number_of_friends(person)), users)

10-element Array{Tuple{Int64,Int64},1}:
 (0, 2)
 (1, 3)
 (2, 3)
 (3, 3)
 (4, 2)
 (5, 3)
 (6, 2)
 (7, 2)
 (8, 3)
 (9, 1)

In [10]:
# Order in place by friend count (the last tuple element), largest first.
sort!(num_friends_by_id; by = last, rev = true)

10-element Array{Tuple{Int64,Int64},1}:
 (1, 3)
 (2, 3)
 (3, 3)
 (5, 3)
 (8, 3)
 (0, 2)
 (4, 2)
 (6, 2)
 (7, 2)
 (9, 1)

In [11]:
"FOAF is short for 'friend of a friend'"
function foaf_ids_bad(user)
    # "Bad" because nothing is filtered: the result lists every friend of every
    # friend, so it can contain duplicates, the user's own id, and direct friends.
    direct_friends = friendships[user.id]
    return [
        candidate for friend_id in direct_friends for candidate in friendships[friend_id]
    ]
end

foaf_ids_bad

In [12]:
# users[1] is the first entry (id = 0, "Hero"); note that the result below
# contains Hero's own id and repeated ids.
foaf_ids_bad(users[1])

6-element Array{Int64,1}:
 0
 2
 3
 0
 1
 3

In [13]:
using DataStructures

In [14]:
# Count how often each friend-of-a-friend of `user` is reached through one of the
# user's friends. The user's own id and the ids of direct friends are excluded.
function friends_of_friends(user)
    direct_friends = friendships[user.id]
    return counter(
        candidate
        for friend_id in direct_friends
            for candidate in friendships[friend_id]
                if candidate != user.id && !(candidate in direct_friends)
    )
end

friends_of_friends (generic function with 1 method)

In [15]:
# users[4] is the fourth entry (id = 3, "Chi").
friends_of_friends(users[4])

Accumulator{Any,Int64} with 2 entries:
  0 => 2
  5 => 1

In [16]:
# (user_id, interest) pairs: one row per interest a user has, so a user id
# appears once for each of that user's interests.
interests = [
    (0, "Hadoop"),
    (0, "Big Data"),
    (0, "HBase"),
    (0, "Java"),
    (0, "Spark"),
    (0, "Storm"),
    (0, "Cassandra"),
    (1, "NoSQL"),
    (1, "MongoDB"),
    (1, "Cassandra"),
    (1, "HBase"),
    (1, "Postgres"),
    (2, "Python"),
    (2, "scikit-learn"),
    (2, "scipy"),
    (2, "numpy"),
    (2, "statsmodels"),
    (2, "pandas"),
    (3, "R"),
    (3, "Python"),
    (3, "statistics"),
    (3, "regression"),
    (3, "probability"),
    (4, "machine learning"),
    (4, "regression"),
    (4, "decision trees"),
    (4, "libsvm"),
    (5, "Python"),
    (5, "R"),
    (5, "Java"),
    (5, "C++"),
    (5, "Haskell"),
    (5, "programming languages"),
    (6, "statistics"),
    (6, "probability"),
    (6, "mathematics"),
    (6, "theory"),
    (7, "machine learning"),
    (7, "scikit-learn"),
    (7, "Mahout"),
    (7, "neural networks"),
    (8, "neural networks"),
    (8, "deep learning"),
    (8, "Big Data"),
    (8, "artificial intelligence"),
    (9, "Hadoop"),
    (9, "Java"),
    (9, "MapReduce"),
    (9, "Big Data"),
]

49-element Array{Tuple{Int64,String},1}:
 (0, "Hadoop")
 (0, "Big Data")
 (0, "HBase")
 (0, "Java")
 (0, "Spark")
 (0, "Storm")
 (0, "Cassandra")
 (1, "NoSQL")
 (1, "MongoDB")
 (1, "Cassandra")
 (1, "HBase")
 (1, "Postgres")
 (2, "Python")
 ⋮
 (7, "machine learning")
 (7, "scikit-learn")
 (7, "Mahout")
 (7, "neural networks")
 (8, "neural networks")
 (8, "deep learning")
 (8, "Big Data")
 (8, "artificial intelligence")
 (9, "Hadoop")
 (9, "Java")
 (9, "MapReduce")
 (9, "Big Data")

In [17]:
"Find the ids of all users who like the target interest."
function data_science_who_like(target_interest)
    # Linear scan over every (user_id, interest) row; keep the ids of exact matches.
    return [uid for (uid, topic) in interests if topic == target_interest]
end

data_science_who_like

In [18]:
# The match is an exact string comparison, so the interest must be spelled as in `interests`.
data_science_who_like("Hadoop")

2-element Array{Int64,1}:
 0
 9

In [19]:
# Keys are interests, values are lists of user_ids with that interest
user_ids_by_interest = DefaultDict(() -> [])

# Index by the loop variable `interest` (a single topic), not by the global
# `interests` array: using the whole array as the key would put every user id
# into one shared list.
for (user_id, interest) in interests
    push!(user_ids_by_interest[interest], user_id)
end

In [20]:
# Keys are user_ids, values are lists of interests for that user_id
interests_by_user_id = DefaultDict(() -> [])

for (user_id, interest) in interests
    push!(interests_by_user_id[user_id], interest)
end


interests_by_user_id

DefaultDict{Any,Any,var"#21#22"} with 10 entries:
  0 => Any["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"]
  4 => Any["machine learning", "regression", "decision trees", "libsvm"]
  7 => Any["machine learning", "scikit-learn", "Mahout", "neural networks"]
  9 => Any["Hadoop", "Java", "MapReduce", "Big Data"]
  2 => Any["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"]
  3 => Any["R", "Python", "statistics", "regression", "probability"]
  5 => Any["Python", "R", "Java", "C++", "Haskell", "programming languages"]
  8 => Any["neural networks", "deep learning", "Big Data", "artificial intellig…
  6 => Any["statistics", "probability", "mathematics", "theory"]
  1 => Any["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]

In [21]:
# For each interest of `user`, look up the other users who share that interest
# and count how many interests each of them has in common with `user`.
# The lookup key is the single `interest` being iterated, and `user` itself is
# left out of the tally.
most_common_interest_with(user) = counter(
    interested_user_id for interest in interests_by_user_id[user.id]
    for
    interested_user_id in user_ids_by_interest[interest] if
    interested_user_id != user.id
)

most_common_interest_with (generic function with 1 method)

In [22]:
most_common_interest_with(users[2])

Accumulator{Any,Int64} with 1 entry:
  0 => 2

In [23]:
# (salary, tenure) observations, one per line. Whole-number tenures are written
# as integers, so the tenure component mixes Int and Float64 values.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

10-element Array{Tuple{Int64,Real},1}:
 (83000, 8.7)
 (88000, 8.1)
 (48000, 0.7)
 (76000, 6)
 (69000, 6.5)
 (76000, 7.5)
 (60000, 2.5)
 (83000, 10)
 (48000, 1.9)
 (63000, 4.2)

In [24]:
# Keys are tenures, values are the salaries seen for that tenure. A missing key
# is created on first access with a fresh, empty untyped list.
salary_by_tenure = DefaultDict(() -> Any[])

DefaultDict{Any,Any,var"#27#28"} with 0 entries

In [25]:
# File every salary under its exact tenure value, then display the grouping.
foreach(salaries_and_tenures) do (pay, years)
    push!(salary_by_tenure[years], pay)
end

salary_by_tenure

DefaultDict{Any,Any,var"#27#28"} with 10 entries:
  0.7 => Any[48000]
  6.5 => Any[69000]
  7.5 => Any[76000]
  10  => Any[83000]
  4.2 => Any[63000]
  2.5 => Any[60000]
  8.7 => Any[83000]
  1.9 => Any[48000]
  6   => Any[76000]
  8.1 => Any[88000]

In [26]:
# Mean salary for each distinct tenure value.
average_salary_by_tenure = Dict(
    years => sum(pays) / length(pays) for (years, pays) in salary_by_tenure
)

Dict{Real,Float64} with 10 entries:
  8.1 => 88000.0
  6.5 => 69000.0
  7.5 => 76000.0
  10  => 83000.0
  4.2 => 63000.0
  2.5 => 60000.0
  8.7 => 83000.0
  1.9 => 48000.0
  6   => 76000.0
  0.7 => 48000.0

In [27]:
# Map a tenure to one of three labels: below 2, from 2 up to (not including) 5,
# and 5 or more.
function tenure_bucket(tenure)
    tenure < 2 && return "less than 2"
    tenure < 5 && return "between two and five"
    return "more than five"
end

tenure_bucket (generic function with 1 method)

In [28]:
# Keys are tenure bucket labels, values are the salaries that fall in that bucket.
salary_by_tenure_bucket = DefaultDict(() -> Any[])

DefaultDict{Any,Any,var"#31#32"} with 0 entries

In [29]:
# File every salary under the label of its tenure bucket, then display the grouping.
foreach(salaries_and_tenures) do (pay, years)
    push!(salary_by_tenure_bucket[tenure_bucket(years)], pay)
end
salary_by_tenure_bucket

DefaultDict{Any,Any,var"#31#32"} with 3 entries:
  "less than 2"          => Any[48000, 48000]
  "between two and five" => Any[60000, 63000]
  "more than five"       => Any[83000, 88000, 76000, 69000, 76000, 83000]

In [30]:
# Mean salary per tenure bucket. The loop variable is called `label` so it does
# not share a name with the `tenure_bucket` function.
average_salary_by_bucket = Dict(
    label => sum(pays) / length(pays) for (label, pays) in salary_by_tenure_bucket
)

Dict{String,Float64} with 3 entries:
  "less than 2"          => 48000.0
  "between two and five" => 61500.0
  "more than five"       => 79166.7

In [31]:
# Threshold rule on years of experience: "paid" below 3, "unpaid" from 3 up to
# (not including) 8.5, and "paid" again from 8.5 on.
function predict_paid_or_unpaid(years_experience)
    years_experience < 3 && return "paid"
    years_experience < 8.5 && return "unpaid"
    return "paid"
end

predict_paid_or_unpaid (generic function with 1 method)

In [32]:
# Lowercase every interest, split it on whitespace, and tally the resulting
# words across all users.
words_and_counts = counter(
    token for (_, topic) in interests for token in split(lowercase(topic))
)

Accumulator{Any,Int64} with 39 entries:
  "storm"        => 1
  "deep"         => 1
  "statsmodels"  => 1
  "data"         => 3
  "mongodb"      => 1
  "r"            => 2
  "postgres"     => 1
  "big"          => 3
  "java"         => 3
  "c++"          => 1
  "languages"    => 1
  "spark"        => 1
  "scikit-learn" => 2
  "mahout"       => 1
  "programming"  => 1
  "networks"     => 2
  "decision"     => 1
  "libsvm"       => 1
  "mathematics"  => 1
  "theory"       => 1
  "statistics"   => 2
  "pandas"       => 1
  "haskell"      => 1
  "trees"        => 1
  "nosql"        => 1
  ⋮              => ⋮

In [33]:
# Print every word that occurs more than once, most frequent first, one
# "<word> <count>" pair per line. The explicit separator keeps the count from
# fusing with the word (e.g. "python 3" rather than "python3"), and the loop
# variable is named `n` so it does not shadow `Base.count`.
for (word, n) in sort(collect(words_and_counts), by = last, rev = true)
    if n > 1
        println(word, " ", n)
    end
end

data 3
big 3
java 3
python 3
learning 3
r 2
scikit-learn 2
networks 2
statistics 2
regression 2
neural 2
machine 2
hadoop 2
cassandra 2
hbase 2
probability 2
