# Chapter 1. Introduction

## Practical: Finding Key Connectors

In [1]:
# The members of the network. Ids are assigned by position, so a user's id
# is also their index in this list.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate(
        (
            "Hero", "Dunn", "Sue", "Chi", "Thor",
            "Clive", "Hicks", "Devin", "Kate", "Klein",
        )
    )
]

In [2]:
# Friendship edges as (user_id, user_id) pairs; each friendship is listed once.
friendship_pairs = [
    (0, 1), (0, 2),
    (1, 2), (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6), (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]

In [3]:
# Map every user id to its own, initially empty, list of friend ids.
friendships = dict((user["id"], []) for user in users)

In [4]:
# Populate the friendships.
# Start from empty lists every time this cell runs: appending to lists left
# over from a previous run would record every friend twice and inflate all
# of the friend counts computed from `friendships`.
friendships = {user["id"]: [] for user in users}

for i, j in friendship_pairs:
    # Each pair is listed once, so record the friendship in both directions.
    friendships[i].append(j)
    friendships[j].append(i)

In [5]:
def number_of_friends(user):
    """Return how many friends `user` has.

    `user` is a dict with an "id" key; the count is the length of that id's
    entry in the module-level `friendships` mapping. Raises KeyError if the
    id has no entry.
    """
    return len(friendships[user["id"]])

In [6]:
# Average number of friends per user: total friend count over user count.
num_users = len(users)
total_connections = sum(map(number_of_friends, users))
avg_connections = total_connections / num_users

print(avg_connections)

2.4


In [8]:
# Rank users from most to least connected.
# Each entry is a (user_id, friend_count) pair. The sort is stable, so users
# with the same count stay in their original relative order.
num_friends_by_id = sorted(
    ((user["id"], number_of_friends(user)) for user in users),
    key=lambda id_and_count: id_and_count[1],
    reverse=True,
)

print(num_friends_by_id)

[(1, 3), (2, 3), (3, 3), (5, 3), (8, 3), (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]


In [9]:
# The Friend of Friend effect
from collections import Counter

def friends_of_friends(user):
    """Count the friends-of-friends of `user`.

    Returns a Counter mapping each friend-of-a-friend id to the number of
    `user`'s friends who are friends with that person. The user themselves
    and their direct friends are left out.
    """
    user_id = user["id"]
    direct_friends = friendships[user_id]

    foaf_counts = Counter()
    for friend_id in direct_friends:
        for foaf_id in friendships[friend_id]:
            # Skip the user and anyone they are already friends with.
            if foaf_id == user_id or foaf_id in direct_friends:
                continue
            foaf_counts[foaf_id] += 1
    return foaf_counts


In [10]:
# Friends-of-friends of the user at index 3, keyed by user id.
print(friends_of_friends(user=users[3]))

Counter({0: 2, 5: 1})


### Finding people with similar interests

In [11]:
# (user_id, interest) pairs, grouped by user id in ascending order.
interests = [
    # user 0
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    # user 1
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"),
    # user 2
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"),
    # user 3
    (3, "R"), (3, "Python"), (3, "statistics"),
    (3, "regression"), (3, "probability"),
    # user 4
    (4, "machine learning"), (4, "regression"),
    (4, "decision trees"), (4, "libsvm"),
    # user 5
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"),
    # user 6
    (6, "statistics"), (6, "probability"),
    (6, "mathematics"), (6, "theory"),
    # user 7
    (7, "machine learning"), (7, "scikit-learn"),
    (7, "Mahout"), (7, "neural networks"),
    # user 8
    (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"),
    # user 9
    (9, "Hadoop"), (9, "Java"),
    (9, "MapReduce"), (9, "Big Data"),
]

In [14]:
# Naive approach: scan the whole interests list on every lookup
def data_scientist_who_like(target_interest):
    """Return the ids of all users who list `target_interest`.

    Scans the module-level `interests` list of (user_id, interest) pairs and
    returns the matching user ids in the order they appear there. The match
    is an exact string comparison.
    """
    matching_user_ids = []
    for user_id, interest in interests:
        if interest == target_interest:
            matching_user_ids.append(user_id)
    return matching_user_ids

# Ids of the users who list "Python" among their interests.
print(data_scientist_who_like(target_interest="Python"))

[2, 3, 5]


In [15]:
# the proper way, with an index
from collections import defaultdict

# Build both lookup tables in one pass over the (user_id, interest) pairs.
user_ids_by_interest = defaultdict(list)  # interest -> ids of users listing it
interests_by_user_id = defaultdict(list)  # user id  -> interests of that user

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_user_id[user_id].append(interest)

In [16]:
# Find the users who share the most interests with a given user
def most_common_interests_with(user):
    """Count how many interests every other user shares with `user`.

    Returns a Counter mapping other user ids to the number of `user`'s
    interests that they also list. `user` itself is excluded.
    """
    user_id = user["id"]

    shared_counts = Counter()
    for interest in interests_by_user_id[user_id]:
        for interested_user_id in user_ids_by_interest[interest]:
            if interested_user_id == user_id:
                continue
            shared_counts[interested_user_id] += 1
    return shared_counts

In [22]:
# For each user, report how many other users share at least one interest
# with them, followed by the per-user counts of shared interests.
for user in users:
    top_interest = most_common_interests_with(user)
    print(
        "for user ",
        user["name"],
        " common person ",
        len(top_interest),
    )
    print(top_interest)

for user  Hero  common person  4
Counter({9: 3, 1: 2, 8: 1, 5: 1})
for user  Dunn  common person  1
Counter({0: 2})
for user  Sue  common person  3
Counter({3: 1, 5: 1, 7: 1})
for user  Chi  common person  4
Counter({5: 2, 6: 2, 2: 1, 4: 1})
for user  Thor  common person  2
Counter({7: 1, 3: 1})
for user  Clive  common person  4
Counter({3: 2, 2: 1, 0: 1, 9: 1})
for user  Hicks  common person  1
Counter({3: 2})
for user  Devin  common person  3
Counter({4: 1, 2: 1, 8: 1})
for user  Kate  common person  3
Counter({7: 1, 0: 1, 9: 1})
for user  Klein  common person  3
Counter({0: 3, 5: 1, 8: 1})
