# Chapter 1 Introduction

In [82]:
# One dict per user; ids are assigned by position in the name list.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate([
        "Hero", "Dunn", "Sue", "Chi", "Thor",
        "Clive", "Hicks", "Devin", "Kate", "Klein",
    ])
]

In [83]:
# Friendships as (user_id, user_id) pairs,
# e.g. (0, 1) says Hero and Dunn are friends.
friendships = [
    (0, 1), (0, 2),
    (1, 2), (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6), (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]

In [84]:
# Give every user a fresh, empty "friends" list.
for user in users:
    user.update(friends=[])

In [85]:
# Register each friendship in both directions: i gains j as a friend, then j gains i.
for i, j in friendships:
    for owner_id, friend_id in ((i, j), (j, i)):
        users[owner_id]["friends"].append(users[friend_id])

In [86]:
def number_of_friends(user):
    """Return the length of the list stored under user["friends"]."""
    friends = user["friends"]
    return len(friends)

In [87]:
# Add up the friend-list lengths of all users.
total_connections = sum(map(number_of_friends, users))
print("total number of connections:", total_connections)

total number of connections: 24


In [88]:
from __future__ import division
# Average friends per user: the summed friend-list lengths divided by the user count.
num_users = len(users)
avg_connections = total_connections / num_users
print("average number of connections:",avg_connections)

average number of connections: 2.4


### Sorting from most friends to least friends


In [89]:
# Build a list of (user_id, number_of_friends) pairs, one per user, in `users` order.
num_friends_by_id = []
for user in users:
    num_friends_by_id.append((user["id"], number_of_friends(user)))

In [90]:
# Rank the (user_id, friend_count) pairs by friend count, highest first.
sorted(
    num_friends_by_id,
    key=lambda id_and_count: id_and_count[1],  # sort on the friend count
    reverse=True,
)

[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

## Friends you might know (suggestions)

In [91]:
def friends_of_friends_ids_bad(user):
    """Return the ids of every friend-of-a-friend (foaf) of user.

    Naive version: nothing is filtered out, so the result can contain
    repeated ids, ids of user's direct friends, and user's own id.
    """
    foaf_ids = []
    for friend in user['friends']:       # each of the user's friends
        for foaf in friend['friends']:   # each of that friend's friends
            foaf_ids.append(foaf["id"])
    return foaf_ids

In [92]:
# Friends-of-friends for Hero (users[0]); the unfiltered result repeats ids and includes Hero's own id 0.
friends_of_friends_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

In [93]:
# Show the friend ids of the first three users, one list per line.
for user_index in range(3):
    print([friend['id'] for friend in users[user_index]['friends']])

[1, 2]
[0, 2, 3]
[0, 1, 3]


In [94]:
from collections import Counter

In [181]:
def not_the_same(user, other_user):
    """Return True when the two user dicts hold different 'id' values."""
    user_id, other_id = user['id'], other_user['id']
    return user_id != other_id

In [182]:
def not_friends(user, other_user):
    """Return True when other_user is not among user['friends'].

    other_user counts as a friend as soon as one entry of user['friends']
    fails the not_the_same check against it. An empty friends list
    therefore yields True.
    """
    for friend in user['friends']:
        if not not_the_same(friend, other_user):
            return False
    return True

In [187]:
def friends_of_friend_id(user):
    """Count mutual friends between user and each friend-of-a-friend.

    Returns a Counter keyed by foaf id. Each value is the number of user's
    friends whose friends list contains that foaf. The user and the
    user's direct friends are left out.
    """
    mutual_counts = Counter()
    for friend in user['friends']:          # for each of my friends
        for foaf in friend['friends']:      # look at their friends
            # skip myself and people who are already my friends
            if not_the_same(user, foaf) and not_friends(user, foaf):
                mutual_counts[foaf['id']] += 1
    return mutual_counts

### This shows Chi (users[3]) has 2 mutual friends with users[0] and 1 mutual friend with users[5]

In [190]:
# Mutual-friend counts for Chi (users[3]), keyed by the other user's id.
print(friends_of_friend_id(users[3]))

Counter({0: 2, 5: 1})


In [192]:
# Flat list of (user_id, interest) pairs. The interests are written grouped
# per user and then flattened, keeping each user's interests in order.
interests = [
    (user_id, topic)
    for user_id, topics in [
        (0, ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"]),
        (1, ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]),
        (2, ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"]),
        (3, ["R", "Python", "statistics", "regression", "probability"]),
        (4, ["machine learning", "regression", "decision trees", "libsvm"]),
        (5, ["Python", "R", "Java", "C++", "Haskell", "programming languages"]),
        (6, ["statistics", "probability", "mathematics", "theory"]),
        (7, ["machine learning", "scikit-learn", "Mahout", "neural networks"]),
        (8, ["neural networks", "deep learning", "Big Data", "artificial intelligence"]),
        (9, ["Hadoop", "Java", "MapReduce", "Big Data"]),
    ]
    for topic in topics
]

In [193]:
def data_scientists_who_like(target_interest):
    """Return the user ids paired with target_interest in `interests`.

    Scans the whole `interests` list on every call. Ids appear in list
    order, and an interest nobody has yields an empty list.
    """
    matching_ids = []
    for user_id, user_interest in interests:
        if user_interest == target_interest:
            matching_ids.append(user_id)
    return matching_ids

### This works if the list is small; however, it is not very efficient, because every call scans the whole `interests` list

In [195]:
from collections import defaultdict

In [198]:
# Keys are interests, values are lists of user_ids with that interest.
user_ids_by_interests = defaultdict(list)

for user_id, interest in interests:
    # Index by the loop variable `interest`. A quoted string literal here
    # would collect every user_id under one single key.
    user_ids_by_interests[interest].append(user_id)

In [199]:
# Keys are user_ids, values are lists of interests for that user_id.
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
    # Index by the loop variable `user_id`. A quoted string literal here
    # would collect every interest under one single key.
    interests_by_user_id[user_id].append(interest)

In [207]:
def most_common_interests_with(user):
    """Count how many interests each other user shares with user.

    For every interest listed for user['id'] in `interests_by_user_id`,
    looks up the ids recorded for that interest in `user_ids_by_interests`
    and tallies each id other than user's own.

    Returns a Counter mapping other_user_id -> number of shared interests.
    It is empty when the user has no recorded interests.
    """
    # .get() keeps the lookups read-only: indexing a defaultdict with a
    # missing key would insert an empty list as a side effect.
    return Counter(interested_user_id
                   for interest in interests_by_user_id.get(user['id'], [])
                   # look up by the variable `interest`, not a string literal
                   for interested_user_id in user_ids_by_interests.get(interest, [])
                   if interested_user_id != user['id'])

## Salaries and Experience

In [210]:
# (salary, tenure) pairs, one per line.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6.0),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10.0),
    (48000, 1.9),
    (63000, 4.2),
]

In [211]:
# Keys are tenures (in years); each value is the list of salaries recorded for that tenure.
salary_by_tenure = defaultdict(list)

In [212]:
# Group each salary under its own tenure value. The key must be the loop
# variable `tenure`; the string literal 'tenure' would pile every salary
# into one single list.
for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)


In [219]:
# Keys are tenures (years); each value is the average salary for that tenure.
# The `tenure:` key is required: without it the braces build a set of bare
# averages and the link between tenure and salary is lost.
average_salary_by_tenure = {
    tenure: sum(salaries) / len(salaries)
    for tenure, salaries in salary_by_tenure.items()
}

In [221]:
def tenure_bucket(tenure):
    """Map a tenure value to one of three bucket labels.

    Values below 2 give "less than 2", values below 5 give
    "between 2 and 5", and everything else (including exactly 5)
    gives "more than 5".
    """
    # Upper bounds are checked in ascending order; the first match wins.
    for upper_bound, label in ((2, "less than 2"), (5, "between 2 and 5")):
        if tenure < upper_bound:
            return label
    return "more than 5"

In [222]:
# Keys are tenure-bucket labels; each value is the list of salaries in that bucket.
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure_bucket[tenure_bucket(tenure)].append(salary)

In [223]:
# Keys are tenure buckets; each value is the average salary for that bucket.
average_salary_by_bucket = {
    bucket_label: sum(bucket_salaries) / len(bucket_salaries)
    for bucket_label, bucket_salaries in salary_by_tenure_bucket.items()
}

AttributeError: 'collections.defaultdict' object has no attribute 'iteritems'