# Introduction

## Finding Key Connectors

In [93]:
# One record per member of the network: an integer "id" and a display "name".
# The ids run 0..9 in list order, so users[i] is the user whose id is i; a
# later cell relies on this when it indexes `users` with the ids stored in
# `friendships`.
users = [ 
    { "id": 0, "name": "Hero" }, 
    { "id": 1, "name": "Dunn" }, 
    { "id": 2, "name": "Sue" }, 
    { "id": 3, "name": "Chi" }, 
    { "id": 4, "name": "Thor" }, 
    { "id": 5, "name": "Clive" }, 
    { "id": 6, "name": "Hicks" }, 
    { "id": 7, "name": "Devin" }, 
    { "id": 8, "name": "Kate" }, 
    { "id": 9, "name": "Klein" }
]

In [94]:
# Friendship edges as (id, id) pairs. Each pair is listed once; the cell that
# consumes this list records the friendship on both users.
friendships = [(0,1), (0,2), (1,2), (1,3), (2,3), (3,4),
           (4,5), (5,6), (5,7), (6,8), (7,8), (8,9)]

In [95]:
# Give every user dict a fresh, empty "friends" list. Re-running this cell
# discards any friends that were already attached.
for user in users:
    user["friends"] = []

In [96]:
# Record each friendship on both users. The lists hold the user dicts
# themselves, not ids, and `i` / `j` are used as positions in `users`.
# NOTE: entries are appended, so re-running this cell without first
# resetting the "friends" lists adds every friend a second time.
for i, j in friendships:
    users[i]["friends"].append(users[j])
    users[j]["friends"].append(users[i])

In [97]:
def number_of_friends(user):
    """Return the number of entries in the user's "friends" list."""
    friends = user["friends"]
    return len(friends)

#### *Average number of friends*

In [104]:
# Mean friend count: total of all per-user friend counts over the number of users.
sum(map(number_of_friends, users)) / len(users)

2.4

#### *Most connected people*

In [99]:
# (user id, friend count) pairs, in the same order as `users`.
# The count comes from number_of_friends() so the definition of
# "how many friends" lives in exactly one place.
num_friends_by_id = [(user["id"], number_of_friends(user)) for user in users]

In [100]:
# Rank the (id, friend count) pairs from most to fewest friends. The sort is
# stable, so pairs with equal counts keep their order from num_friends_by_id.
sorted(num_friends_by_id,
       key=lambda id_and_count: id_and_count[1],
       reverse=True)

[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

## Data Scientists You May Know

In [101]:
def friends_of_friend_ids_bad(user):
    """Return the ids of every friend of each of the user's friends.

    Naive version: the result is a flat list that keeps duplicates and does
    not filter out the user or the user's own direct friends.
    """
    foaf_ids = []
    for friend in user["friends"]:
        # append the ids of this friend's friends, preserving their order
        foaf_ids.extend(foaf["id"] for foaf in friend["friends"])
    return foaf_ids

In [102]:
# Demonstrates the flaw: for user 0 the result contains user 0 itself, its
# direct friends 1 and 2, and repeated ids.
friends_of_friend_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

In [105]:
from collections import Counter # not loaded by default

In [108]:
def not_the_same(user, other_user):
    """Return True when the two user records carry different "id" values."""
    user_id = user["id"]
    other_id = other_user["id"]
    return user_id != other_id

In [109]:
def not_friends(user, other_user):
    """Return True if other_user does not appear in user["friends"].

    Membership is decided by id: other_user counts as a friend as soon as
    one entry of user["friends"] fails the not_the_same check against it.
    """
    for friend in user["friends"]:
        if not not_the_same(friend, other_user):
            return False  # found an entry with the same id
    return True

In [110]:
def friends_of_friend_ids(user):
    """Count the friends-of-friends of user, keyed by id.

    Returns a Counter mapping each friend-of-a-friend id to the number of
    the user's friends through whom that person is reached. The user and
    the user's direct friends are left out.
    """
    counts = Counter()
    for friend in user["friends"]:        # each of my friends
        for foaf in friend["friends"]:    # each of *their* friends
            if not not_the_same(user, foaf):
                continue                  # that's me
            if not not_friends(user, foaf):
                continue                  # already a direct friend
            counts[foaf["id"]] += 1
    return counts

In [111]:
# Each count is the number of user 3's friends who are also friends with that id.
print(friends_of_friend_ids(users[3])) # Counter({0: 2, 5: 1})

Counter({0: 2, 5: 1})


## Salaries and Experience

In [112]:
# (salary, tenure) pairs, one per data point; later cells unpack them in
# that order. Presumably annual salary in dollars and tenure in years --
# TODO confirm units against the data source.
salaries_and_tenures = [(83000, 8.7), (88000, 8.1),
                        (48000, 0.7), (76000, 6),
                        (69000, 6.5), (76000, 7.5),
                        (60000, 2.5), (83000, 10),
                        (48000, 1.9), (63000, 4.2)]

In [116]:
from collections import defaultdict
# Maps each tenure value to the list of salaries recorded with that tenure.
salary_by_tenure = defaultdict(list)

In [117]:
# Group salaries by exact tenure value. Every tenure in salaries_and_tenures
# is distinct, so on a single run each list ends up holding one salary.
for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

In [120]:
# Mean salary for each distinct tenure value.
average_salary_by_tenure = {
    years: sum(pay) / len(pay)
    for years, pay in salary_by_tenure.items()
}

In [None]:
# NOTE(review): this cell has no execution count and repeats the expression
# already stored in average_salary_by_tenure; its value is not assigned to
# anything. Candidate for removal.
{ tenure : sum(salaries) / len(salaries) for tenure, salaries in salary_by_tenure.items() }

In [121]:
# Show the per-tenure averages. With one salary per tenure value they simply
# repeat the raw salaries, which is why the next section groups into buckets.
average_salary_by_tenure

{8.7: 83000.0,
 8.1: 88000.0,
 0.7: 48000.0,
 6: 76000.0,
 6.5: 69000.0,
 7.5: 76000.0,
 2.5: 60000.0,
 10: 83000.0,
 1.9: 48000.0,
 4.2: 63000.0}

In [122]:
def tenure_bucket(tenure):
    """Map a tenure value to one of three bucket labels.

    Below 2 gives "less than two"; from 2 up to (but not including) 5 gives
    "between two and five"; everything else, including exactly 5, gives
    "more than five".
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"
    
# Maps each bucket label to the list of salaries whose tenure falls in it.
salary_by_tenure_bucket = defaultdict(list)

# Assign every (salary, tenure) pair to the bucket chosen by tenure_bucket().
for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# Mean salary per tenure bucket. The comprehension variable is named `bucket`
# rather than `tenure_bucket` so it cannot be confused with (or shadow) the
# tenure_bucket() function.
average_salary_by_bucket = {
    bucket: sum(salaries) / len(salaries)
    for bucket, salaries in salary_by_tenure_bucket.items()
}

In [123]:
# Show the mean salary for each tenure bucket.
average_salary_by_bucket

{'more than five': 79166.66666666667,
 'less than two': 48000.0,
 'between two and five': 61500.0}

## Paid Accounts

In [124]:
def predict_paid_or_unpaid(years_experience):
    """Return "paid" or "unpaid" from years of experience using fixed cut-offs.

    Below 3.0 gives "paid"; from 3.0 up to (but not including) 8.5 gives
    "unpaid"; 8.5 and above gives "paid".
    """
    if years_experience < 3.0:
        return "paid"
    if years_experience < 8.5:
        return "unpaid"
    return "paid"

## Topics of Interest

In [125]:
# (user id, interest) pairs; a user appears once per interest they list.
# The ids span 0..9 and presumably refer to the entries of `users` --
# verify against the data source.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

In [129]:
# Number of times each interest string appears in `interests`, matched on the
# exact (case-sensitive) text. Counter (imported earlier in the notebook)
# replaces the hand-rolled defaultdict(int) loop and its unused user-id
# variable; missing keys still read as 0.
interest_counts = Counter(interest for _, interest in interests)

# Show the (interest, count) pairs ordered by interest string; uppercase
# letters sort before lowercase ones.
sorted(interest_counts.items())

[('Big Data', 3),
 ('C++', 1),
 ('Cassandra', 2),
 ('HBase', 2),
 ('Hadoop', 2),
 ('Haskell', 1),
 ('Java', 3),
 ('Mahout', 1),
 ('MapReduce', 1),
 ('MongoDB', 1),
 ('NoSQL', 1),
 ('Postgres', 1),
 ('Python', 3),
 ('R', 2),
 ('Spark', 1),
 ('Storm', 1),
 ('artificial intelligence', 1),
 ('decision trees', 1),
 ('deep learning', 1),
 ('libsvm', 1),
 ('machine learning', 2),
 ('mathematics', 1),
 ('neural networks', 2),
 ('numpy', 1),
 ('pandas', 1),
 ('probability', 2),
 ('programming languages', 1),
 ('regression', 2),
 ('scikit-learn', 2),
 ('scipy', 1),
 ('statistics', 2),
 ('statsmodels', 1),
 ('theory', 1)]

In [133]:
# Count individual lowercase words across all interests, splitting each
# interest on whitespace (so "Big Data" contributes "big" and "data").
words_and_counts = Counter(
    token
    for _, interest in interests
    for token in interest.lower().split()
)

# Print only the words that occur more than once, most frequent first.
for word, count in words_and_counts.most_common():
    if count <= 1:
        continue
    print(word, count)

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2
