In [4]:
# 一組字典(dict)資料
users = [
    { "id": 0, "name": "Hero" },
    { "id": 1, "name": "Dunn" },
    { "id": 2, "name": "Sue" },
    { "id": 3, "name": "Chi" },
    { "id": 4, "name": "Thor" },
    { "id": 5, "name": "Clive" },
    { "id": 6, "name": "Hicks" },
    { "id": 7, "name": "Devin" },
    { "id": 8, "name": "Kate" },
    { "id": 9, "name": "Klein" }
]

# ( , ) 為一個元組(tuple)
friendships = [(0,1), (0,2), (1,2), (1,3), (2,3), (3,4),
           (4,5), (5,6), (5,7), (6,8), (7,8), (8,9)]

# friends 的屬性對應一個空的 list
for user in users:
    user["friends"] = []
    
for i, j in friendships:
    # users[i]就是 id編號為 i 的使用者
    users[i]["friends"].append(users[j])  # 把 j 添加為 i 的朋友
    users[j]["friends"].append(users[i])  # 把 i 添加為 j 的朋友

In [5]:
# 把每個人的朋友數量加總，計算總數量
def number_of_friends(user):
    '''每個 user有幾個朋友?'''
    return len(user["friends"])                     # 鍵值 friends 所對應的列表長度，
                                                    # 就是朋友的數量
total_connections = sum(number_of_friends(user)
                       for user in users)           # 24

# 再除以使用者的總數量
from __future__ import division                     # 預設情況下沒有變數除法的功能
                                                    # 因此要額外導入
num_users = len(users)                              # users 列表的長度 ; 也就是使用者
                                                    # 的數量
avg_connections = total_connections / num_users     # 2.4

In [8]:
# 建立一個列表 (使用者編號 , 朋友數量 )
num_friends_by_id = [(user["id"], number_of_friends(user))
                    for user in users]

sorted(num_friends_by_id,                                   # 進行排序
      key = lambda (user_id, num_friends): num_friends,     # 根據朋友的數量
      reverse = True)                                       # 從最多排到最少

# 每一對資料都是 (使用者編號 , 朋友數量 )
# [(1,3), (2,3), (3,3), (5,3), (8,3),
# (0,2), (4,2), (6,2), (7, 2), (9,1)]
# 作分支中心度

SyntaxError: invalid syntax (<ipython-input-8-6c28865bf810>, line 6)

In [9]:
def friends_of_friend_ids_bad(user):
    # "foaf" 就是 "friend of a friend" (朋友的朋友) 的縮寫
    return [foaf["id"]
           for friend in user["friends"]      # 針對使用者的每一位朋友
           for foaf in friend["friends"]]     # 取得這些朋友們的每一位朋友

print([friend["id"] for friend in users[0]["friends"]])     # [1, 2]
print([friend["id"] for friend in users[1]["friends"]])     # [0, 2, 3]
print([friend["id"] for friend in users[2]["friends"]])     # [0, 1, 3]

[1, 2]
[0, 2, 3]
[0, 1, 3]


In [10]:
# 排除使用者原本就認識的人
from collections import Counter

def not_the_same(user, other_user):
    '''如果兩個使用者id編號不同，就不是同一個使用者'''
    return user["id"] != other_user["id"]

def not_friends(user, other_user):
    '''如果other_user不在user["friends"]裡頭，他就不是朋友 ; 
    也就是說，他和user["friends"]裡的所有人都 not_the_same(不是同一個使用者)'''
    return all(not_the_same(friend, other_user)
              for friend in user["friends"])

def friends_of_friend_ids(user):
    return Counter(foaf["id"]
                  for friend in user["friends"]     # 針對我所有的朋友 
                  for foaf in friend["friends"]     # 計算「他們的」朋友
                  if not_the_same(user, foaf)       # 既不是我自己
                  and not_friends(user, foaf))      # 也不是我原本就認識的朋友

print(friends_of_friend_ids(users[3]))              # Counter({0: 2, 5: 1})

Counter({0: 2, 5: 1})


In [11]:
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"), 
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"), 
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"), 
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"), 
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"), 
    (3, "statistics"), (3, "regression"), (3, "probability"), 
    (4, "machine learning"), (4, "regression"), (4, "decision trees"), 
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"), 
    (6, "probability"), (6, "mathematics"), (6, "theory"), 
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"), 
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"), 
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

# 建立共同興趣的函式
def data_scientists_who_like(target_interest):
    return [user_id
           for user_id, user_interest in interests
           if user_interest == target_interest]

In [14]:
# 興趣對應到使用者的索引
from collections import defaultdict

# 鍵值是 interest (興趣)，值則是具有該興趣的使用者編號列表
user_ids_by_interest = defaultdict(list)

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

# 使用者對應到興趣的索引
# 鍵值是 user_id (使用者編號)，值則是該使用者的興趣列表
interests_by_user_id = defaultdict(list)
for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)
    
# 找出共同興趣
def most_commmon_interests_with(user):
    for interest in interests_by_user_id[user["id"]]:
    for interested_user_id in user_ids_by_interest[interest]:
    if interested_user_id != user["id"]

IndentationError: expected an indented block (<ipython-input-14-01819ffa1f50>, line 19)

In [24]:
from collections import defaultdict

salaries_and_tenures = [ (83000, 8.7), (88000, 8.1),
                        (48000, 0.7), (76000, 6),
                        (69000, 6.5), (76000, 7.5),
                        (60000, 2.5), (83000, 10),
                        (48000, 1.9), (63000, 4.2) ]

# 鍵值是 tenure(年資)，值則是每個年資對應到的薪水列表
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)
    
# 鍵值是 tenure(年資)，值則是相應年資所對應到的平均薪水
average_salary_by_tenure = {
    tenure : sum(salaries) / len(salaries)
    for tenure, salaries in salary_by_tenure.items()
}

print(sorted(salaries_and_tenures))
print(sorted(salary_by_tenure))

[(48000, 0.7), (48000, 1.9), (60000, 2.5), (63000, 4.2), (69000, 6.5), (76000, 6), (76000, 7.5), (83000, 8.7), (83000, 10), (88000, 8.1)]
[0.7, 1.9, 2.5, 4.2, 6, 6.5, 7.5, 8.1, 8.7, 10]


In [29]:
def tenure_bucket(tenure):
    if tenure < 2:
        return "less than two"
    elif tenure < 5:
        return "betwween two and five"
    else:
        return "more than five"
    
# 鍵值是 tenure bucket(年資分組)，值則是相應分組所對應到的薪水列表
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)
    
# 鍵值是 tenure bucket(年資分組)，值則是相應分組所對應的平均薪水
average_salary_by_bucket = {
    tenure_bucket : sum(salaries) / len(salaries)
    for tenure_bucket, salaries in salary_by_tenure_bucket.iteritems()    
}

print(tenure_bucket)

AttributeError: 'collections.defaultdict' object has no attribute 'iteritems'

In [33]:
# 建立年資中間者的付款傾向
def predict_paid_or_unpaid(years_expensive):
    if years_experience < 3.0:
        return "paid"
    elif years_experience < 8.5:
        return "unpaid"
    else:
        return "paid"
    
print(predict_paid_or_unpaid)

<function predict_paid_or_unpaid at 0x00000233A304F2F0>


In [34]:
words_and_counts = Counter(word
                          for user, interest in interests
                          for word in  interest.lower().split())

for word, count in words_and_counts.most_common():
    if count > 1:
        print (word, count)

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2
