# Social Network Lab: finding connections, counts, sums and others [Basic Python For Data Science]

This lab assumes a social network-like environment with users and connections. We will setup our own basic dataset to perform some basic operations. Most examples and snippets belong to https://www.amazon.de/Data-Science-Scratch-Joel-Grus/dp/149190142X I added some comments and extra code for better understanding.


### Find connections

In [280]:
# The list of the website users
# Notice the structure, users is a list of dictionaries
users = [
    {'id':0,'name':'Hero'},
    {'id':1,'name':'Dunn'},
    {'id':2,'name':'Sue'},
    {'id':3,'name':'Chi'},
    {'id':4,'name':'Thor'},
    {'id':5,'name':'Clive'},
    {'id':6,'name':'Hicks'},
    {'id':7,'name':'Devin'},
    {'id':8,'name':'Kate'},
    {'id':9,'name':'Klein'},
]

# The friendship data, this is, the network connections as a pair of userId:friendId
friendships = [
    (0,1),(0,2),(1,2),(1,3),(2,3),(3,4),
    (4,5),(5,6),(5,7),(6,8),(7,8),(8,9)
];

#Add a list of friends to each user
for user in users:
    user['friends'] = []
    
# Fill in the list using the friendship data
for i,j in friendships: #i and j are integers and also represent users ids
    users[i]['friends'].append(users[j]) # i is now in the j friends list
    users[j]['friends'].append(users[i]) # and the other way around

print(users)

[{'id': 0, 'name': 'Hero', 'friends': [{'id': 1, 'name': 'Dunn', 'friends': [{...}, {'id': 2, 'name': 'Sue', 'friends': [{...}, {...}, {'id': 3, 'name': 'Chi', 'friends': [{...}, {...}, {'id': 4, 'name': 'Thor', 'friends': [{...}, {'id': 5, 'name': 'Clive', 'friends': [{...}, {'id': 6, 'name': 'Hicks', 'friends': [{...}, {'id': 8, 'name': 'Kate', 'friends': [{...}, {'id': 7, 'name': 'Devin', 'friends': [{...}, {...}]}, {'id': 9, 'name': 'Klein', 'friends': [{...}]}]}]}, {'id': 7, 'name': 'Devin', 'friends': [{...}, {'id': 8, 'name': 'Kate', 'friends': [{'id': 6, 'name': 'Hicks', 'friends': [{...}, {...}]}, {...}, {'id': 9, 'name': 'Klein', 'friends': [{...}]}]}]}]}]}]}]}, {'id': 3, 'name': 'Chi', 'friends': [{...}, {'id': 2, 'name': 'Sue', 'friends': [{...}, {...}, {...}]}, {'id': 4, 'name': 'Thor', 'friends': [{...}, {'id': 5, 'name': 'Clive', 'friends': [{...}, {'id': 6, 'name': 'Hicks', 'friends': [{...}, {'id': 8, 'name': 'Kate', 'friends': [{...}, {'id': 7, 'name': 'Devin', 'frien

### Total and average number of collections

In [281]:
total_connections = 0
for user in users:
    user_connections = len(user['friends'])
    total_connections += user_connections
    print("{} has {} total friend(s)\n".format(user['name'], user_connections))

Hero has 2 total friend(s)

Dunn has 3 total friend(s)

Sue has 3 total friend(s)

Chi has 3 total friend(s)

Thor has 2 total friend(s)

Clive has 3 total friend(s)

Hicks has 2 total friend(s)

Devin has 2 total friend(s)

Kate has 3 total friend(s)

Klein has 1 total friend(s)



In [282]:
# average connections
from __future__ import division
avg_connections = total_connections / len(users)

print(avg_connections)

2.4


In [283]:
# sort users by nr of connections
nr_friends_by_id = [(user['id'], len(user['friends'])) for user in users]
print(nr_friends_by_id) # the unsorted list

# Use sorted with a lambda function that returns a tuple (num_friends, user_id), 
# but since you want to sort the number of friends in descending order, you need to negate num_friends.
sorted_nr_friends_by_id = sorted(nr_friends_by_id, key=lambda item: (-item[1], item[0]))

print(sorted_nr_friends_by_id)


[(0, 2), (1, 3), (2, 3), (3, 3), (4, 2), (5, 3), (6, 2), (7, 2), (8, 3), (9, 1)]
[(1, 3), (2, 3), (3, 3), (5, 3), (8, 3), (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]


### People you may know

In [284]:
# people you may know are connections of your connections
def friends_of_friends(user):
    #foaf = friends of a friend
    return [foaf['id']
           for friend in user['friends']  # for each user's friend
            for foaf in friend['friends'] # get each of their friends
           ]

print(friends_of_friends(users[0])) ## People user 0 may know
# [0, 2, 3, 0, 1, 3] the repetition is due to the fact that some people are reachable through many other people



[0, 2, 3, 0, 1, 3]


In [285]:
from collections import Counter

def not_the_same(one, two):
    """ Two users are different if the have different ids"""
    return one['id']!=two['id']

def not_friends(one, two):
    """ Two users are not friends if one if not in two's friends list"""
    return all(not_the_same(one, two) for friend in user['friends'])

def friends_of_friend_ids(user):
    return Counter(foaf['id']
                  for friend in user['friends'] #foreach of my friends
                   for foaf in friend['friends'] # count their friends
                   if not_the_same(user, foaf) # who are not me
                   and not_friends(user, foaf) # and are not my friends
                  )

print(friends_of_friend_ids(users[3])) #User "Chi" has two mutual friends with user 0, 1 with 2, 1 with one and 1 with 5


Counter({0: 2, 2: 1, 1: 1, 5: 1})


### People with mutual interests

In [286]:
# A list of tuples with userID -> interest
interests = [
    (0, 'Python'),(0, 'Big Data'),(0, 'Flask'),(0, 'ReactJs'),
    (1, 'Java'),(1, 'NumPy'),(1, 'Statistics'),(1, 'Flask'),
    (2, 'MatPlotLib'),(2, 'AI'),(2, 'Python'),(2, 'Machine Learning'),
    (3, 'Big Data'),(3, 'Regression'),(3, 'AI'),(3, 'Flask'),
    (4, 'Java'),(4, 'NumPy'),(4, 'ReactJs'),(4, 'Machine Learning'),
    (5, 'AI'),(5, 'Python'),(5, 'MatPlotLib'),(5, 'Flask'),
    (6, 'Java'),(6, 'MatPlotLib'),(6, 'NumPy'),(6, 'Machine Learning'),
    (7, 'Big Data'),(7, 'Deep Learning'),(7, 'Python'),(7, 'Flask'),
    (8, 'Machine Learning'),(8, 'AI'),(8, 'Python'),(8, 'NumPy'),
    (9, 'deep learning'),(9, 'ReactJs'),(9, 'java'),(9, 'statistics'),
]

#find users with certain interests
def users_with_interest(target_interest):
    return [user_id
           for user_id, user_interest in interests
            if user_interest == target_interest
           ]

# Works, but we need to read all users and all interest every time
print(users_with_interest('Java'))

# Build an index of interest => user
from collections import defaultdict
#keys are interests, values are lists of users_ids with that interest
user_ids_by_interest = defaultdict(list)

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

print(user_ids_by_interest['ReactJs'])

# another index, interest => user id
interests_by_user_id = defaultdict(list)


      
for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)

print(interests_by_user_id[9])

def most_common_interests_with(user):
    return Counter(interested_user_id
                  for interest in interests_by_user_id[user['id']]
                   for interested_user_id in user_ids_by_interest[interest]
                   if interested_user_id != user['id']
                  )

print(most_common_interests_with(users[1]))

[1, 4, 6]
[0, 4, 9]
['deep learning', 'ReactJs', 'java', 'statistics']
Counter({4: 2, 6: 2, 8: 1, 0: 1, 3: 1, 5: 1, 7: 1})


### Salaries and Experience

In [287]:
# salaries and experience in years
salary_experience = [
    (83000,8.7),(88000,8.1),(48000,0.7),
    (76000,6),(69000,6.5),(76000,7.5),
    (60000,2.5),(83000,10),(48000,1.9),(63000,4.2)
]
# Let's look at the avg salary per years of xp
salary_by_tenure = defaultdict(list)

for salary, tenure in salary_experience:
    salary_by_tenure[tenure].append(salary)

print('\nSalary by tenure: ')
print(salary_by_tenure)

average_salary_by_tenure = {
    tenure : sum(salaries)/len(salaries)
    for tenure, salaries in salary_by_tenure.items()
}

print('\nAverage by tenure: ')
print(average_salary_by_tenure)

#Let's make it more readable for the users
def tenure_bucket(tenure):
    if tenure < 2:
        return 'less than 2 years'
    elif tenure < 5:
        return 'between 2 and 5 years'
    else:
        return 'more than 5 years'

# Group salaries together
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salary_experience:
     bucket = tenure_bucket(tenure)
     salary_by_tenure_bucket[bucket].append(salary)

# Now compute the avg salary for each group
average_salary_by_bucket = {
    tenure_bucket : sum(salaries)/len(salaries)
    #AttributeError: 'collections.defaultdict' object has no attribute 'iteritems'
    for tenure_bucket, salaries in salary_by_tenure_bucket.items()
}
print('\n Average salary by bucket: ')
print(average_salary_by_bucket)


Salary by tenure: 
defaultdict(<class 'list'>, {8.7: [83000], 8.1: [88000], 0.7: [48000], 6: [76000], 6.5: [69000], 7.5: [76000], 2.5: [60000], 10: [83000], 1.9: [48000], 4.2: [63000]})

Average by tenure: 
{8.7: 83000.0, 8.1: 88000.0, 0.7: 48000.0, 6: 76000.0, 6.5: 69000.0, 7.5: 76000.0, 2.5: 60000.0, 10: 83000.0, 1.9: 48000.0, 4.2: 63000.0}

 Average salary by bucket: 
{'more than 5 years': 79166.66666666667, 'less than 2 years': 48000.0, 'between 2 and 5 years': 61500.0}


### Topics Of Interest

In [288]:
# A simple way to find the most popular topics is to count the words
# Split words, change to lowercase and count
word_count = Counter(word
                    for user, interest in interests
                    for word in interest.lower().split())

for word, count in word_count.most_common():
    if count > 1:
        print("Word: {} Count: {}".format(word, count))

Word: learning Count: 6
Word: python Count: 5
Word: flask Count: 5
Word: java Count: 4
Word: numpy Count: 4
Word: ai Count: 4
Word: machine Count: 4
Word: big Count: 3
Word: data Count: 3
Word: reactjs Count: 3
Word: matplotlib Count: 3
Word: statistics Count: 2
Word: deep Count: 2
