<a href="https://colab.research.google.com/github/robabsmith/CALDISS-SDS-PhD-school/blob/master/Day_03_04_Portfolio_networks_exercise_instagram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preamble

In [0]:
# Standard stuff
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import itertools # Python's amazing iteration & combination library

In [0]:
# For visualization
!pip install -U bokeh
!pip install -q holoviews

# Import the libraries and link to the bokeh backend
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show

# Make every network plot larger by default and hide both axes
defaults = {
    'width': 750,
    'height': 750,
    'padding': 0.1,
    'xaxis': None,
    'yaxis': None,
}
# Apply the same defaults to each element type used when drawing graphs
hv.opts.defaults(
    *(element(**defaults) for element in (opts.EdgePaths, opts.Graph, opts.Nodes))
)

In [0]:
# Network Stuff
import networkx as nx
import community # `python-louvain` is implemented here
from networkx.algorithms import bipartite # bipartite NW algos

In [0]:
# Blockmodel Stuff
!wget https://github.com/CALDISS-AAU/sdsphd19_coursematerials/raw/master/wednesday_network-blockmodeling/blockmodeling_material.zip # downloading module and data files to googe drive session
!unzip 'blockmodeling_material.zip' # unzipping

# import the necessary modules
import blockmodeling as bm
import matplotlib.pyplot as plt
import scipy as sc
import scipy.cluster.hierarchy as sch

In [0]:
# API, scraping & Instagram
!pip3 install instaloader # Installing instaloader
import instaloader
L = instaloader.Instaloader()

import requests as rq # The requests library handles "requests" to APIs similar to a browser that requests a webpage given a URL
from nltk.tokenize import TweetTokenizer # A bit of a transition into NLP. The tweet tokenizer from the NLTK library will help us extract the hashtags from post-text
tknzr = TweetTokenizer()


# Task

So guys, now it's time to put it all together. Take the two notebooks by Daniel and Carl (linked below), and do the following:

1. Extract Instagram Tag infos of your choice
2. Generate a bipartite User-Tag network
3. Project it on either the user or the tag mode (your choice)
4. Apply blockmodeling on it 

* [Networks general: Daniel](https://colab.research.google.com/github/CALDISS-AAU/sdsphd19_coursematerials/blob/master/notebooks/CALDISS_PHD_Intro_networks.ipynb#&offline=true&sandboxMode=true)
* [Blockmodels: Carl](https://colab.research.google.com/github/CALDISS-AAU/sdsphd19_coursematerials/blob/master/wednesday_network-blockmodeling/Lab_Blockmodeling.ipynb#&offline=true&sandboxMode=true)

# Getting the data

In [0]:
# Common root shared by all Instagram URLs built below
_INSTAGRAM_ROOT = 'https://www.instagram.com/'

# Prefix of the "explore by tag" page; the tag name is appended to it
tagurl_prefix = _INSTAGRAM_ROOT + 'explore/tags/'

# Suffix appended to a tag URL to retrieve the data in JSON format
tagurl_suffix = '/?__a=1'

# Query parameter that carries the end cursor when requesting further posts by tag
tagurl_endcursor = '&max_id='

# Generic media post prefix (concatenate with a media shortcode to view the post)
posturl_prefix = _INSTAGRAM_ROOT + 'p/'

In [0]:
#
# Find your own Instagram tag to explore!
# `tags` is the list of seed hashtags (without the leading '#').
#
tags = ['namaalborg']

In [0]:
# Build one initial request URL per seed tag: prefix + tag + JSON suffix
queries = [''.join((tagurl_prefix, tag, tagurl_suffix)) for tag in tags]

In [0]:
# Download the raw posts ("edges") for every seed-tag query, following the
# pagination cursor for a fixed number of pages per tag.
edges = []
for base_query in queries:
    # The first page is requested without a cursor.
    page_query = base_query
    for i in range(5): # how many iterations/depth?
        r = rq.get(page_query).json()
        media = r['graphql']['hashtag']['edge_hashtag_to_media']
        edges.extend(media['edges'])
        print(i)

        page_info = media['page_info']
        end_cursor = page_info['end_cursor']
        # Stop when there is nothing more to fetch; a missing cursor would
        # otherwise break the string concatenation below.
        if not end_cursor or not page_info.get('has_next_page', True):
            break
        # Always build the next URL from the base query, so that only ONE
        # max_id parameter is sent (appending to the previous URL would
        # accumulate a new '&max_id=' on every iteration).
        page_query = base_query + tagurl_endcursor + end_cursor

In [0]:
# Turn the raw posts into a list of flat dictionaries with the keys
# 'id_owner', 'shortcode', 'tags' and 'text' (one dictionary per post).
post_dicts = []

for post in edges: # iterate all raw posts
  node = post['node']
  caption_edges = node['edge_media_to_caption']['edges']

  if not caption_edges: # hop to the next if no text in the post
    continue

  text = caption_edges[0]['node']['text'] # pick out post text

  # Pick hashtags from text.
  # NOTE: deliberately NOT named `tags`, which would overwrite the list of
  # seed tags used to build the request URLs.
  tokens = tknzr.tokenize(text)
  post_tags = [x.strip('#') for x in tokens if x.startswith('#')]

  # Append the post as a dictionary to the list of post-dictionaries
  post_dicts.append({
      'id_owner': node['owner']['id'],   # user id of the post owner
      'shortcode': node['shortcode'],    # short post identifier
      'tags': post_tags,
      'text': text,
  })

In [0]:
# Build the DataFrame: one row per post
posts_df = pd.DataFrame(post_dicts)

# Keep only purely alphanumeric hashtags (drops empty strings and mistakes)
cleaned_tags = posts_df['tags'].apply(
    lambda tag_list: [tag for tag in tag_list if tag.isalnum()]
)
posts_df['tags'] = cleaned_tags

# Drop the posts that are left without any hashtag
has_tags = posts_df['tags'].map(len) > 0
posts_df = posts_df[has_tags]

## Simple stats

In [0]:
# People with most posts: number of posts per owner id, most active owners first
posts_df['id_owner'].value_counts()

In [0]:
# Look up who these people are (this line gets us also other information about the user)
# NOTE(review): the numeric user id is hardcoded; presumably it was taken from the
# value_counts output above for the tag 'namaalborg' - replace it for your own tag.
profile = instaloader.Profile.from_id(L.context, 420900264)
# Usernames noted down from earlier lookups (presumably the most active owners - TODO confirm):
# nigerian.pvc.awareness
# rinathama.m
# henry_expert_trade
# inuka_consultant

In [0]:
# Show the username of the profile that was looked up by id
profile.username

# Create a graph

In [0]:
# Start from an empty undirected graph
B = nx.Graph()

# Users (post owners) form one node set, tagged with bipartite=1
user_nodes = list(set(posts_df.id_owner))
B.add_nodes_from(user_nodes, bipartite=1)

# Hashtags form the other node set, tagged with bipartite=0
hashtag_nodes = list(set(itertools.chain.from_iterable(posts_df.tags)))
B.add_nodes_from(hashtag_nodes, bipartite=0)

In [0]:
# Generate the edges between users and hashtags:
# every time someone mentions a #hashtag, a (id_owner, hashtag) tuple is created.
# Each row of posts_df is one post, so we pair every owner with each of the
# hashtags of that post.
bi_edges = [
    (owner, hashtag)
    for owner, hashtag_list in zip(posts_df['id_owner'], posts_df['tags'])
    for hashtag in hashtag_list
]

# Let's add the edges to our graph
B.add_edges_from(bi_edges)

In [0]:
# Collect every node whose 'bipartite' attribute is 0
top_nodes = set(
    node for node, attrs in B.nodes(data=True) if attrs['bipartite'] == 0
)

# All remaining nodes of the graph belong to the other level
bottom_nodes = set(B).difference(top_nodes)

## Preprocessing the Graph

It can be a good idea to filter the Graph before analysing. For instance, we can remove all hashtags with low degree-centrality. This can be interpreted as - kicking out made up hashtags or extremely underused ones. We will calculate a percentile threshold and exclude everything under it.

In [0]:
# Calculating degree centrality for the Graph
# (computed for every node of B, i.e. for users as well as hashtags)
degree_centrality = nx.degree_centrality(B)

In [0]:
# Use the 5th percentile of all degree-centrality values as a "reasonable" lower bound
centrality_values = list(degree_centrality.values())
perc_filter = np.percentile(centrality_values, 5)

In [0]:
# Make a subgraph based on nodes with a degree_centrality over the threshold
nodes_selected = [x for x,y in degree_centrality.items() if y >= perc_filter]

# `subgraph` only returns a view that shares its data with the parent graph.
# `.copy()` turns it into an independent graph, so attributes set later do
# not leak back into the unfiltered graph and re-running this cell does not
# stack views on top of views.
B = B.subgraph(nodes_selected).copy()

## Analysing the Graph

Now we are going to calculate some network indicators and, once done, export them to a DataFrame to analyse them further.

In [0]:
# Degree centrality has to be recomputed for the current graph;
# the result is stored on every node under the attribute name 'degree'
degree_centrality = nx.degree_centrality(B)
nx.set_node_attributes(B, values=degree_centrality, name='degree')

In [0]:
# Eigenvector centrality, stored on every node as 'eigenvector_centrality'
eigenvector = nx.eigenvector_centrality(B)
nx.set_node_attributes(B, values=eigenvector, name='eigenvector_centrality')

In [0]:
# Same for community detection (Louvain).
# The algorithm is randomised: a fixed random_state makes the detected
# communities reproducible across "Restart & Run All".
communities = community.best_partition(B, resolution = 1, random_state = 42)
nx.set_node_attributes(B, communities, 'community')

In [0]:
# Collect the node attributes in a DataFrame: one row per node, one column per attribute
graph_df = pd.DataFrame(dict(B.nodes(data=True))).T

In [0]:
# Share of nodes that falls into each community
graph_df['community'].value_counts(normalize=True)

In [0]:
# Find the 5 most popular hashtags for each identified community.
# graph_df contains users as well as hashtags, so we first keep only the
# hashtag nodes (those were added to the graph with bipartite=0).
hashtag_df = graph_df[graph_df['bipartite'] == 0]
tag_per_com = hashtag_df.groupby('community')['degree'].nlargest(5)
tag_per_com

# Assign the plot to an object and save the visual output.

In [0]:
# Let's project this graph using a weighted projection.
# top_nodes was extracted before low-centrality nodes were filtered out of B,
# so it may still contain nodes that no longer exist in the graph; projecting
# onto such nodes fails. Keep only the nodes that are actually present.
top_nodes_in_graph = top_nodes & set(B)
G_proj = bipartite.weighted_projected_graph(B, top_nodes_in_graph)

Saving the graph is simple: we write it to a GEXF file, which can then be processed in Gephi.

In [0]:
# Write the projected graph to the file 'nam.gexf' (GEXF format)
nx.write_gexf(G_proj, 'nam.gexf')

Now let's do some blockmodelling :)

In [0]:
# Extract the node labels into a list first - they fix the row/column order
# of the matrix and are useful later on
nodelabels = list(B.nodes())

# Convert the graph to a dense adjacency matrix in exactly that node order
mat = nx.adjacency_matrix(B, nodelist=nodelabels).todense()

# Display the sociomatrix
bm.displaySociomatrix(mat, nodelabels)

In [0]:
# Draw the network with labelled nodes
# (you might get a warning message the first time - then do it again)
draw_options = {
    'with_labels': True,
    'node_color': '#FFA0A0',
    'node_size': 500,
}
nx.draw(B, **draw_options)
plt.show()


In [0]:
# Calculate indirect structural equivalence (Hamming distances).
# The input must be the sociomatrix `mat`, not the DataFrame of posts.
dist=bm.indirectSEhamming(mat) # Also works with bm.indirectSE(mat,method='hamming')

# or calculate correlation-based indirect structural equivalence instead
#corr=bm.indirectSEcorr(mat) # Also works with bm.indirectSE(mat,method='corr')
# As the clustering functionality here works for distances, we need to then convert correlations to distances
# I have added a function for this: corr2dist(mat) - so if you do the following:
#dist=bm.corr2dist(corr)
# ...you should have something that will work as we continue

In [0]:
# Display the distance matrix if you want
# (plain-text dump of the whole object; may be long for larger networks)
print(dist)

In [0]:
# In order to do hierarchical clustering on these distance values, this must first be converted to a condensed distance matrix
# For this, we use scipy.spatial.distance.squareform:
# NOTE(review): `sc` is plain `import scipy as sc`; accessing `sc.spatial.distance` presumably
# works because importing scipy.cluster.hierarchy loads that submodule - confirm for your scipy version.

dist_cond = sc.spatial.distance.squareform(dist)

In [0]:
# When doing (agglomerative/bottom-up) hierarchical clustering, we can use one out of many different methods for clustering
# 'single-link' is typically NOT recommended; 'complete', 'average' and 'ward' are more common in this context

# Using the complete-linkage clustering approach, we create our clustering object
# (switch to method='average' for the non-weighted average approach)
Z=sch.linkage(dist_cond, method='complete')

In [0]:
# Then we can plot the dendrogram for this particular hierarchical clustering
plt.figure(figsize=(10, 7))
# The leaves are labelled with the node labels, which follow the row order of the sociomatrix
sch.dendrogram(Z,labels=nodelabels) # ...using the nodelabels we extracted in the beginning
plt.show()

In [0]:
# Use the dendrogram to pick a suitable cutoff. The cutoff determines how many
# positions the blockmodel will have, and which nodes/actors end up together
# in the same partition.

# Distance at which the dendrogram is "cut"
threshold = 11

# fcluster performs the cut, using the threshold as a distance criterion
partition = sch.fcluster(Z, t=threshold, criterion='distance')

# `partition` is a 1-dimensional array (numpy.ndarray) that holds, for each
# node, the position it belongs to

# Have a look at it:
print(partition)

In [0]:
# Count the distinct positions to check that you got the number of partitions you wanted
np.unique(partition).size

In [0]:
# An alternative (and better) way to store this partition is in the form of a dictionary, of lists
# In the blockmodeling library, I have created a function that generates such a dictionary, from your partition list
# (presumably each key is a position and each list holds the members of that position - see the blockmodeling module)

blockdict=bm.createBlockdict(partition)

# Display it and you will see
bm.displayBlockdict(blockdict,nodelabels)

In [0]:
# Finally, given your original sociomatrix (mat),
# the partition you have found (blockdict), and
# the nodelabels, you can display the final blockmodel

bm.displayBlockmodel(mat,blockdict,nodelabels)

In [0]:
# As an aid for interpretation, the blockmodeling module offers a function
# that calculates block densities. It returns a numpy ndarray with the
# block densities for the partition specified in blockdict.
# The blockimage is sorted according to the indices of blockdict,
# i.e. it follows the same order as the blockmodel above.

densityBI = bm.calcDensityBlockimage(mat, blockdict)

# Show the densities rounded to two decimals
rounded_densities = np.round(densityBI, 2)
print(rounded_densities)