# Integ 475- Tumblr Network Project

### Getting Started

In [1]:
# Import relevant programs and functions

import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import string
import re


In [2]:
# Fetch the post and collect the first page of its notes.
#   url        - address of the post being analysed
#   urlhandle  - address of the post's source blog, extracted from `url` with a regex
#   content    - the post's page parsed into a BeautifulSoup object
#   sections2  - every "ol" (ordered list) element on the page (there should only be one)
#   sections   - every "li" (list item) inside that ordered list; each one is a 'note'
#                (a like/reblog of the post)

url = "http://eurosong.tumblr.com/post/160511885918/good-afternoon-folks-and-welcome-to-todays"

# Raw string so "\." is a regex escape rather than an invalid string escape.
# "https?" matches exactly http or https, and digits are allowed because blog
# names may contain them (e.g. "arabela25").
blog_match = re.search(r'https?://[A-Za-z0-9-]+\.tumblr\.com/', url)
if blog_match is None:
    raise ValueError("Could not find a '<blog>.tumblr.com/' address in: " + url)
urlhandle = blog_match.group(0)

# The context manager closes the HTTP response once the page has been parsed.
with urlopen(url) as html:
    content = BeautifulSoup(html, "html5lib")

sections2 = content.find_all("ol")
if not sections2:
    # Fail with a readable message instead of an IndexError on sections2[0].
    raise ValueError("No <ol> element (notes list) found at: " + url)
sections = sections2[0].find_all("li")



### Scraping All the Notes

When a user opens a post's webpage, they will not be able to see all the post's notes right away. They will only see 50. 

While not available on every blog, most blogs have a feature that allows you to see beyond these initial 50 notes.
By clicking "Show More Notes", the page will update with 50 more notes that it extracted from another webpage. By clicking this option repeatedly, you will get increasingly more notes until you reach a maximum. This maximum appears to vary by both post and blog.

While the additional notes are visible on the user end, the information will not update in the source code. To scrape these additional notes, one has to access the other webpage the original page extracted information from.


### Each element in `sections` has a certain number of items held in "a" tags, depending on which type of note it represents:
    # 1 = Not actually a note, but an option to "Show More Notes"
    # 2 = A like or a line stating "Username posted this" (this only appears once and only as the last note)
    # 3 = A reblog with no comment
    # 4 = A reblog with a comment 


In [3]:
# Follow the "Show More Notes" link repeatedly, adding every extra page of notes to `sections`.
#
# `x` holds the "a" tags of the last note collected so far. If it contains exactly one tag,
# that item is not a note but the "Show More Notes" option, whose link points to the page
# holding the next batch of notes.

# Guard against an empty `sections` list so the lookup below cannot raise an IndexError.
x = sections[-1].find_all("a") if len(sections) > 0 else []

visited_pages = set()  # pages already fetched; protects against a page that links back to itself

while len(x) == 1:
    # Uses regex to find the url of the webpage that contains the additional notes
    link_match = re.search(r'notes/[0-9]+/[A-Za-z0-9]+\?from_c=[0-9]+', str(x[0]))
    if link_match is None:
        # The single link is not a "more notes" link, so there is nothing left to follow.
        break

    nextpage = urlhandle + link_match.group(0)
    if nextpage in visited_pages:
        break
    visited_pages.add(nextpage)

    # The context manager closes the HTTP response once the page has been parsed.
    with urlopen(nextpage) as html:
        content = BeautifulSoup(html, "html5lib")
    sectionsnext = content.find_all("li")

    if len(sectionsnext) > 0:
        # All notes from the new page are added to the original list, and the cycle repeats
        # until the last note no longer consists of a single "a" tag.
        sections.extend(sectionsnext)
        x = sectionsnext[-1].find_all("a")
    else:
        # Occasionally a page offers "Show More Notes" when none are available;
        # an empty `x` ends the loop.
        x = []

In [4]:
# Walks over every collected note and looks up its "a" tags.
# Nothing is stored besides the tags of the final note, which are left in `x`.
for note in sections:
    x = note.findAll("a")

### Reblogs

In [5]:
# Builds `reblogs`: one list per reblog, laid out as
# [recipient username, recipient url, source username, source url, comment]
reblogs = []

for note in sections:
    anchors = note.findAll("a")

    # Only notes with more than 2 items in "a" tags are reblogs; everything else is skipped
    if len(anchors) <= 2:
        continue

    recipient = anchors[1]  # 2nd "a" tag: the user who reblogged the post
    source = anchors[2]     # 3rd "a" tag: the user the post was reblogged from

    record = [
        recipient.get_text(),     # recipient username (tag text without the html markup)
        recipient.attrs['href'],  # recipient url
        source.get_text(),        # source username
        source.attrs['href'],     # source url
    ]

    if len(anchors) > 3:
        # A 4th "a" tag means a comment was added during the reblog
        remark = anchors[3].get_text()
        remark = re.sub('\n+', " ", remark)  # Collapses runs of new line characters into one space
        remark = re.sub(' +', " ", remark)   # Collapses runs of spaces into one space
        record.append(remark)
    else:
        # No comment was added: an empty string is stored as the 5th item
        record.append("")

    reblogs.append(record)

In [6]:
reblogs  # Displays the full list of reblog records built above

[['iriaowillwinesc',
  'https://iriaowillwinesc.tumblr.com/',
  'arabela25',
  'http://arabela25.tumblr.com/',
  'None'],
 ['xiithdoctor',
  'http://xiithdoctor.tumblr.com/',
  'mapsontheweb',
  'https://mapsontheweb.zoom-maps.com/',
  'None'],
 ['adventurenerd',
  'http://adventurenerd.tumblr.com/',
  'clickthing',
  'http://clickthing.tumblr.com/',
  'None'],
 ['slenderboobs',
  'http://slenderboobs.tumblr.com/',
  'mapsontheweb',
  'https://mapsontheweb.zoom-maps.com/',
  'None'],
 ['wherethefuckdidthisshitcomefrom',
  'https://wherethefuckdidthisshitcomefrom.tumblr.com/',
  'mapsontheweb',
  'https://mapsontheweb.zoom-maps.com/',
  'None'],
 ['magrittesflamingtuba',
  'http://magrittesflamingtuba.tumblr.com/',
  'mapsontheweb',
  'https://mapsontheweb.zoom-maps.com/',
  'None'],
 ['thatdysfunctionalkingdom',
  'https://thatdysfunctionalkingdom.tumblr.com/',
  'mapsontheweb',
  'https://mapsontheweb.zoom-maps.com/',
  'None'],
 ['monterrang-parkin',
  'https://monterrang-parkin.tumb

### Finding Non-Reblog Comments

In [7]:
# Finds notes of the form "<username> said: <text>" (comments that are not reblogs).
#   comments - one [username, user url, comment text] list per comment found
#   indexs   - positions in `sections` of the notes identified as comments
# Once collected, those notes are removed from `sections`.
comments = []
indexs = []

for i, note in enumerate(sections):
    words = note.get_text().split(" ")

    # A comment needs at least two tokens, the second being "said:".
    # The length check avoids an IndexError on notes whose text contains no space.
    if len(words) < 2 or words[1] != "said:":
        continue

    user = words[0]
    text = " ".join(words[2:])
    text = re.sub('\n+', " ", text)  # Removes excess new line characters
    text = re.sub(' +', " ", text)   # Removes excess spaces

    # Stored under its own name so the post address held in `url` is not overwritten.
    commenter_url = note.find_all("a")[1].attrs['href']

    comments.append([user, commenter_url, text])
    indexs.append(i)

# Delete from the highest index down: removing an element shifts every later
# position, so deleting in ascending order would remove the wrong notes.
for i in reversed(indexs):
    del sections[i]

In [8]:
comments  # Displays the non-reblog comment records collected above

[]

### Finding Likes

In [9]:
# Builds `likes`: one [username, blog url] pair for every note that holds exactly
# two items in "a" tags. The 2nd "a" tag supplies both values: its text is the
# username and its 'href' attribute is the user's blog url.
likes = [
    [anchors[1].get_text(), anchors[1].attrs['href']]
    for anchors in (note.findAll("a") for note in sections)
    if len(anchors) == 2
]

In [10]:
# The last element in the 'likes' list is not a like, but rather the information of the
# user who published the post. It is removed from 'likes' and kept as 'originalposter'.
originalposter = likes.pop()

In [11]:
likes  # Displays the remaining like records (the original poster's entry was removed above)

[['notacolorfullesbian', 'https://notacolorfullesbian.tumblr.com/'],
 ['adelina-popa', 'http://adelina.rbiz.ro/'],
 ['myanarchistproseandpoetry',
  'https://myanarchistproseandpoetry.tumblr.com/'],
 ['super-rami-malek-imagines', 'http://super-rami-malek-imagines.tumblr.com/'],
 ['danasolez', 'https://danasolez.tumblr.com/'],
 ['paramarketing', 'https://paramarketing.tumblr.com/'],
 ['theoldcrone', 'https://theoldcrone.tumblr.com/'],
 ['brendabelanger', 'https://brendabelanger.tumblr.com/'],
 ['wendijordan', 'https://wendijordan.tumblr.com/'],
 ['subgoogle', 'https://subgoogle.tumblr.com/'],
 ['potatoesrule4ever', 'https://potatoesrule4ever.tumblr.com/'],
 ['tassledown', 'https://tassledown.tumblr.com/'],
 ['linamory', 'http://linamory.tumblr.com/'],
 ['dorkynanni', 'http://dorkynanni.tumblr.com/'],
 ['slenderboobs', 'http://slenderboobs.tumblr.com/'],
 ['mrfarfalle', 'https://mrfarfalle.tumblr.com/'],
 ['claudiopompeo', 'http://claudiopompeo.tumblr.com/'],
 ['sidsloth101', 'https://sid

### Creating the Like and Reblog Dataframes

In [12]:
# Creates the reblog dataframe with its column titles.
# The titles are passed to the constructor (rather than assigned afterwards) so that a
# post with no reblogs yields an empty, labelled dataframe instead of a length-mismatch error.
df_reblog = pd.DataFrame(
    reblogs,
    columns=["Recipent User", "Recipent URL", "Source User", "Source URL", "Comment"],
)

In [13]:
## Number of Reblogs (one dataframe row per reblog)
len(df_reblog.index)

Unnamed: 0,Recipent User,Recipent URL,Source User,Source URL,Comment
0,iriaowillwinesc,https://iriaowillwinesc.tumblr.com/,arabela25,http://arabela25.tumblr.com/,
1,xiithdoctor,http://xiithdoctor.tumblr.com/,mapsontheweb,https://mapsontheweb.zoom-maps.com/,
2,adventurenerd,http://adventurenerd.tumblr.com/,clickthing,http://clickthing.tumblr.com/,
3,slenderboobs,http://slenderboobs.tumblr.com/,mapsontheweb,https://mapsontheweb.zoom-maps.com/,
4,wherethefuckdidthisshitcomefrom,https://wherethefuckdidthisshitcomefrom.tumblr...,mapsontheweb,https://mapsontheweb.zoom-maps.com/,


In [14]:
# Creates the likes dataframe with its column titles.
# The titles are passed to the constructor (rather than assigned afterwards) so that a
# post with no likes yields an empty, labelled dataframe instead of a length-mismatch error.
df_likes = pd.DataFrame(likes, columns=["Tumblr User", "Tumblr User URL"])

In [15]:
## Number of Likes (one dataframe row per like)
len(df_likes.index)

Unnamed: 0,Tumblr User,Tumblr User URL
0,notacolorfullesbian,https://notacolorfullesbian.tumblr.com/
1,adelina-popa,http://adelina.rbiz.ro/
2,myanarchistproseandpoetry,https://myanarchistproseandpoetry.tumblr.com/
3,super-rami-malek-imagines,http://super-rami-malek-imagines.tumblr.com/
4,danasolez,https://danasolez.tumblr.com/


### Creating the Networks Dataframe 

In [16]:
# Adds two flags to every reblog record: "Reblog" (always 1) and "Like" (1 or 0).
for reblog_row in reblogs:
    # Every record in 'reblogs' is a reblog, so the reblog relationship always exists
    reblog_row.append(1)

    # Position of the first like whose user url equals this reblog's recipient url (None if there is none)
    match_pos = next(
        (pos for pos, like_row in enumerate(likes) if reblog_row[1] == like_row[1]),
        None,
    )

    if match_pos is None:
        liked = 0  # The recipient did not also like the post
    else:
        liked = 1
        likes.pop(match_pos)  # The matched like is removed from the 'likes' list

    # 1 or 0 signifies the existence/non-existence of a like relationship
    reblog_row.append(liked)
            

In [17]:
# The likes still in the list are padded to the same layout as a reblog record
# and then appended to 'reblogs', so both kinds of note can share one dataframe.
for like_row in likes:
    # Source user and source url: the first two fields of the original poster
    for field in (0, 1):
        like_row.append(originalposter[field])

    like_row.extend([
        "",  # Comment: empty, since a like carries no comment
        0,   # Reblog: no reblog relationship
        1,   # Like: a like relationship exists
    ])

    reblogs.append(like_row)



In [18]:
# Builds the network dataframe from the combined reblog and like records held in 'reblogs'
df_network = pd.DataFrame(data=reblogs)


In [19]:
# Column titles for the network dataframe, in the order the fields were appended to each record
network_columns = [
    "Recipent User",
    "Recipent URL",
    "Source User",
    "Source URL",
    "Comment",
    "Reblog",
    "Like",
]
df_network.columns = network_columns
df_network

Unnamed: 0,Recipent User,Recipent URL,Source User,Source URL,Comment,Reblog,Like
0,iriaowillwinesc,https://iriaowillwinesc.tumblr.com/,arabela25,http://arabela25.tumblr.com/,,1,0
1,xiithdoctor,http://xiithdoctor.tumblr.com/,mapsontheweb,https://mapsontheweb.zoom-maps.com/,,1,0
2,adventurenerd,http://adventurenerd.tumblr.com/,clickthing,http://clickthing.tumblr.com/,,1,0
3,slenderboobs,http://slenderboobs.tumblr.com/,mapsontheweb,https://mapsontheweb.zoom-maps.com/,,1,1
4,wherethefuckdidthisshitcomefrom,https://wherethefuckdidthisshitcomefrom.tumblr...,mapsontheweb,https://mapsontheweb.zoom-maps.com/,,1,0
5,magrittesflamingtuba,http://magrittesflamingtuba.tumblr.com/,mapsontheweb,https://mapsontheweb.zoom-maps.com/,,1,0
6,thatdysfunctionalkingdom,https://thatdysfunctionalkingdom.tumblr.com/,mapsontheweb,https://mapsontheweb.zoom-maps.com/,,1,0
7,monterrang-parkin,https://monterrang-parkin.tumblr.com/,mapsontheweb,https://mapsontheweb.zoom-maps.com/,@ph0220,1,1
8,wymanthewalrus,http://wymanthewalrus.tumblr.com/,mapsontheweb,https://mapsontheweb.zoom-maps.com/,,1,1
9,alwaysfriedstudent-b07d5fd2,https://alwaysfriedstudent-b07d5fd2.tumblr.com/,eurosong,http://eurosong.tumblr.com/,,1,1


In [20]:
# Removes the two URL columns from the network dataframe.
# The result is assigned back instead of dropping in place, and errors="ignore" skips
# labels that are already gone, so re-running this cell does not raise a KeyError.
columns = ["Recipent URL", "Source URL"]
df_network = df_network.drop(columns, axis=1, errors="ignore")


In [21]:
# Reorders the columns of the network dataframe into a new dataframe.
# DataFrame.reindex_axis was removed in pandas 1.0; reindex(columns=...) is its replacement.
df_networkshort = df_network.reindex(
    columns=["Comment", "Reblog", "Like", "Recipent User", "Source User"]
)

In [23]:
# Temporarily lifts the row limit (and sets the column limit to 5) to show the whole dataframe
display_options = ('display.max_rows', None, 'display.max_columns', 5)
with pd.option_context(*display_options):
    display(df_networkshort)

Unnamed: 0,Comment,Reblog,Like,Recipent User,Source User
0,,1,0,iriaowillwinesc,arabela25
1,,1,0,xiithdoctor,mapsontheweb
2,,1,0,adventurenerd,clickthing
3,,1,1,slenderboobs,mapsontheweb
4,,1,0,wherethefuckdidthisshitcomefrom,mapsontheweb
5,,1,0,magrittesflamingtuba,mapsontheweb
6,,1,0,thatdysfunctionalkingdom,mapsontheweb
7,@ph0220,1,1,monterrang-parkin,mapsontheweb
8,,1,1,wymanthewalrus,mapsontheweb
9,,1,1,alwaysfriedstudent-b07d5fd2,eurosong
