In [None]:
%matplotlib inline
import matplotlib
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 144

# The New York Social Graph

[New York Social Diary](http://www.newyorksocialdiary.com/) provides a
fascinating lens onto New York's socially well-to-do.  The data forms a natural social graph for New York's social elite.  Take a look at this page of a recent [run-of-the-mill holiday party](http://www.newyorksocialdiary.com/party-pictures/2014/holiday-dinners-and-doers).

Besides the brand-name celebrities, you will notice the photos have carefully annotated captions labeling those who appear in them.  We can think of these captions as implicitly defining a social graph: there is a connection between two individuals if they appear in a picture together.

For this project, we will assemble the social graph from photo captions for parties dated December 1, 2014, and before.  Using this graph, we can make guesses at the most popular socialites, the most influential people, and the most tightly coupled pairs.

We will attack the project in three phases:
1. Get a list of all the photo pages to be analyzed.
2. Parse all of the captions on a sample page.
3. Parse all of the captions on all pages, and assemble the graph.

## Phase One

The first step is to crawl the data.  We want photos from parties on or before December 1st, 2014.  Go to the [Party Pictures Archive](http://www.newyorksocialdiary.com/party-pictures) to see a list of (party) pages.  We want to get the url for each party page, along with its date.

We use Python Requests to download the HTML pages, and BeautifulSoup to process the HTML.


First, we count the number of party pages in each of the 95 months (that is, month-year pairs) in the data.

In [None]:
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from collections import Counter
import numpy as np
import requests
import dill
from bs4 import BeautifulSoup
from datetime import datetime
import requests
from requests_futures.sessions import FuturesSession

#nltk.download()
import nltk
# Download the first page of the party-pictures archive. The resulting
# `response` is parsed further down with get_links() as a sanity check.
url = "http://www.newyorksocialdiary.com/party-pictures"
response = requests.get(url)
from requests_futures.sessions import FuturesSession

#this is a function to get the link and the date
def get_link_date(el):
    """Extract the absolute party URL and the party date from one archive row.

    Args:
        el: A parsed HTML element for a single archive entry. It must expose
            an ``a`` child whose ``href`` is a site-relative path, and a
            ``span.views-field-created`` that wraps a ``span.field-content``
            holding the date text.

    Returns:
        A two-item list ``[url, party_date]`` where ``url`` is the absolute
        link and ``party_date`` is a ``datetime`` parsed from text such as
        ``"Monday, December 1, 2014"``.

    Raises:
        ValueError: If the date text does not match ``'%A, %B %d, %Y'``.
    """
    the_link = 'http://www.newyorksocialdiary.com' + el.a['href']
    # The date lives in a span nested inside the "created" field span.
    created_field = el.find('span', attrs={'class': 'views-field-created'})
    date_text = created_field.find('span', attrs={'class': 'field-content'}).text
    # Parse the text directly (no encode to bytes): strptime needs a text
    # string, and stripping guards against stray whitespace in the markup.
    party_date = datetime.strptime(date_text.strip(), '%A, %B %d, %Y')
    return [the_link, party_date]
 

#running the following we can get the links and the dates in the first page
def get_links(r):
    """Return the ``[url, date]`` pair for every party listed on one archive page.

    Args:
        r: An HTTP response object whose ``text`` attribute holds the page HTML.

    Returns:
        A list with one ``get_link_date`` result per ``div.views-row`` element
        found in the page, in document order.
    """
    soup = BeautifulSoup(r.text, "lxml")
    rows = soup.find_all('div', attrs={'class': 'views-row'})
    return [get_link_date(row) for row in rows]

#filter the dates
def filter_by_date(links, cutoff=datetime(2014, 12, 1)):
    """Keep only the entries dated on or before ``cutoff``.

    Args:
        links: Iterable of sequences whose second item is a ``datetime``.
        cutoff: Latest date (inclusive) to keep.

    Returns:
        A new list with the qualifying entries, in their original order.
    """
    return [entry for entry in links if entry[1] <= cutoff]


# Sanity check on the first archive page: none of its parties should fall on
# or before the cutoff (presumably because the archive lists newest first).
page1 = get_links(response)
page1_before2014 = filter_by_date(page1, cutoff=datetime(2014, 12, 1))
assert len(page1_before2014) == 0

# Request all archive pages concurrently, then collect their links in page
# order and keep only the parties dated on or before the cutoff.
link_list = []
session = FuturesSession(max_workers=5)
urls = ["http://www.newyorksocialdiary.com/party-pictures?page=" + str(i) for i in range(29)]
futures = [session.get(url) for url in urls]
for future in futures:
    link_list.extend(get_links(future.result()))

link_list = filter_by_date(link_list, cutoff=datetime(2014, 12, 1))

# Cache the crawl result on disk. Pickle data is binary, so the file is
# opened in binary mode, and `with` guarantees the handle is closed.
with open('nysd-links.pkd', 'wb') as links_file:
    dill.dump(link_list, links_file)
# On later runs the crawl above can be skipped and link_list reloaded here.
with open('nysd-links.pkd', 'rb') as links_file:
    link_list = dill.load(links_file)

# Month-year label (e.g. "Dec-2014") for each party, in link_list order.
date_of_parties = [l[1] for l in link_list]
month_of_parties = [d.strftime('%b-%Y') for d in date_of_parties]

# Pair every party URL with its month-year label.
link_list_months = [
    [entry[0], month] for entry, month in zip(link_list, month_of_parties)
]

#histogram
from collections import Counter
def histogram():
    """Return ``(month_label, party_count)`` pairs for ``month_of_parties``."""
    month_counts = Counter(month_of_parties)
    return list(month_counts.items())
    
histogram()

## Phase Two

In this phase, we concentrate on getting the names out of the captions for a given page. This means that we need to parse the names out of each caption.

In [None]:
#function to get the captions according to the URL
def get_captions(url):
    """Download a party page and return the text of its photo captions.

    Args:
        url: Address of the party page to fetch.

    Returns:
        A list with one entry per ``div.photocaption`` element, in document
        order. Each entry is the caption text with surrounding whitespace
        removed, encoded as UTF-8.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    caption_tags = soup.find_all('div', attrs={'class': 'photocaption'})
    return [tag.text.strip().encode('utf-8') for tag in caption_tags]

# Fetch the captions of one sample party page.
url1 = "http://www.newyorksocialdiary.com/party-pictures/2015/celebrating-the-neighborhood"
raw = get_captions(url1)
# Cache the captions on disk so the page need not be downloaded every time.
# Pickle data is binary, so open in binary mode; `with` closes the handle.
with open('raw.pkd', 'wb') as raw_file:
    dill.dump(raw, raw_file)

In [None]:
import re

# Reload the cached captions (binary mode, handle closed by `with`).
with open('raw.pkd', 'rb') as raw_file:
    names = dill.load(raw_file)

# Turn every caption into a list of name fragments.
for i, caption in enumerate(names):
    # Treat ", " like "&", then split on "&".
    parts = re.sub(r'\, ', ' & ', caption).split('&')
    # A caption with no "&" or ", " may still join two names with " and ".
    if len(parts) == 1:
        parts = parts[0].split(' and ')
    # Drop the " and " that introduces the last name of a comma-separated list.
    parts[-1] = re.sub(r'^ and ', '', parts[-1])
    names[i] = parts

In [None]:
# `names` holds one list of name fragments per caption. Flatten it into a
# single list, `n`, so each fragment can be tagged on its own.
n = [fragment for caption in names for fragment in caption]

import os
# Point the Stanford NER integration at the local model directory before
# building the tagger from the classifier file and the NER jar.
os.environ["STANFORD_MODELS"] = "/home/vagrant/ner/stanford-ner-2016-10-31"
import nltk.tag.stanford as st
tagger = st.StanfordNERTagger(
    "/home/vagrant/ner/stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz",
    "/home/vagrant/ner/stanford-ner-2016-10-31/stanford-ner.jar",
)


In [None]:
# Tag each fragment's whitespace-separated tokens with the NER tagger; the
# results are kept in the same order as `n`.
nltk_results = [tagger.tag(fragment.split()) for fragment in n]

In [None]:
# Keep only the tokens tagged PERSON in each tagged fragment.
names_page1 = [[word for (word, tag) in name if tag == u'PERSON'] for name in nltk_results]
# Re-join each fragment's tokens into one name and drop fragments with no person.
names_page1_formatted = [' '.join(name).encode('utf-8') for name in names_page1]
names_page1_formatted = [name for name in names_page1_formatted if name != '']
# Collect the unique names in first-seen order. The `seen` set makes each
# membership test O(1) instead of scanning the output list every time.
output = []
seen = set()
for x in names_page1_formatted:
    if x not in seen:
        seen.add(x)
        output.append(x)
# The parenthesised form prints the same thing on Python 2 and Python 3.
print(output)
# Scraping all of the pages could take 10 minutes or so.

## Phase Three

For the remaining analysis, we think of the problem in terms of a network or a graph. Any time a pair of people appear in a photo together, that is considered a link. What we have described is more appropriately called an (undirected) multigraph with no self-loops, but this has an obvious analog in terms of an undirected weighted graph. In this problem, we will analyze the social graph of the New York social elite.

After completing phases one and two, we have ended up with over 100,000 captions and more than 110,000 names, connected in about 200,000 pairs.

The simplest question to ask is "who is the most popular"? The easiest way to answer this question is to look at how many connections everyone has. The following piece of code returns the top 100 people and their degree. We should remember that if an edge of the graph has weight 2, it counts for 2 in the degree.

In [None]:
import heapq  # Heaps are efficient structures for tracking the largest
              # elements in a collection.  Use introspection to find the
              # function you need.
import networkx as nx
# Load the cached names; the code below iterates it as pages -> captions ->
# names. Pickle data is binary, so open in binary mode; `with` closes the handle.
with open('all_names1.pkd', 'rb') as names_file:
    all_names = dill.load(names_file)
# Module-level multigraph that degree() adds edges to.
G = nx.MultiGraph()
from itertools import combinations
def degree():
    """Return the 100 highest-degree people as ``(name, degree)`` pairs.

    Every pair of names sharing a caption in ``all_names`` is added as an
    edge of the module-level multigraph ``G``, so repeated co-appearances
    become parallel edges and each one counts towards the degree.

    Note:
        ``G`` is mutated in place. Calling this function again adds the same
        edges a second time and inflates every degree.

    Returns:
        A list of at most 100 ``(name, degree)`` tuples sorted by descending
        degree.
    """
    for page in all_names:
        for caption in page:
            G.add_edges_from(combinations(caption, 2))
    # dict(...) accepts both the plain dict returned by older NetworkX and
    # the degree view returned by newer releases.
    by_degree = sorted(dict(G.degree()).items(), key=lambda item: -item[1])
    return by_degree[:100]

A similar way to determine popularity is to look at each person's PageRank. PageRank is used for web ranking, was originally patented by Google, and is essentially the stationary distribution of a Markov chain implied by the social graph.
We can use 0.85 as the damping parameter so that there is a 15% chance of jumping to another vertex at random.

In [None]:
import itertools  # itertools.combinations may be useful
import networkx as nx
from itertools import combinations
# Load the cached names used by pagerank(). Pickle data is binary, so open
# in binary mode; `with` closes the handle.
with open('all_names1.pkd', 'rb') as names_file:
    all_names = dill.load(names_file)

def pagerank():
    """Return the 100 people with the highest PageRank as ``(name, score)`` pairs.

    A multigraph is built with one edge per pair of names sharing a caption
    in ``all_names``. Parallel edges are then collapsed into a simple graph
    whose ``weight`` attribute counts the co-appearances, and PageRank is
    computed on that weighted graph with a damping factor of 0.85.

    Returns:
        A list of at most 100 ``(name, score)`` tuples sorted by descending
        score. Fewer are returned when the graph has fewer than 100 people.
    """
    G = nx.MultiGraph()
    for page in all_names:
        for caption in page:
            G.add_edges_from(combinations(caption, 2))

    # Collapse parallel edges: each occurrence of (u, v) adds 1 to its weight.
    H = nx.Graph()
    for u, v in G.edges():
        if H.has_edge(u, v):
            H[u][v]['weight'] += 1
        else:
            H.add_edge(u, v, weight=1)

    # alpha=0.85 is the library default; it is spelled out here to match the
    # damping factor described in the accompanying text.
    ranks = nx.pagerank(H, alpha=0.85)
    # Slicing (rather than indexing 0..99) is safe on small graphs.
    return sorted(ranks.items(), key=lambda item: -item[1])[:100]

Another interesting question is who tends to co-occur with whom. The following piece of code gives us the 100 edges with the highest weights.

In [None]:
# Build the co-appearance multigraph: one edge per pair of names that share
# a caption.
G = nx.MultiGraph()
for page in all_names:
    for caption in page:
        connections = list(combinations(caption, 2))
        G.add_edges_from(connections)

# Collapse parallel edges into a simple graph whose 'weight' counts how many
# times each pair appeared together.
H = nx.Graph()
for u, v, w in G.edges(data=True):
    if not H.has_edge(u, v):
        H.add_edge(u, v, weight=1)
    else:
        H[u][v]['weight'] += 1
ranks = nx.pagerank(H)

In [None]:
def best_friends():
    """Return the 100 heaviest edges of ``H`` as ``((u, v), weight)`` pairs.

    ``H`` is the module-level weighted graph in which an edge's ``weight``
    counts how often the two people appeared together.

    Returns:
        A list of at most 100 ``((u, v), weight)`` tuples sorted by
        descending weight.
    """
    # Index into the (u, v, data) edge tuple instead of unpacking it in the
    # lambda signature, which is a syntax error on Python 3.
    heaviest = sorted(H.edges(data=True), key=lambda edge: -edge[2]['weight'])[:100]
    return [((u, v), data['weight']) for (u, v, data) in heaviest]