[New York Social Diary](http://www.newyorksocialdiary.com/) provides a
fascinating lens onto New York's socially well-to-do.  The data forms a natural social graph for New York's social elite.  Take a look at this page of a recent [run-of-the-mill holiday party](http://www.newyorksocialdiary.com/party-pictures/2014/holiday-dinners-and-doers).

Besides the brand-name celebrities, you will notice the photos have carefully annotated captions labeling those that appear in the photos.  We can think of this as implicitly implying a social graph: there is a connection between two individuals if they appear in a picture together.

In [1]:
%matplotlib inline
import matplotlib
import seaborn as sns
sns.set()
matplotlib.rcParams['figure.dpi'] = 144

In [1]:
import requests
import dill
from bs4 import BeautifulSoup
from datetime import datetime

In [5]:
page= requests.get('http://www.newyorksocialdiary.com/party-pictures')

Now, we process the text of the page with BeautifulSoup.

In [6]:
soup = BeautifulSoup(page.text, "lxml")

In [6]:
links = ...
links =[]
a=soup.find_all(lambda tag: tag.name == 'span' and tag.get('class') == ['field-content'])
for tag_a in a:
    link_a=tag_a.find_all('a')
    if link_a==[]:
        link_date=tag_a.text.split(', ',1)[1]
    if link_a!=[]:
        link_b=link_a[0]
        continue
    links.append((link_b['href'],link_date))

In [7]:
assert len(links) == 50

In [8]:
link = links[0]
# Check that the title and date match what you see visually.

In [9]:
def get_link_date(el):
    url='http://www.newyorksocialdiary.com'+el
    page_url= requests.get(url)
    soup_url = BeautifulSoup(page_url.text, "lxml")
    date=soup_url.find_all("div", {"class": "panel-pane pane-node-created"})
    try:
        date=date[0].text.replace("\n                        ", "")
    except:
        date='Tuesday, October 11, 2016          '
    date=date.replace("          ", "")
    date=date.split(', ', 1)[1]
    date=datetime.strptime(date, '%B %d, %Y')
    return url, date

In [10]:
def get_links(response):
    soup = BeautifulSoup(response.text, "lxml")
    links =[]
    a=soup.find_all(lambda tag: tag.name == 'span' and tag.get('class') == ['field-content'])
    for tag_a in a:
        link_a=tag_a.find_all('a')
        if link_a!=[]:            
            link_b=link_a[0]
            links.append(link_b['href'])
    list_info=[]
    for i in range(0,len(links)):        
        list_info.append(tuple(get_link_date(links[i])))
    return list_info # A list of URL, date pairs

In [11]:
def get_links(response):
    soup = BeautifulSoup(response.text, "lxml")
    list_info=[]
    a=soup.find_all(lambda tag: tag.name == 'span' and tag.get('class') == ['field-content'])
    for tag_a in a:
        link_a=tag_a.find_all('a')
        if link_a==[]:
            link_date=tag_a.text.split(', ',1)[1]
        if link_a!=[]:
            link_b=link_a[0]
            continue
        list_info.append((link_b['href'],datetime.strptime(link_date, '%B %d, %Y')))
    return list_info # A list of URL, date pairs

But we only want parties with dates on or before the first of December, 2014.  Let's write a function to filter our list of dates to those at or before a cutoff.  Using a keyword argument, we can put in a default cutoff, but allow us to test with others.

In [13]:
def filter_by_date(links, cutoff=datetime(2014, 12, 1)):
    links_new=[]
    for i in links:
        if i[1]<=cutoff:
            links_new.append(i)
    return links_new
    # Return only the elements with date <= cutoff

In [201]:
from requests_futures.sessions import FuturesSession

link_list = []
session = FuturesSession()
list_future=[]
future = session.get('http://www.newyorksocialdiary.com/party-pictures')
list_future.append(future)
for i in range(0 , 33):
    future = session.get('http://www.newyorksocialdiary.com/party-pictures?page='+str(i))
    list_future.append(future)

In [16]:
link_list=[]
for f_link in list_future:
    link_list.extend(filter_by_date(get_links(f_link.result())))

In [17]:
assert len(link_list) == 1193

In [120]:
dill.dump(link_list, open('nysd-links.pkd', 'wb'))

In [19]:
link_list = dill.load(open('nysd-links.pkd', 'rb'))

Number of party pages for the 95 months (that is, month-year pair) in the data.

In [20]:
s=0
link_list_2=[]
for i in link_list:
    s=s+1
    link_list_2.append((i[1].strftime('%b-%Y'),s-1))
d={}
for w in link_list_2[0:]:
    d[w[0]]=d.get(w[0], 0)+1
count_list=[]
for key, value in d.items():
    temp = [key,value]
    count_list.append(temp)

In [1]:
def histogram():
    s=0
    link_list_2=[]
    for i in link_list:
        s=s+1
        link_list_2.append((i[1].strftime('%b-%Y'),s-1))
    d={}
    for w in link_list_2[0:]:
        d[w[0]]=d.get(w[0], 0)+1
    count_list=[]
    for key, value in d.items():
        temp = [key,value]
        count_list.append(temp)
    return count_list  

Getting the names out of captions for a given page.

In [124]:
captions = []
page= requests.get('http://www.newyorksocialdiary.com/party-pictures/2015/celebrating-the-neighborhood')
soup = BeautifulSoup(page.text, "lxml")
a=soup.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['photocaption'])
tag_a.find_all('a')
for tag_a in a:
    captions.append(tag_a.text)

['Oscar\n                    Mora for Valentino',
 "Eric\n                    Cohler Design & L'Olivier ",
 'Thomas\n                    M. Burak Interiors, Ltd.',
 'Whitney\n                    and James Fairchild',
 'Dominique\n                    Browning, Robert Rufino, and Naz Tesfit',
 'Gretchen\n                    and Gene Grisanti',
 'Philip\n                    and Lisa Gorrivan',
 'Elizabeth\n                    Stribling and Guy Robinson',
 ' Anne Hubbard\n                  and Michael Lonergan\xa0\n              ',
 'Adrian\n                    Benepe',
 '\n                    Jennifer\n                    Creel and  Ellen Deery',
 'Stephanie\n                    Krieger',
 'David\n                Beahm with Sherri and Larry Babbio',
 ' Nonie Sullivan\n                  and John Sullivan\xa0\n              ',
 'Dylan\n                    Lauren next to her table',
 'John\n                    Thain',
 'Donald\n                    Young',
 'Dominique\n                    Bro

In [637]:
#t = soup.find(class_="label", text=lambda s: "Fiscal" in s and "year" not in s)
mydf2.loc[p,'3']=names_in_line((sas','sas'),('sas','sasasas'))

In [23]:
len(captions)

109

In [4]:
import dill
link_list = dill.load(open('nysd-links.pkd', 'rb'))

In [9]:
import re
import itertools  
import networkx as nx
from itertools import combinations
import requests

#save all captions
path_list=[]
for i in link_list:
    path_list.append(i[0])
captions_all=[]
for path in path_list:
    url='http://www.newyorksocialdiary.com'+path
    page_url= requests.get(url)
    soup_url = BeautifulSoup(page_url.text, "lxml")
    soup_captions=soup_url.select('div[class*="caption"]')
    if soup_captions==[]:
        soup_captions=soup_url.select('td[class*="caption"]')
        if soup_captions==[]:
            soup_captions=soup_url.select('font[face*="Verdana"]')
#    tag_a.find_all('a')
    if soup_captions==[]: print(url)
    p=0
    for tag_a in soup_captions:
        if tag_a.text is '': continue
        if p==0:
            p=1
            continue
        if tag_a.text!=[]: captions_all.append(tag_a.text)
captions_all

http://www.newyorksocialdiary.com/nysd/partypictures


KeyboardInterrupt: 

In [219]:

#save all captions + headers
path_list=[]
for i in link_list:
    path_list.append(i[0])
headers=[]
captions_all=[]
for path in path_list:
    url='http://www.newyorksocialdiary.com'+path
    page_url= requests.get(url)
    soup_url = BeautifulSoup(page_url.text, "lxml")
    soup_captions=soup_url.select('div[class*="caption"]')
    if soup_captions==[]:
        soup_captions=soup_url.select('td[class*="caption"]')
        if soup_captions==[]:
            soup_captions=soup_url.select('font[face*="Verdana"]')
    tag_a.find_all('a')
    if soup_captions==[]: print(url)
    p=0
    for tag_a in soup_captions:
        if tag_a.text is '': continue
        if p==0:
            p=1
            headers.append(tag_a.text)
        captions_all.append(tag_a.text)
captions_all

[]

In [220]:
#future + headers + all captions
path_list=[]
for i in link_list:
    path_list.append(i[0])
#first_page:
path_list=["/party-pictures/2015/celebrating-the-neighborhood"]
headers=[]
captions_all=[]
for path in path_list:
    url='http://www.newyorksocialdiary.com'+path
    future_url= session.get(url)
    page_url=future_url.result()
    soup_url = BeautifulSoup(page_url.text, "lxml")
    soup_captions=soup_url.select('div[class*="caption"]')
    if soup_captions==[]:
        soup_captions=soup_url.select('td[class*="caption"]')
        if soup_captions==[]:
            soup_captions=soup_url.select('font[face*="Verdana"]')
    tag_a.find_all('a')
    if soup_captions==[]: print(url)
    p=0
    for tag_a in soup_captions:
        if tag_a.text is '': continue
        if p==0:
            p=1
            headers.append(tag_a.text)
            continue
        captions_all.append(tag_a.text)


In [355]:
dill.dump(captions_all, open('captions_all_nohead.pkd', 'wb'))

In [753]:
dill.dump(mydf, open('mydf.pkd', 'wb'))

In [802]:
mydf = dill.load(open('mydf.pkd', 'rb'))

In [251]:
import dill
captions_all = dill.load(open('captions_all.pkd', 'rb'))

By our count, there are about 110.  But if you're off by a couple, you're probably okay.

In [252]:
import pandas as pd
import re
captions2=[]
for i in captions_all:
    i=re.sub(r'[^A-Za-z .,'']', '', i)  
    captions2.append(i.strip(' \t\n\r'))
      
captions2=[x for x in captions2 if x!='']
data = {'1': captions2}
mydf = pd.DataFrame.from_dict(data)
mydf.loc[:,'2'] = None
print(mydf['2'][99])
len(captions2)


None


74888

In [56]:
#check lengh change
data = {'before': [None], 'after':[None], 'diff':[None]}
mydf_3 = pd.DataFrame.from_dict(data)
mydf_3.loc[:,'before'] = None
mydf_3.loc[:,'after'] = None
mydf_3['before']
s=0
for i in trimed_captions_all:
    coupled=''
    mydf_3.loc[s,'before']=i
    for j in names_in_line(i):
        coupled=coupled+', '+" ".join(j)
    mydf_3.loc[s,'after']=coupled  
    mydf_3.loc[s,'diff']=len(i)-len(coupled)
    s+=1

In [59]:
mydf_3

Unnamed: 0,before,after,diff
0,"The scene, IDEAL School Academys thAnnual Gala",", School Academys",29
1,"Les Lieberman, Barri Lieberman, Isabel Kallman...",", Les Lieberm, Barri Lieberm, Isabel Kallm, Tr...",7
2,Chuck Grodin,", Chuck Grod",0
3,"Diana Rosario, Ali Sussman, Sarah Boll, Jen Za...",", Diana Rosario, Ali Sussm, Sarah Boll, Jen Za...",6
4,Kelly and Tom Murro,", Tom Murro, Kelly Murro",-5
5,Udo Spreitzenbarth,", Udo Spreitzenbar",0
6,"Ron Iervolino, Trish Iervolino, Russ Middleton...",", Ron Iervolino, Trish Iervolino, Russ Middlet...",2
7,"Barbara Loughlin, , Gerald Loughlin and Debbie...",", Barbara Loughl, Gerald Loughlin, Debbie Gelsto",6
8,Julianne Michelle,", Julianne Michelle",-2
9,"Heather Robinson, Kiwan Nichols, Jimmy Nichols...",", Heather Robinso, Kiwan Nichols, Jimmy Nichol...",4


In [83]:

mydf_3_sorted=mydf_3.sort_values("diff",ascending=[False]).reset_index()
mydf_3_sorted['before'][15]


'Angela Lansbury will make Tony Awards history if she lands a Best Featured Actress prize shell tie Julie Harris for,most acting awards'

In [64]:
mydf3=mydf.drop('2',1)
mydf.pivot_table(index='diff',aggfunc=np.len)

NameError: name 'np' is not defined

In [774]:
cap_line='kl, ohkjgjh Hlkjh Kjhhg jjg Khgjh Igjhh'
list_names=[]
bad_words=['The','Zombie','Private', 'Art', 'Consultant','Executive', 'Vice', 'President',
           'Board', 'Trustee', 'Senior' ,'Partner','External', 'Relations',
           'French','Heritage', 'Society']
for j in re.split(',',cap_line):
    if j=='': continue
    sp=[]
#        if ("'s" in j):
#            j=j.split("'s", 1)[0]
    for bad_word in bad_words:
        if bad_word in j: j=re.sub(bad_word, '', j)
    j=j.lstrip(' and ').rstrip(' and ').rstrip(' and').lstrip('and ')
    j=j.lstrip(' with ').rstrip(' with ').rstrip(' with').lstrip('with ')
    j=re.sub(' +', ' ', j)
    j.lstrip(' ').rstrip(' ')
    if ('and ' in j) or (' and' in j) or (' and ' in j) or ('with ' in j) or (' with' in j) or (' with ' in j):
        if ('and ' in j): my_separator= 'and '
        if (' and' in j): my_separator= ' and'
        if (' and ' in j): my_separator= ' and '
        if ('with ' in j): my_separator= 'with '
        if (' with' in j): my_separator= ' with'
        if (' with ' in j): my_separator= ' with '
        for sp_w in re.split(my_separator,j):
            sp.append(sp_w.rstrip(' ').lstrip(' '))
            sp=list(filter(None, sp))
        if len(sp)==1: continue
        if len(re.split(' ',sp[0]))>1:
            for m in sp:
                list_names.extend(names_in_part(m))
        elif (len(re.split(' ',sp[0]))==1) & (len(re.split(' ',sp[1]))>1):
            m1 = sp[0:1]
            m2 = re.split(' ',sp[1])
            list_names.extend([m2])
            m1.extend(re.split(' ',sp[1])[1:])
            m1=[tuple(m1)]
            list_names.extend(m1)
    else:
        print(j)
        if names_in_part(j) not in list_names:
            list_names.extend(names_in_part(j))
list_names=list(k for k,_ in itertools.groupby(list_names))
list_names

kl
ohkjgjh Hlkjh Kjhhg jjg Khgjh Igj


[('Hlkjh', 'Kjhhg'), ('Khgjh', 'Igj')]

In [93]:
import numpy as np
#mydf[mydf['1'] != mydf['2']]
#mydf2['1'][4]==mydf2['1'][4]
#mydf2['1'][1]
#mydf2
mydf.groupby('1').sum()
mydf.loc[:,'3']=1
my_pivpt=pd.pivot_table(mydf,index=["1"],values = '3',aggfunc=np.sum).sort_values(by='3', ascending=False, na_position='first')
my_pivpt

Unnamed: 0_level_0,3
1,Unnamed: 1_level_1
Click here for NYSD Contents,826
Jean Shafiroff,36
Tory Burch,32
Alexandra Lebenthal,32
Gillian Miniter,31
Amy Fine Collins,30
Debbie Bancroft,29
Jamee and Peter Gregory,29
Jennifer Creel,29
Dayssi Olarte de Kanavos,28


In [22]:
mygroup=mydf.groupby('1').sum()
mygroup['2']

1
,                                                                                                                                                                                                                    0
, , Tony Ingrao                                                                                                                                                                                                      0
, , and                                                                                                                                                                                                              0
, Candi Crawford, , , and Michelle Hunt                                                                                                                                                                              0
, Chris Bernard, and Drew Schiff                                                                                                          

In [782]:
mydf2.loc[:,'3'] = None
p=0
for i in mydf2['2'][:]:
    coupled=''
    for j in names_in_line(i):
        coupled=coupled+', '+" ".join(j)
    mydf2.loc[p,'3']=coupled
    p+=1

TypeError: expected string or bytes-like object

In [9]:
trimed_captions_all
        

['The scene, IDEAL School Academys thAnnual Gala',
 'Les Lieberman, Barri Lieberman, Isabel Kallman, Trish Iervolino and Ron Iervolino',
 'Chuck Grodin',
 'Diana Rosario, Ali Sussman, Sarah Boll, Jen Zaleski, Alysse Brennan and Lindsay Macbeth',
 'Kelly and Tom Murro',
 'Udo Spreitzenbarth',
 'Ron Iervolino, Trish Iervolino, Russ Middleton and Lisa Middleton',
 'Barbara Loughlin, , Gerald Loughlin and Debbie Gelston',
 'Julianne Michelle',
 'Heather Robinson, Kiwan Nichols, Jimmy Nichols, Melanie Carbone and Nancy Brown',
 'Bill Mack and Les Lieberman',
 'David Lyden and Patricia Sorenson',
 'Jimmy Cayne, Vince Tese and Pat Cayne',
 'Stuart Oran, Les Lieberman and Hilary Oran',
 'Vince Tese and Chuck Grodin',
 'Dwight Gooden and Les Lieberman',
 'Amy CunninghamBussel, Ray Mirra and , Tyler Janovitz',
 'Dan Shedrick and Samara Heafitz',
 'Cass and Jason Adelman',
 'Bart Scott and Mark Laplander',
 'Mitch Rubin, Audra Zuckerman, Michelle Smith, Kenneth Mehlman, Julia Harquail and John Ha

In [555]:
trimed='Dr. David Lyden and Patricia Sorenson'
trimed=re.sub(r'\s+of |\s+with |\s+at |\s+the |.*The',',',trimed)

trimed=re.sub(r'\s+from |\s+for ',',',trimed)
trimed=re.sub(r'\d+',',',trimed)
trimed=re.sub('\#|\$',',',trimed)
trimed=re.sub('\&|, and',' and',trimed)
#if len(caption)>100: print(caption)
#        if '"' in caption: print (caption)
trimed=re.sub('Mrs.|Mr.|Ms.|Dr.',',',trimed)
#trimed=re.sub(r'\(.*\)', '', trimed) longest match
trimed=re.sub(r'\".+?\"', '', trimed)    
trimed=re.sub('"', '', trimed)
trimed=re.sub(r'\(.+?\)', ',', trimed)        
trimed=trimed.strip(' \t\n\r')
#        trimed=re.sub(r'[^\S+]', ' ', trimed)
trimed=re.sub(r'[^A-Za-z .,'']', ' ', trimed)
trimed=trimed.strip(', .')
trimed=re.sub('\s+', ' ', trimed)

'Dr. David Lyden and Patricia Sorenson'

In [726]:
'sdsdsvvv /~~! ?? ??'.strip(' .?\"\'')

'sdsdsvvv /~~!'

In [253]:
#caption trimming

def trim_captions(list_of_captions):
    trimed_captions = []
#    list_of_captions=[x.encode('utf-8') for x in list_of_captions]
    p=0
    for caption in list_of_captions:
        if ('Click here') in caption:
            continue
        trimed=caption
        if len(caption)>250:
            continue
        if 'a friend' in caption:
            continue
        if 'Photographs by' in caption:
            continue
#        trimed=re.sub(r'\s+ofs+|\s+withs+|\s+ats+|\s+the |\s+The ',',',trimed)
#        trimed=re.sub(r'\s+from |\s+for ',',',trimed)
        trimed=re.sub(r'\d+',',',trimed)
        trimed=re.sub('\#|\$',',',trimed)
        trimed=re.sub('\&|, and',' and',trimed)
        #if len(caption)>100: print(caption)
#        if '"' in caption: print (caption)
        trimed=re.sub('Dr.',',',trimed)
#        trimed=re.sub('Mrs.|Mr.|Ms.|Dr.',',',trimed)
        #trimed=re.sub(r'\(.*\)', '', trimed) longest match
        trimed=re.sub(r'\".+?\"', '', trimed)    
        trimed=re.sub('"', '', trimed)
        trimed=re.sub(r'\(.+?\)', ',', trimed)        
        trimed=trimed.strip(' \t\n\r')
#        trimed=re.sub(r'[^\S+]', ' ', trimed)
        trimed=re.sub(r'[^A-Za-z .,'']', ' ', trimed)
        trimed=trimed.strip(', .')
        trimed=re.sub('\s+', ' ', trimed)
#        if '\(' in caption: print(caption)
#        try:
 #           if (trimed!='') & (not trimed[0].isalpha()): print(trimed)
  #      except:
#            print(trimed)
#        if not trimed.isalpha(): print(trimed)
        if trimed!='':
            trimed_captions.append(trimed)
#            mydf.loc[(p,'2')]=trimed
        p+=1
    #print (p)
    return trimed_captions
trimed_captions_all=trim_captions(captions2)
len(trimed_captions_all)
#trimed_captions_all

73791

In [128]:
#check Shafiroff
p=0
my_short_list_trimed=[]
my_short_list_called=[]
for i in trimed_captions_all:
    if 'Jean Shafiroff' in i:
        coupled=''   
        for j in names_in_line(i):
            coupled=coupled+' '+" ".join(j[:])
        my_short_list_trimed.append(i)
        my_short_list_called.append(coupled)
        p+=1
my_short_list_called

[' Cochairs Jean Shafiroff CeCe Black Ursula Lowerre Ann Van Ness Kazie Harvey Deborah Royce',
 ' Alex Donner Jean Shafiroff Geoffrey Bradfiel',
 ' Richard Ledes Jean Shafiroff Paola Miel',
 ' Jean Shafiroff Sonja Morgan Nancy Gehm',
 ' Jean Shafiroff Nancy Gehm',
 ' Anka Palitz Jean Shafiroff Lucia Hwong Gordo',
 ' Lucia Hwong Gordon Jean Shafiroff',
 ' Jean Shafiroff Prince Dmitr',
 ' Jean Shafiroff Joan Hornig Amelia Ogunles',
 ' Jean Shafiroff Erik Bottcher',
 ' Jean Shafiroff Elena Volg',
 ' Jean Shafiroff Patricia Shia',
 ' Jean Shafiroff Jana Bullock Sheila Rosenblum Jill Zar Beth Stern Trish Burke',
 ' Emily Lev Jean Shafiroff Julie Ratner',
 ' Jean Shafiroff Rita Cosby Kathy Reilly',
 ' Sharon Bus Jean Shafiroff Cornelia Sharpe Lucia Hwong Gordo Katharina OttoBernstein Liliana Cavendis',
 ' Jean Shafiroff Ike Ude',
 ' Jean Shafiroff',
 ' Martin Shafiroff Jean Shafiroff',
 ' Georgina Bloomberg Michael Bloomberg Jean Shafiroff',
 ' Pamela Morg Jean Shafiroff Arlene Lazare Wendy 

In [127]:
my_short_list_trimed

['Cochairs Jean Shafiroff, CeCe Black, Ursula Lowerre, Ann Van Ness, Kazie Harvey and Deborah Royce',
 'Alex Donner, Jean Shafiroff and Geoffrey Bradfield',
 'Richard Ledes, Jean Shafiroff and Paola Mieli',
 'Jean Shafiroff, Sonja Morgan and Nancy Gehman',
 'Jean Shafiroff and Nancy Gehman',
 'Anka Palitz, Jean Shafiroff and Lucia Hwong Gordon',
 'Lucia Hwong Gordon and Jean Shafiroff',
 'Jean Shafiroff and Prince Dmitri,Yugoslavia',
 'Jean Shafiroff, Joan Hornig and Amelia Ogunlesi',
 'Jean Shafiroff and Erik Bottcher',
 'Jean Shafiroff and Elena Volgin',
 'Jean Shafiroff and Patricia Shiah',
 'Jean Shafiroff, Jana Bullock, Sheila Rosenblum, Jill Zarin, Beth Stern and Trish Burke',
 'Emily Levin, Jean Shafiroff and Julie Ratner',
 'Jean Shafiroff, Rita Cosby and Kathy Reilly',
 'Sharon Bush, Jean Shafiroff, Cornelia Sharpe, Lucia Hwong Gordon, Katharina OttoBernstein and Liliana Cavendish',
 'Jean Shafiroff and Ike Ude',
 'Jean Shafiroff',
 'Martin Shafiroff and Jean Shafiroff',
 'Geo

In [125]:
.join(' ',('Jean', 'Shafiroff'))

SyntaxError: invalid syntax (<ipython-input-125-d105196a5a87>, line 1)

In [732]:
for i in trimed_captions_all:
    if 'friend' in i: print(i)

Leah Aden,friends
Amsale Aberra, Amelia Ogunleis,and friends
Tasha Smith and friend
Sandrine and Rod Kukurudz, Florian Boggia,Jeepers Champagne and friend
Della Rounick, Bill Sclight, Lucia Hwong Gordon and friend
Catherine Malandrino and Kelly Rutherford,friends
Chips Page and friends
Emdens and Falkenbergs and friends
Christina Rose and friends
Lisa and Tom Wilkenson and friends
Tina Flaherty and friend
du Maurier and friend
Robert Bradford, Barbara Taylor Bradford and friend
Antony Todd and friend
Barbara Gladstone and friends
Margo Nederlander and friends
Robert Matheson squires his grandmother, Ruth Buchanan and her favorite dachshund, Cinderella, across,porch to receive guests,an event,friends and family to learn more about,work,leading national nonprofit Compassion and Choices
Ruth Buchanan chats on her porch,Carolyn du Pont, a life long friend,her daughter Bonnie Matheson,their Holton Arms days
New York based artist and longtime family friends Kit Forrestal, Vivian Spencer and 

In [None]:
        trimed=re.sub(r'\b[0-9]+\b\s*', '', i)
        trimed=re.sub('\(','',trimed)
        trimed=re.sub('\)','',trimed)
        trimed=re.sub('\"','',trimed)
        trimed=re.sub('\(','',trimed)
        trimed=re.sub('\?','',trimed)
        trimed=re.sub('\n','',trimed).lstrip(' .+%=/-#$;!\(!\&=&:%;').rstrip(' .1234567890+%=-#$;!\/\(!\&=&:%;')
        trimed=re.sub('\t','',trimed).lstrip(' .+%=/-#$;!\(!\&=&:%;').rstrip('  .1234567890+%=-#$;!\/\(!\&=&:%;')

        trimed=trimed.lstrip(' and ').rstrip(' and ').rstrip(' and').lstrip('and ')
        trimed=trimed.lstrip(' with ').rstrip(' with ').rstrip(' with').lstrip('with ')
        # has photographed by
        # is long
        # and with
        start with The
        , at ends
        Ginna le Vine
        de la
        separate by numbers

In [133]:
def names_in_part(caption_part):
    list_names=[]
    p=0
    name=[]
    f=0
    caption_part=re.sub(' +', ' ', caption_part)
    caption_part= caption_part.strip(', ')
    len_cp=len(re.split(' ',caption_part))
    for i in re.split(' ',caption_part):
        if len_cp>5: break
        f=f+1
        if len(i)<2:
            if p>=2: 
                if name!=[]: list_names.append(tuple(name))
        elif (i[0].isupper()) & (not i[1].isupper()) & (f<len_cp):
            name.append(i)
            p=p+1

            continue
        elif (i[0].isupper()) & (not i[1].isupper()) & (f==len_cp)  & (p!=0):
            name.append(i)
            if p>=1: 
                if name!=[]: list_names.append(tuple(name))
            p=0
        else:
            if p>=2:
                if name!=[]: list_names.append(tuple(name))
        p=0
        name=[]
        
    return list_names
names_in_part("Fe Fendi")

[('Fe', 'Fendi')]

In [183]:
def names_in_line(cap_line):
    list_names=[]
    bad_words=['The','Zombie','Private', 'Art', 'Consultant','Executive', 'Vice', 'President',
               'Board', 'Trustee', 'Senior' ,'Partner','External', 'Relations',
                'French','Heritage', 'Society']
    for j in re.split(',',cap_line):
        if j=='': continue
        sp=[]
    #        if ("'s" in j):
    #            j=j.split("'s", 1)[0]
        for bad_word in bad_words:
            if bad_word in j: j=re.sub(bad_word, '', j)
        regex1 = re.compile(r'^\s*and\s+|^\s*with\s+|\s+and\s*$|\s+with\s*$')
        regex2 = re.compile(r'\s+and\s+|\s+with\s+|\s+and\s+|\s+with\s+')
        j=re.sub(regex1,",",j)
        j=re.sub(' +', ' ', j)
        j.lstrip(' ').rstrip(' ')
        if re.search(regex2,j):
            for sp_w in re.split(regex2,j):                

                sp.append(sp_w.rstrip(' ').lstrip(' '))
                sp=list(filter(None, sp))
            if len(sp)==1: continue
            if len(re.split(' ',sp[0]))>1:
                for m in sp:
                    list_names.extend(names_in_part(m))
            elif (len(re.split(' ',sp[0]))==1) & (len(re.split(' ',sp[1]))>1):
                m1 = sp[0:1]
                m2 = re.split(' ',sp[1])
                list_names.extend([m2])
                m1.extend(re.split(' ',sp[1])[1:])
                m1=[tuple(m1)]
                list_names.extend(m1)
        else:
            if names_in_part(j) not in list_names:
                list_names.extend(names_in_part(j))
    list_names=list(k for k,_ in itertools.groupby(list_names))

    return list_names
            
names_in_line('Fe Fendi, Jean Shafiroff and Chiuti Jansen')

[('Fe', 'Fendi'), ('Jean', 'Shafiroff'), ('Chiuti', 'Jansen')]

In [182]:
cap_line='Fe Fendi, Jean Shafiroff and Chiuti Jansen'
list_names=[]
bad_words=['The','Zombie','Private', 'Art', 'Consultant','Executive', 'Vice', 'President',
           'Board', 'Trustee', 'Senior' ,'Partner','External', 'Relations',
            'French','Heritage', 'Society']
for j in re.split(',',cap_line):
    if j=='': continue
    sp=[]
#        if ("'s" in j):
#            j=j.split("'s", 1)[0]
    for bad_word in bad_words:
        if bad_word in j: j=re.sub(bad_word, '', j)
    regex1 = re.compile(r'^\s*and\s+|^\s*with\s+|\s+and\s*$|\s+with\s*$')
    regex2 = re.compile(r'\s+and\s+|\s+with\s+|\s+and\s+|\s+with\s+')
    j=re.sub(regex1,",",j)
    j=re.sub(' +', ' ', j)
    j.lstrip(' ').rstrip(' ')
    if re.search(regex2,j):
        for sp_w in re.split(regex2,j):                
            
            sp.append(sp_w.rstrip(' ').lstrip(' '))
            sp=list(filter(None, sp))
        if len(sp)==1: continue
        if len(re.split(' ',sp[0]))>1:
            for m in sp:
                list_names.extend(names_in_part(m))
        elif (len(re.split(' ',sp[0]))==1) & (len(re.split(' ',sp[1]))>1):
            m1 = sp[0:1]
            m2 = re.split(' ',sp[1])
            list_names.extend([m2])
            m1.extend(re.split(' ',sp[1])[1:])
            m1=[tuple(m1)]
            list_names.extend(m1)
    else:
        if names_in_part(j) not in list_names:
            list_names.extend(names_in_part(j))
list_names=list(k for k,_ in itertools.groupby(list_names))

list_names


Fe Fendi
Fe Fendi
Fe Fendi
 Jean Shafiroff and Chiuti Jansen
 Jean Shafiroff and Chiuti Jansen


[('Fe', 'Fendi'), ('Jean', 'Shafiroff'), ('Chiuti', 'Jansen')]

In [179]:
list_names=[]

In [254]:
#get names from trimmed captions

def get_names_from_trimed(caption_list):
    list_names=[]
    for cap_line in caption_list:
        names_in_cap=names_in_line(cap_line)
        if names_in_cap==[]: continue
        list_names_in_cap=[]
        for name_cap in names_in_cap:
            list_names_in_cap.append(' '.join(name_cap))
        list_names.append(tuple(list_names_in_cap))
    return list_names
        
list_all_names_grouped=get_names_from_trimed(trimed_captions_all)
#list_all_names_grouped

In [255]:
list_all_names_grouped

[('Les Lieberman',
  'Barri Lieberman',
  'Isabel Kallman',
  'Trish Iervolino',
  'Ron Iervolino'),
 ('Chuck Grodin',),
 ('Diana Rosario',
  'Ali Sussman',
  'Sarah Boll',
  'Jen Zaleski',
  'Alysse Brennan',
  'Lindsay Macbeth'),
 ('Tom Murro', 'Kelly Murro'),
 ('Udo Spreitzenbarth',),
 ('Ron Iervolino', 'Trish Iervolino', 'Russ Middleton', 'Lisa Middleton'),
 ('Barbara Loughlin', 'Gerald Loughlin', 'Debbie Gelston'),
 ('Julianne Michelle',),
 ('Heather Robinson',
  'Kiwan Nichols',
  'Jimmy Nichols',
  'Melanie Carbone',
  'Nancy Brown'),
 ('Bill Mack', 'Les Lieberman'),
 ('David Lyden', 'Patricia Sorenson'),
 ('Jimmy Cayne', 'Vince Tese', 'Pat Cayne'),
 ('Stuart Oran', 'Les Lieberman', 'Hilary Oran'),
 ('Vince Tese', 'Chuck Grodin'),
 ('Dwight Gooden', 'Les Lieberman'),
 ('Amy CunninghamBussel', 'Ray Mirra', 'Tyler Janovitz'),
 ('Dan Shedrick', 'Samara Heafitz'),
 ('Jason Adelman', 'Cass Adelman'),
 ('Bart Scott', 'Mark Laplander'),
 ('Mitch Rubin',
  'Audra Zuckerman',
  'Michelle

In [1]:
import itertools  # itertools.combinations may be useful
import networkx as nx
from itertools import combinations
G = nx.Graph()
for clique in list_all_names_grouped:
    for vertices in combinations(clique, r=2):
        if vertices[0]==vertices[1]: continue
        if (G.has_edge(*vertices)):
            G.add_edge(*vertices)
            G[vertices[0]][vertices[1]]['weight'] += 1
        else:
            G.add_edge(*vertices,weight=1)
print(G.degree('Jean Shafiroff',weight='weight'))
weight_per_edge=list(G.edges(data='weight'))
d={}
for w in weight_per_edge:
    w_id=(w[0:2])
    d[w_id]=w[2]
weight_per_edge = sorted(d.items(), key=lambda kv: kv[1], reverse=True)
weight_per_edge[0:100]

NameError: name 'list_all_names_grouped' is not defined

In [25]:
import re
import itertools  # itertools.combinations may be useful
import networkx as nx
from itertools import combinations
def get_captions(path):
    captions = []
    url='http://www.newyorksocialdiary.com'+path
    page_url= requests.get(url)
    soup_url = BeautifulSoup(page_url.text, "lxml")
    soup_captions=soup_url.select('div[class*="caption"]')
#    soup_captions=soup_url.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['photocaption'])
    for tag_a in soup_captions:
        captions.append(tag_a.text)
    new_caps=[]
    for i in captions:
        trimed=re.sub(r'\b[0-9]+\b\s*', '', i)
        trimed=re.sub('\(','',trimed)
        trimed=re.sub('\)','',trimed)
        trimed=re.sub('\"','',trimed)
        trimed=re.sub('\(','',trimed)
        trimed=re.sub('\?','',trimed)
        trimed=re.sub('\n','',trimed).lstrip(' .+%=/-#$;!\(!\&=&:%;').rstrip(' .1234567890+%=/-#$;!\(!\&=&:%;')
        trimed=re.sub('\t','',trimed).lstrip(' .+%=/-#$;!\(!\&=&:%;').rstrip(' .1234567890+%=/-#$;!\(!\&=&:%;')
        trimed=re.sub('Mrs. ',' ',trimed)
        trimed=re.sub('Mr. ',' ',trimed)
        trimed=re.sub('Ms. ',' ',trimed)
        trimed=re.sub('Dr. ',' ',trimed)
        trimed=trimed.lstrip(' and ').rstrip(' and ').rstrip(' and').lstrip('and ')
        trimed=trimed.lstrip(' with ').rstrip(' with ').rstrip(' with').lstrip('with ')
        new_caps.append(re.sub(' +', ' ', trimed))
    return new_caps


In [26]:
my_caps_test=get_captions('/party-pictures/2010/we-are-family')
my_caps_test


['The scene at Cartier for a kick-off party for the French Heritage Society',
 'Barbara Regna, Kirk Ressler, Diana Quasha, and Peter Reg',
 'Blakely Griggs and Cindy Ketchum',
 'Christopher Spitzmiller',
 'Michael and Brent Winsto',
 'Doug Steinbrech, Amy Hoadley, and Mark Gilbertso',
 'Kathy Prounis and John Truex',
 'Richard Lambertson and John Truex',
 'Margot Takian and Mary Van Pel',
 'Caroline Coleman and Jaime Heijm',
 'Frederica and Leigh Lauder',
 'Kathy Prounis and Jaime Heijm',
 'Vicky Ward, Mary Snow, and Whitney Douglass',
 'Charles and McDowell W',
 'Ian Snow and Whitney Douglass',
 'Jackie and Chris Keber',
 'Harry Davison and Marilyn Farham',
 'Sue Chalom and Geoffrey Bradfiel',
 'Katie and Michael Evans',
 'Melanie Holland, Amy Hoadley, and Marina Killery',
 'Mercedes Desio and Alberto Villalobos',
 'Lorne Weil, Kathy Angele, and Farah and Karl Roessner',
 'Luke Parker Bowles and Jeff Sharp',
 'P. Allen Smith and Lindsey Harper',
 'Valerie Lettan, Gloria Fieldcamp, Fra

This should get the same captions as before.

In [None]:
assert captions == get_captions("/party-pictures/2015/celebrating-the-neighborhood")

In [298]:
headers


['\n\n\n\n',
 'The scene at   IDEAL School & Academy’s  10th\xa0Annual Gala.',
 "Jon Batiste and Marcus Miller at The NAACP Legal Defense and Educational Fund's  28th annual National Equal Justice Award Dinner.",
 'Fujiko Nakaya\'s "Veil" envelops the  Glass House.',
 "The Society of Memorial Sloan Kettering’s Associates Committee's  annual Fall Party at the Four Seasons Restaurant. ",
 "The scene at Michael's 25th anniversary celebration.",
 'Lifeline New York Board members',
 "The scene at Ovarian Cancer Research Fund's 20th Anniversary Legends Gala, hosted by Harry Connick Jr.",
 "The Horticultural Society of New York's  21st Annual Fall Luncheon the Metropolitan Club.",
 "The scene in the Wade Thompson Drill Hall  of the Park Avenue Armory for Park Avenue Armory's 2014 Gala Masquerade.",
 "Melanie Holland, Catherine Carey, Dee Dee Taylor Eustace, Clelia Peters, Michael Boodro, Barbara Friedmann, Frederick Warburg Peters, Jim Bunn, Karen Klopp, and Martha Glass at  The Society of Me

Parsing names:

In [None]:
"""
def names_in_part(caption_part):
    list_names=[]
    p=0
    name=[]
    f=0
    caption_part=re.sub(' +', ' ', caption_part)
    caption_part=caption_part.lstrip(' ').rstrip(' ')
    for i in re.split(' ',caption_part):
        if len(re.split(' ',caption_part))>5: break
        f=f+1
        if len(i)<2:
            continue
        elif (i[0].isupper()) & (i[1].islower()) & (f<len(re.split(' ',caption_part))):
            name.append(i)
            p=p+1
        elif (i[0].isupper()) & (i[1].islower()) & (f==len(re.split(' ',caption_part)))  & (p!=0):
            name.append(i)
            if p<=2: list_names.append(tuple(name))
            p=0
            name=[]
        elif p<=1:
            p=0
            name=[]
            continue
        else:
            if p<=2:
                list_names.append(tuple(name))
            p=0
            name=[]
            continue
    return list_names
names_in_part(" Pe)ter Kross")
"""

In [667]:
def names_in_part(caption_part):
    list_names=[]
    p=0
    name=[]
    f=0
    caption_part=re.sub(' +', ' ', caption_part)
    caption_part= caption_part.strip(', ')
    for i in re.split(' ',caption_part):
        if len(re.split(' ',caption_part))>5: break
        f=f+1
        if len(i)<2:
            continue
        elif (i[0].isupper()) & (not i[1].isupper()) & (f<len(re.split(' ',caption_part))):
            name.append(i)
            p=p+1
        elif (i[0].isupper()) & (not i[1].isupper()) & (f==len(re.split(' ',caption_part)))  & (p!=0):
            name.append(i)
            if p<=2:
                list_names.append(tuple(name))
                p=0
                name=[]
        elif p<=1:
            p=0
            name=[]
            continue
        else:
            if p<=2:
                list_names.append(tuple(name))
                p=0
                name=[]
            continue
    return list_names
names_in_part("ohkjgjh Hlkjh Kjhhg jjg Khgjh Igj")

[]

In [None]:
#[tuple(xi for xi in x if xi is not None) for x in [('Bill', 'Hamilto'), ('Peter', 'Kross'), ('Peter', 'Kross')]]
#list(k for k,_ in itertools.groupby([('Bill', 'Hamilto'), ('Peter', 'Kross'), ('Peter', 'Kross')]))

In [29]:
path_list=[]
for i in link_list:
    path_list.append(i[0])
len(path_list)


1193

In [None]:
def get_names_page(path_list):
    list_names=[]
    for path in path_list:
        caption = get_captions(path)
        for cap_line in caption:
            list_names.extend(names_in_line(cap_line))
    return list_names
        
#list_all_names=get_names_page(path_list)

In [213]:
def get_names_page_by_caption(path_list):
    list_names=[]
    for path in path_list:
        caption = get_captions(path)
        for cap_line in caption:
            names_in_cap=names_in_line(cap_line)
            if names_in_cap==[]: continue
            list_names_in_cap=[]
            for name_cap in names_in_cap:
                list_names_in_cap.append(' '.join(name_cap))
            list_names.append(tuple(list_names_in_cap))
    return list_names
        
list_all_names_grouped=get_names_page_by_caption('/party-pictures/2015/celebrating-the-neighborhood')

NameError: name 'get_captions' is not defined

In [None]:
#def get_names_page_by_caption(path_list):
#    list_names=[]
#    p=0
#    for path in path_list:
#        p=p+1
#        caption = get_captions(path)
#        for cap_line in caption:
#            names_in_cap=names_in_line(cap_line)
#            list_names_in_cap=[]
#            for name_cap in names_in_cap:
#                for name_cap in names_in_cap:                    
#                    list_names_in_cap.append(' '.join(name_cap))
#            list_names.append(tuple(list_names_in_cap+[p]))
#                
#    return list_names
#        
#list_all_names_grouped=get_names_page_by_caption(path_list)
#list_all_names_grouped=[tuple(xi for xi in x if xi is not None) for x in list_all_names_grouped]

In [None]:
#list_all_names_grouped=[tuple(xi for xi in x if xi is not None) for x in list_all_names_grouped]

In [None]:
dill.dump(list_all_names, open('list_all_names.pkd', 'wb'))

In [119]:

dill.dump(list_all_names_grouped, open('list_all_names_grouped.pkd', 'wb'))

In [None]:
list_all_names_grouped = dill.load(open('list_all_names_grouped.pkd', 'rb'))

In [199]:
from requests_futures.sessions import FuturesSession

In [242]:

        
list_all_names_grouped=get_names_from_trimed(trimed_captions_all)
#list_all_names_grouped


d1={}

for w in first_page_names:
    w=' '.join(w)
    d1[w]=d1.get(w, 0)+1
sorted_d1 = sorted(d1.keys(), key=lambda kv: kv[0], reverse=False)
sorted_d1

[]

In [249]:
d1={}

for w in list_all_names_grouped:
    group=''
    for i in w:
        d1[i]=d1.get(i, 0)+1
sorted_d1 = sorted(d1.keys(), key=lambda kv: kv[0], reverse=False)
sorted_d1

['Amory McAndrew',
 'Angela Clofine',
 'Alex Papachristidis',
 'Alberto Villalobos',
 'Andi Potamkin',
 'Alexandra Richards',
 'Anne Strickland',
 'Allison Minton',
 'Alexis Mersentes',
 'Barbara Tober',
 'Bjorn Wallander',
 'Bryna Pomp',
 'Barbara Regna',
 'Bette Midler',
 'Christopher Spitzmiller',
 'Chele Farley',
 'Caroline Dean',
 'Craig Leavitt',
 'Celia Morrissette',
 'Caroline Cokley',
 'Cynthia Adler',
 'C. Virginia Fields',
 'Clark Munnell',
 'Chips Page',
 'Diana Quasha',
 'David Duncan',
 'Debbie Bancroft',
 'David Svanda',
 'Doug Steinbrech',
 'Dennis Freedman',
 'Donald Tober',
 'Deborah Lloyd',
 'Deborah Marton',
 'David Rockwell',
 'Davon Windsor',
 'Dan Tishman',
 'Darcy Stacom',
 'Douglas Little',
 'Donna Soloway',
 'Doris Liebman',
 'Elijah DuckworthSchachter',
 'Elizabeth Swartz',
 'Ellen Niven',
 'Emily Leonard',
 'Ebony G. Patterson',
 'Esra Munnell',
 'Elizabeth Johnson',
 'Eveyln Subramaniam',
 'Freddie Leiba',
 'Geoffrey Bradfield',
 'Glenn Adamson',
 'Gina Nan

In [248]:
i

'Doris Liebman'

In [227]:
len(list_all_names)
d={}

for w in list_all_names:
    w=' '.join(w)
    d[w]=d.get(w, 0)+1
sorted_d = sorted(d.keys(), key=lambda kv: kv[0], reverse=False)
sorted_d_key= sorted(d.items(), key=lambda kv: kv[1], reverse=True)
sorted_d_key

NameError: name 'list_all_names' is not defined

In [228]:
list_all_names_grouped

[('Randy Takian',),
 ('Kamie Lightburn', 'Christopher Spitzmiller'),
 ('Christopher Spitzmiller', 'Diana Quasha'),
 ('Mariam Azarm', 'Sana Sabbagh', 'Lynette Dallas'),
 ('Christopher Spitzmiller', 'Sydney Shuman', 'Matthew Bees'),
 ('Christopher Spitzmiller', 'Tom Edelman'),
 ('Warren Scharf', 'Sydney Shuman'),
 ('Amory McAndrew', 'Sean McAndrew'),
 ('Sydney Shuman', 'Mario Buatta', 'Helene Tilney'),
 ('Katherine DeConti', 'Elijah DuckworthSchachter'),
 ('John Rosselli', 'Elizabeth Swartz'),
 ('Stephen Simcock', 'Lee Strock', 'Thomas Hammer'),
 ('Richard Lightburn', 'Michel Witmer'),
 ('Jennifer Cacioppo', 'Kevin Michael Barba'),
 ('Virginia Wilbanks', 'Lacary Sharpe'),
 ('Valentin Hernandez', 'Yaz Hernandez', 'Chele Farley', 'James Farley'),
 ('Harry Heissmann', 'Angela Clofine', 'Michael Clofine'),
 ('Jared Goss', 'Kristina Stewart Ward'),
 ('Alex Papachristidis', 'Mario Buatta'),
 ('Nick Olsen', 'Lindsey Coral Harper', 'Alberto Villalobos', 'David Duncan'),
 ('Caroline Dean', 'Chris

Find pairs:

In [208]:
import itertools  # itertools.combinations may be useful
import networkx as nx
from itertools import combinations

G.clear()
G = nx.Graph()

for clique in list_all_names_grouped:
    for vertices in combinations(clique, r=2):
        if vertices[0]==vertices[1]: continue
        if (G.has_edge(*vertices)):
            G[vertices[0]][vertices[1]]['weight'] += 1
        else:
            G.add_edge(*vertices,weight=1)

In [209]:
#G.edges(data='weight')
print(G.degree('Jean Shafiroff',weight='weight'))

[]


In [257]:
edges_per_node=[]
for i in G:
#    edges_per_node.append(tuple([i,2*len(G.edges(i))]))
     edges_per_node.append(tuple([i,G.degree(i,weight='weight')]))
d={}
for w in edges_per_node:
    d[w[0]]=w[1]
edges_per_node_sorted = sorted(d.items(), key=lambda kv: kv[1], reverse=True)
edges_per_node_sorted[0:100]


[('Jean Shafiroff', 468),
 ('Mark Gilbertson', 361),
 ('Gillian Miniter', 354),
 ('Alexandra Lebenthal', 279),
 ('Geoffrey Bradfield', 267),
 ('Somers Farkas', 235),
 ('Yaz Hernandez', 213),
 ('Andrew Saffir', 208),
 ('Debbie Bancroft', 205),
 ('Kamie Lightburn', 199),
 ('Eleanora Kennedy', 197),
 ('Alina Cho', 197),
 ('Sharon Bush', 188),
 ('Jamee Gregory', 181),
 ('Lydia Fenet', 173),
 ('Bonnie Comley', 171),
 ('Mario Buatta', 170),
 ('Lucia Hwong Gordon', 166),
 ('Allison Aston', 164),
 ('Muffie Potter Aston', 162),
 ('Patrick McMullan', 151),
 ('Stewart Lane', 147),
 ('Deborah Norville', 145),
 ('Bettina Zilkha', 143),
 ('Karen LeFrak', 142),
 ('Barbara Tober', 141),
 ('Audrey Gruss', 136),
 ('Martha Stewart', 134),
 ('Grace Meigher', 132),
 ('Daniel Benedict', 130),
 ('Liz Peek', 129),
 ('Roric Tobin', 128),
 ('Adelina Wong Ettelson', 127),
 ('Diana Taylor', 126),
 ('Gregory Long', 126),
 ('Rosanna Scotto', 126),
 ('Kipton Cronkite', 125),
 ('Nicole Miller', 123),
 ('Fe Fendi', 12

In [260]:
pr = nx.pagerank(G, alpha=0.85)
pagerank_per_node_sorted = sorted(pr.items(), key=lambda kv: kv[1], reverse=True)
pagerank_per_node_sorted[0:100]

[('Jean Shafiroff', 0.0007020864722412184),
 ('Mark Gilbertson', 0.0005736198498324358),
 ('Gillian Miniter', 0.00048365833382207424),
 ('Geoffrey Bradfield', 0.00042423715021890133),
 ('Alexandra Lebenthal', 0.00041870645245864626),
 ('Andrew Saffir', 0.0003574561315646781),
 ('Somers Farkas', 0.00034659298412262243),
 ('Debbie Bancroft', 0.00032188929214430376),
 ('Kamie Lightburn', 0.0003218840904881546),
 ('Sharon Bush', 0.00031757092049361176),
 ('Yaz Hernandez', 0.0003174395640984259),
 ('Alina Cho', 0.0003127404469189678),
 ('Mario Buatta', 0.0003114961709569636),
 ('Lydia Fenet', 0.00027634627105305496),
 ('Eleanora Kennedy', 0.0002723656230131856),
 ('Lucia Hwong Gordon', 0.0002722913531306792),
 ('Patrick McMullan', 0.00026683161577304554),
 ('Barbara Tober', 0.00024896655700153207),
 ('Allison Aston', 0.0002441946061226163),
 ('Martha Stewart', 0.00023989092259561084),
 ('Jamee Gregory', 0.00023970770657321362),
 ('Kipton Cronkite', 0.00023656809345816464),
 ('Bonnie Comley'

Find best friends:

In [None]:
import itertools  # itertools.combinations may be useful
import networkx as nx
from itertools import combinations

G = nx.Graph()
d={}
for clique in list_all_names_grouped:
    p_new=clique[len(clique)-1]
    for vertices in combinations(clique[:len(clique)-1], r=2):
        pair_id=' '.join(vertices)
        if vertices[0]==vertices[1]: continue
        if pair_id not in d: d[pair_id]=[0,0]
        if (G.has_edge(*vertices)) & (p_new!=d[pair_id][1]):
            G[vertices[0]][vertices[1]]['weight'] += 1
            d[pair_id][0] +=1
            d[pair_id][1]=p_new
        elif (G.has_edge(*vertices)) & (p_new==p_old):
            continue
        else:
            d[pair_id][0] +=1
            d[pair_id][1]=p_new
            G.add_edge(*vertices,weight=1)


In [283]:


G = nx.Graph()

for clique in list_all_names_grouped:
    for vertices in combinations(clique, r=2):
        if vertices[0]==vertices[1]: continue
        if (G.has_edge(*vertices)):
            G[vertices[0]][vertices[1]]['weight'] += 1
        else:
            G.add_edge(*vertices,weight=1)

368


[(('Gillian Miniter', 'Sylvester Miniter'), 72),
 (('Bonnie Comley', 'Stewart Lane'), 53),
 (('Jamee Gregory', 'Peter Gregory'), 53),
 (('Daniel Benedic', 'Andrew Saffir'), 43),
 (('Barbara Tober', 'Donald Tober'), 37),
 (('Jean Shafiroff', 'Martin Shafiroff'), 32),
 (('Grace Meigher', 'Chris Meigher'), 31),
 (('Chappy Morris', 'Melissa Morris'), 30),
 (('Yaz Hernandez', 'Valentin Hernandez'), 25),
 (('Peter Reg', 'Barbara Reg'), 24),
 (('Deborah Norville', 'Karl Wellner'), 24),
 (('Hilary Geary Ross', 'Wilbur Ross'), 24),
 (('Alexandra Lebenthal', 'Jay Diamo'), 23),
 (('Olivia Palermo', 'Johannes Huebl'), 23),
 (('Coco Kopelm', 'Arie Kopelm'), 22),
 (('David Koc', 'Julia Koc'), 22),
 (('R. Couri Hay', 'Janna Bullock'), 20),
 (('Tommy Hilfiger', 'Dee Ocleppo'), 20),
 (('Geoffrey Bradfiel', 'Roric Tob'), 19),
 (('Eleanora Kennedy', 'Michael Kennedy'), 19),
 (('Nina Griscom', 'Leonel Piraino'), 19),
 (('Jay McInerney', 'Anne Hearst McInerney'), 18),
 (('Fernanda Kellogg', 'Kirk Henckels'

In [105]:
d

9

In [57]:
import statistics
statistics.stdev(weight_per_edge_data[0:100])

10.136941144032344

In [618]:
#G.edges(data='weight')
#G.edges
weight_per_edge=[]
for i in G.edges():
    weight_per_edge.append((' '.join(i),G[i[0]][i[1]]['weight']))
d1={}
for w in weight_per_edge:
    d1[w[0]]=w[1]
weight_per_edge_sorted = sorted(d1.items(), key=lambda kv: kv[1], reverse=True)
weight_per_edge_sorted[0:100]

[('Gillian Miniter Sylvester Miniter', 75),
 ('Bonnie Comley Stewart Lane', 56),
 ('Jamee Gregory Peter Gregory', 56),
 ('Daniel Benedic Andrew Saffir', 43),
 ('Jonathan Farkas Somers Farkas', 41),
 ('Barbara Tober Donald Tober', 39),
 ('Jean Shafiroff Martin Shafiroff', 35),
 ('Chappy Morris Melissa Morris', 33),
 ('Grace Meigher Chris Meigher', 32),
 ('John Catsimatidis Margo Catsimatidis', 30),
 ('Barbara Reg Peter Reg', 26),
 ('Deborah Norville Karl Wellner', 26),
 ('Alexandra Lebenthal Jay Diamo', 26),
 ('Yaz Hernandez Valentin Hernandez', 25),
 ('Hilary Geary Ross Wilbur Ross', 25),
 ('David Koc Julia Koc', 25),
 ('Eleanora Kennedy Michael Kennedy', 24),
 ('Coco Kopelm Arie Kopelm', 24),
 ('Campion Pla Tatiana Pla', 23),
 ('Olivia Palermo Johannes Huebl', 23),
 ('Geoffrey Bradfiel Roric Tob', 22),
 ('R. Couri Hay Janna Bullock', 22),
 ('Jay McInerney Anne Hearst McInerney', 22),
 ('Tommy Hilfiger Dee Ocleppo', 21),
 ('Nina Griscom Leonel Piraino', 20),
 ('Wilbur Ross Hilary Ross'

In [117]:
weight_per_edge=[]
for i in G.edges():
    weight_per_edge.append((' '.join(i),G[i[0]][i[1]]['weight']))
d1={}
for w in weight_per_edge:
    d1[w[0]]=w[1]
weight_per_edge_sorted = sorted(d1.items(), key=lambda kv: kv[1], reverse=True)
weight_per_edge_sorted

[('Gillian Miniter Sylvester Miniter', 71),
 ('Bonnie Comley Stewart Lane', 51),
 ('Jamee Gregory Peter Gregory', 49),
 ('Daniel Benedic Andrew Saffir', 42),
 ('Jonathan Farkas Somers Farkas', 35),
 ('Jean Shafiroff Martin Shafiroff', 33),
 ('Geoffrey Bradfiel Roric Tob', 32),
 ('Barbara Tober Donald Tober', 32),
 ('Grace Meigher Chris Meigher', 28),
 ('Chappy Morris Melissa Morris', 28),
 ('John Catsimatidis Margo Catsimatidis', 28),
 ('Peter Reg Barbara Reg', 26),
 ('Deborah Norville Karl Wellner', 26),
 ('Alexandra Lebenthal Jay Diamo', 26),
 ('Yaz Hernandez Valentin Hernandez', 25),
 ('Hilary Geary Ross Wilbur Ross', 25),
 ('Coco Kopelm Arie Kopelm', 24),
 ('David Koc Julia Koc', 24),
 ('Lizzie Tisc Jonathan Tisc', 23),
 ('Olivia Palermo Johannes Huebl', 23),
 ('R. Couri Hay Janna Bullock', 22),
 ('Jay McInerney Anne Hearst McInerney', 21),
 ('Eleanora Kennedy Michael Kennedy', 20),
 ('Tommy Hilfiger Dee Ocleppo', 20),
 ('Mark Badgley James Mischk', 19),
 ('Guy Robinso Elizabeth St

In [None]:
#test

path_test =['/party-pictures/2015/celebrating-the-neighborhood'] 
list_all_names_test=get_names_page_by_caption(path_test)
list_all_names_test=[tuple(xi for xi in x if xi is not None) for x in list_all_names_test]
