In [193]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import numpy as np
import urllib.parse as urlparse
import re
import spacy
nlp = spacy.load('en')

### Extracting event information from one page.

This is part of my code for a project that involved gathering/scraping data from a website.

You can read about the whole project here: https://showmikpodder.github.io/Peapod/socialite-connections.html

I suggest you read along with that link as you go through the code to get an idea of what is trying to being accomplished. 

This is just the extraction bit of the project.


In [2]:
# url of webpage that contains all links to events. 
url = 'http://www.newyorksocialdiary.com/party-pictures'

In [39]:
# there are multiple pages of events, 
# to just extract from just the first page, 
# make a page variable for use in requests method, 
# set to 0 for first page extraction (response request object)
page = 0 

# store the raw html code in events_on_page variable, 
# use bs4 prettify to make legible. 

events_on_page = requests.get(url, params={"page":""})
soup = BeautifulSoup(events_on_page.text, "lxml")

pretty_soup = soup.prettify()

# to give an idea of what the middle of the html looks like
print(pretty_soup[10000:15000])

iv class="email-subscribe">
       <form action="http://members.newyorksocialdiary.com/subscribe/subscribe.tml" method="POST">
        <input class="email" name="email" onblur="if(this.value == '') { this.value = 'Enter your email'; }" onfocus="if(this.value == 'Enter your email') { this.value = ''; }" type="text" value="Enter your email"/>
        <input class="subscribe" name="subscribe" type="submit" value="GO"/>
        <input name="list" type="hidden" value="nysdsubscribers"/>
        <input name="confirm" type="hidden" value="none"/>
        <input name="showconfirm" type="hidden" value="F"/>
        <input name="url" type="hidden" value="http://www.newyorksocialdiary.com"/>
        <input name="appendsubinfotourl" type="hidden" value="T"/>
       </form>
      </div>
     </div>
     <div class="panel-pane pane-ad pane-12 adverserve-ad advertserve-12">
      <!-- BEGIN ADVERTPRO CODE BLOCK -->
      <script type="text/javascript">
       document.write('<scr' + 'ipt src="http://

In [15]:
# party event information live in <div class=”view-rows”> tags
# bs4 find_all finds all and stores in links variable.
links = soup.find_all('div', attrs={'class':'views-row'})

print("Total events in the first page: ", len(links))

links[:1]  #what the first couple elements of links looks like

Total events in the first page:  50


[<div class="views-row views-row-1 views-row-odd views-row-first">
 <span class="views-field views-field-title"> <span class="field-content"><a href="/party-pictures/2017/light-years-ahead">Light-years Ahead</a></span> </span>
 <span class="views-field views-field-created"> <span class="field-content">Tuesday, November 28, 2017</span> </span> </div>]

In [35]:
# function for extracting 
# name of the event, link text and date of the event
# built to go through a specified list element in the links list above

def get_link_date(el):

    el_title = links[el].select('a')[0]
    url = el_title['href']
    date = links[el].select('span')[3]
    
    return el_title.text, url, datetime.strptime(str(date.text), '%A, %B %d, %Y')

In [36]:
# passing the first element of links list to get_link_date function.
# returns the party event name, link, 
# and a datetime object of when the event occured

get_link_date(0)

('Light-years Ahead',
 '/party-pictures/2017/light-years-ahead',
 datetime.datetime(2017, 11, 28, 0, 0))

### Building a Function that extracts from all the pages on the event list

In [42]:
# event_info variable will hold events information
event_info = []

def get_links(page):

    params = {"page":page}
    response = requests.get('http://www.newyorksocialdiary.com/party-pictures', params= params)
    soup = BeautifulSoup(response.text, "lxml")
    links = soup.find_all('div', attrs={'class':'views-row'})
    
    for i in range(len(links)):
        title = links[i].select('a')[0]
        url = title['href']
        date = links[i].select('span')[3]
        y = datetime.strptime(str(date.text), '%A, %B %d, %Y')
        x = (title.text, url, y) 
        event_info.append(x)
    return event_info


In [43]:
# recreated empty event_info variable
event_info = []

# noticed that there were 30 pages of events on the event pages list.
# so will need to go from page 0 to 30 in response requests object

for i in range(30):
    get_links(i)


In [48]:
print("Number of events collected: ",len(event_info))

# first couple events and their info tuples.
print(event_info[:5])

Number of events collected:  1495
[('Light-years Ahead', '/party-pictures/2017/light-years-ahead', datetime.datetime(2017, 11, 28, 0, 0)), ('The Spirit of Collaboration', '/party-pictures/2017/the-spirit-of-collaboration', datetime.datetime(2017, 11, 22, 0, 0)), ('Blerancourt awards', '/party-pictures/2017/blerancourt-awards', datetime.datetime(2017, 11, 21, 0, 0)), ('Something Personal', '/party-pictures/2017/something-personal', datetime.datetime(2017, 11, 20, 0, 0)), ('Game changers', '/party-pictures/2017/game-changers', datetime.datetime(2017, 11, 17, 0, 0))]


In [75]:
# creating a cutoff point (date filter)
# only want parties from before 12/1/14

filtered_event_info = [x for x in event_info if x[2] <= datetime(2014, 12, 1, 0, 0)]

In [80]:
print("Number of events after the date filter: ",len(filtered_event_info))

Number of events after the date filter:  1193


### Understanding which months had the most events

In [82]:
# creating a pandas dataframe out of the list of tuples (filtered_event_info).
df1 = pd.DataFrame(filtered_event_info, columns=['event','url', 'date'])

# adjusting the dates to be "Month-Year"
df1['date'] = df1['date'].dt.strftime('%b-%Y')

# counting the number of occurrences of each "Month-Year" combination
# essentially a count of all the events in each month.
df2 = df1['date'].value_counts()


datetime64[ns]


In [91]:
# what the first elements of df1 looks like.
df1.head()

Unnamed: 0,event,url,date
0,The Thanksgiving Day Parade from the ground up...,/party-pictures/2014/the-thanksgiving-day-para...,Dec-2014
1,Gala Guests,/party-pictures/2014/gala-guests,Nov-2014
2,Equal Justice,/party-pictures/2014/equal-justice,Nov-2014
3,Celebrating the Treasures,/party-pictures/2014/celebrating-the-treasures,Nov-2014
4,Associates and Friends,/party-pictures/2014/associates-and-friends,Nov-2014


In [93]:
# what the first elements of df2 looks like.
df2.head()

May-2007    20
Jun-2010    19
Oct-2008    19
Jul-2010    18
Jul-2009    18
Name: date, dtype: int64

In [119]:
# conversion of df2 into a list of tuples
popular_months = list(zip(list(df2.index),list(df2)))

In [121]:
# top 10 most popular months for events
popular_months[:10]

[('May-2007', 20),
 ('Jun-2010', 19),
 ('Oct-2008', 19),
 ('Jul-2010', 18),
 ('Jul-2009', 18),
 ('Aug-2010', 18),
 ('May-2010', 18),
 ('Mar-2007', 18),
 ('Mar-2010', 17),
 ('May-2008', 17)]

### Extracting captions from one event page

In [122]:
# testing code structure on a single event page before trying to extract from all the events

# test "Celebrating the neighborhood" event url

# using requests/bs4 to extract html code from web page.
url_html = requests.get('http://www.newyorksocialdiary.com/party-pictures/2015/celebrating-the-neighborhood')
soup1 = BeautifulSoup(url_html.text, "lxml")

# after inspection of html, noticed names of individuals in photos 
# are contained in <div class='photocaption'> tags for this particular page
captions = soup1.find_all('div', attrs={'class':'photocaption'})

In [124]:
# first couple captions on the page
captions[:3]

[<div align="center" class="photocaption">Glenn Adamson, Simon Doonan, Victoire de Castellane, Craig Leavitt, Jerome Chazen, Andi Potamkin, Ralph Pucci, Kirsten Bailey, Edwin Hathaway, and Dennis Freedman at the Museum of Art and Design's annual MAD BALL. </div>,
 <div align="center" class="photocaption"> Randy Takian </div>,
 <div align="center" class="photocaption"> Kamie Lightburn and Christopher Spitzmiller </div>]

### Extracting all the captions from ALL the event pages

In [134]:
# isolating the urls from filtered_event_info list of tuples
# unzipping the tuples into component list of lists
x = list(zip(*filtered_event_info))

# list of urls is contained in second list, x[1]
event_urls = x[1]
print("Total number of event urls is ",len(event_urls))

print(event_urls[:3])

Total number of event urls is  1193
('/party-pictures/2014/the-thanksgiving-day-parade-from-the-ground-up', '/party-pictures/2014/gala-guests', '/party-pictures/2014/equal-justice')


In [139]:
# using urlparse to combine the partial urls above to a make a full url
full_path_test = urlparse.urljoin('http://www.newyorksocialdiary.com/', event_urls[0])

print(full_path_test)

http://www.newyorksocialdiary.com/party-pictures/2014/the-thanksgiving-day-parade-from-the-ground-up


In [140]:
# building a function that can iterate over all the event urls 
# and combines everything from above

# empty list that will hold raw captions that are extracted
raw_captions = []

def get_captions(path):
    full_path = urlparse.urljoin('http://www.newyorksocialdiary.com/', path)
    url_html = requests.get(full_path) #requests full html page of requested page
    url_html.text #returns the html text
    soup1 = BeautifulSoup(url_html.text, "lxml") #turns the html text into a soup object
    
    #after much trial and error found the captions in older event pages found in different html tags
    x = soup1.find_all('div', attrs={'class':'photocaption'})
    y = soup1.find_all('font', attrs={'face':'Verdana, Arial, Helvetica, sans-serif'})
    z = soup1.find_all('td', attrs={'class':'photocaption'})
    messy_captions = x + y + z
    
    for i in range(len(messy_captions)): #loops through captions list and returns the text
        raw_captions.append(messy_captions[i].text)#inputs the text into raw_captions list
    return raw_captions 

In [141]:
# iterating over entire captions list and running get_captions function on each element.
# can take up to 10 minutes
raw_captions = []
for i in event_urls:
    get_captions(i)

In [143]:
print("The total number of captions captured: ",len(raw_captions))

The total number of captions captured:  131079


In [149]:
# there are some names and text in some of the captions
raw_captions[230:245]

[' Eleanor Noell ',
 ' Melissa Errico ',
 ' Outside the tent ',
 ' Under the tent',
 ' Jenny Price, Sharon Jacob, Stephanie Shuman, Gillian Miniter, and Deborah Roberts ',
 ' Cynthia Conway, Tom Kempner, Luann Blowers, and David Blowers ',
 ' Michael  and Jenny Price ',
 ' Bob  and Suzanne Cochran ',
 ' Patsy and Jeff Tarr ',
 ' Fred and Stephanie Shuman ',
 ' Gillian and Sylvester Miniter ',
 ' Iffie Okoronkwo Aitkenhead, Agenia Clark, Deborah Roberts, and Fiona Rudin ',
 ' Michael  and Lise Evans ',
 ' Fe  and Alessandro Fendi ',
 ' Jill Ross, Sharon Teles, and Eby McKay ']

In [150]:
# there are also lots of needless spaces
raw_captions[:5]

['\n\n\n\n', '', '\n\n\n\n', '', '\n\n\n\n']

### Cleaning the captions and extracting names out from them

Should have done so earlier but this is a good point to save what we have extracted in a pickle file

import ediblepickle

dill.dump(raw_captions, open('nysd-captions.pkd', 'w'))

raw_captions = dill.load(open('nysd-captions.pkd', 'r'))

In [158]:
# creating a dummy variable that i can work on
str_raw_captions = raw_captions

In [160]:
# some cleanup on the captions. removal of exess spaces and new lines and empty captions
for i in range(len(str_raw_captions)):
    str_raw_captions[i] = str_raw_captions[i].strip()
    str_raw_captions[i] = str_raw_captions[i].replace('\n', "") #replaces '\n' character
    str_raw_captions[i] = re.sub(r'\s+', ' ', str_raw_captions[i])

In [165]:
# removal of photographer captions
str_raw_captions = [caption for caption in str_raw_captions if not re.search(r'^Photographs by ',caption)]

In [170]:
# removal of empty elements
str_raw_captions = list(filter(None, str_raw_captions))

In [172]:
print('After some cleaning the list of captions now has ', len(str_raw_captions), 'elements')

After some cleaning the list of captions now has  108646 elements


This list of captions still contains honorifics such as Dr., Mr, Sir, etc


In [174]:
str_raw_captions[:10]

['The scene at IDEAL School & Academy’s 10th Annual Gala.',
 'Les Lieberman, Barri Lieberman, Isabel Kallman, Trish Iervolino, and Ron Iervolino',
 'Chuck Grodin',
 'Diana Rosario, Ali Sussman, Sarah Boll, Jen Zaleski, Alysse Brennan, and Lindsay Macbeth',
 'Kelly and Tom Murro',
 'Udo Spreitzenbarth',
 'Ron Iervolino, Trish Iervolino, Russ Middleton, and Lisa Middleton',
 'Barbara Loughlin, Dr. Gerald Loughlin, and Debbie Gelston',
 'Julianne Michelle',
 'Heather Robinson, Kiwan Nichols, Jimmy Nichols, Melanie Carbone, and Nancy Brown']

In [173]:
# honorifics cleaning, list of titles was created after making observations

# empty list where cleaned captions with no honorifics will be kept
nohonorifics_cleancaptions = []

def junk_title_cleaner(unclean_caption):
    h1=['Mr. ','Guest','U.S. Representative ',' M.D.', ' M.D.,','PhD','Ph.D.',' Jr.',' Sr.','Mrs. ','Miss ','Doctor ','Dr. ','Dr ','Chair ','CEO ','the Honorable ','Mayor ','Prince ','Baroness ', 'Princess ', 'Honorees ', 'Honoree',' MD']
    h3=['De', 'Highness ','Museum President ','Chief Curator ','Frick Director ','left ','right ','honoree ','de ','host ','dressed ','Police Commissioner ','Music Director ','Frick Trustee ','Historic Hudson Valley Trustee ', 'Museum President ','Public Theater Artistic Director ','Public Theater Executive Director ','Executive Director ','Cooper Union President ','The Hon. ','Dancing Chair ','Director Emerita ']
    h2=['Hon. ','Lord ','Senator ','Deputy ','Director ','Dean ','Actor ','Actress ',' Esq.', 'Gov ','Governor ','Father ','Congresswoman ','Congressman ', 'Countess ','Awardee ','Chairman ','Commissioner ','Lady ','Ambassador ','President ','CEO ', ' von', ' van']
    hwords=h1 + h2 + h3
    honorifics = '|'.join(list(set(hwords))) ###Mr. and Mrs. couples have been ignored. this maybe problematic later on.
    x = unclean_caption.replace(", and "," and ") #Replaces the ',/sand' with '/sand/s'
    x = re.sub(honorifics, '', x)  #Replaces honorifics with empty spaces
    #if x[-1] == ",":
    #    x = list(x)
    #    x[-1] =''
    #    x = ''.join(x)
        
    x = x.replace(", and "," and ")
    notitles_clean_captions = x #cleaned of all the junk titles and such
    nohonorifics_cleancaptions.append(notitles_clean_captions)
    return nohonorifics_cleancaptions

In [177]:
nohonorifics_cleancaptions = []

# iterates through every caption and runs through the cleaner
for i in str_raw_captions:
    junk_title_cleaner(i)
 

In [179]:
nohonorifics_cleancaptions[:10]

['The scene at IDEAL School & Academy’s 10th Annual Gala.',
 'Les Lieberman, Barri Lieberman, Isabel Kallman, Trish Iervolino and Ron Iervolino',
 'Chuck Grodin',
 'Diana Rosario, Ali Sussman, Sarah Boll, Jen Zaleski, Alysse Brennan and Lindsay Macbeth',
 'Kelly and Tom Murro',
 'Udo Spreitzenbarth',
 'Ron Iervolino, Trish Iervolino, Russ Middleton and Lisa Middleton',
 'Barbara Loughlin, Gerald Loughlin and bbie Gelston',
 'Julianne Michelle',
 'Heather Robinson, Kiwan Nichols, Jimmy Nichols, Melanie Carbone and Nancy Brown']

In [180]:
# fixing captions that have couples in them and have unnecessary "ands"
# ALSO identifies captions with married couples
captions_with_couples = []
captions_with_no_and = []

# regex identifies unnessary 'ands' ('....Melanie Carbone AND Nancy Brown')
notsame_family_and_removal_regex = r'[A-Za-z]+\s+[A-Za-z]+\s+and\s+[A-Za-z]+\s+[A-Za-z]'

def and_removal_noncouples(caption, notsame_family_and_removal_regex):
    if len(re.findall(notsame_family_and_removal_regex, caption.strip())) > 0:
        names = caption.split(' and ')
        if len(names) == 2:
            fixed_string = ', '.join([names[0], names[1]])
            captions_with_no_and.append(fixed_string)
    if len(re.findall(notsame_family_and_removal_regex, caption.strip())) == 0:
        names = caption
        captions_with_couples.append(names)
    return

In [182]:
captions_with_couples = []
captions_with_no_and = []

for i in nohonorifics_cleancaptions:
    and_removal_noncouples(i, notsame_family_and_removal_regex)

In [183]:
# regex identifies captions with couples in them
same_family_regex = r'^[A-Za-z]+\s+and\s+[A-Za-z]+\s+[A-Za-z]+'
testcleancaptions_nocouples =[]

# Takes a caption that contains a couple (Mike and Sophie Riddle) 
# turns to (Mike Riddle and Sophie Riddle)
def same_family_couples(caption, same_family_regex):
    fixed_caption = caption
    divided_by_commas = caption.split(',')
    for string in divided_by_commas:
        if len(re.findall(same_family_regex, string.strip())) >0:
            names = string.split(' and ')
            if len(names) == 2:
                first_name = names[0]
                second_full_name = names[1]
                common_last_name = second_full_name.split()[1]
                fixed_string = ' '.join([first_name, common_last_name, ',', second_full_name])
                fixed_caption = fixed_caption.replace(string, fixed_string)
                testcleancaptions_nocouples.append(fixed_caption)
    return testcleancaptions_nocouples

In [185]:
for i in captions_with_couples:
    same_family_couples(i, same_family_regex)

#combining the final list of captions 
testcleancaptions_nocouples
final_clean_list = captions_with_no_and +testcleancaptions_nocouples


In [187]:
final_clean_list[:20]

['Les Lieberman, Barri Lieberman, Isabel Kallman, Trish Iervolino, Ron Iervolino',
 'Diana Rosario, Ali Sussman, Sarah Boll, Jen Zaleski, Alysse Brennan, Lindsay Macbeth',
 'Ron Iervolino, Trish Iervolino, Russ Middleton, Lisa Middleton',
 'Barbara Loughlin, Gerald Loughlin, bbie Gelston',
 'Heather Robinson, Kiwan Nichols, Jimmy Nichols, Melanie Carbone, Nancy Brown',
 'Bill Mack, Les Lieberman',
 'David Lyden, Patricia Sorenson',
 'Jimmy Cayne, Vince Tese, Pat Cayne',
 'Stuart Oran, Les Lieberman, Hilary Oran',
 'Vince Tese, Chuck Grodin',
 'Dwight Gooden, Les Lieberman',
 'Amy Cunningham-Bussel, Ray Mirra, Tyler Janovitz',
 'Dan Shedrick, Samara Heafitz',
 'Bart Scott, Mark Laplander',
 'Mitch Rubin, Audra Zuckerman, Michelle Smith, Kenneth Mehlman, Julia Harquail, John Hackett',
 'Audra Zuckerman, Michelle Smith, Julia Harquail',
 'Judy Poller, Rob Affuso',
 'David Byrnes, Concetta Bencivenga',
 'Henry Kravis, Kenneth Mehlman, Mitch Rubin',
 'Anne Jameson, Jay Eisenhofer']

In [190]:
print('The final length of my list of captions is ',len(final_clean_list))

The final length of my list of captions is  83915


dill.dump(final_clean_list, open('final_clean_presort.pkd', 'w'))

### Name extraction from the captions using NLP

In [222]:
only_names_captions = []
####EXTRACTS only the names from each caption.

# good reference for spacy usage: 
# http://textminingonline.com/getting-started-with-spacy


def spacy_name_extraction(final_clean_list):
    #iterates through each caption
    for i in final_clean_list:
        #tokenize each caption
        doc1 = nlp(i)
        
        #empty temp list where identified names will be inserted
        names = []
        
        #iterates through each token in a caption and identifies the which tokens are names
        for ent in doc1.ents:
            # 378 is token label for names
            if ent.label == 378:
                x = ent.text.strip()
                names.append(x)
        
        #gathers final names into an overall list of names        
        only_names_captions.append(names)
                
    return only_names_captions 

In [223]:
only_names_captions = []

# iterates through entire captions list and returns a list of names in each caption
# can take up to 10 minutes to iterate over all 80k captions.

spacy_name_extraction(final_clean_list)

[['Les Lieberman',
  'Barri Lieberman',
  'Isabel Kallman',
  'Trish Iervolino',
  'Ron Iervolino'],
 ['Diana Rosario',
  'Ali Sussman',
  'Sarah Boll',
  'Jen Zaleski',
  'Alysse Brennan',
  'Lindsay Macbeth'],
 ['Ron Iervolino', 'Trish Iervolino', 'Russ Middleton', 'Lisa Middleton'],
 ['Barbara Loughlin', 'Gerald Loughlin', 'Gelston'],
 ['Heather Robinson',
  'Kiwan Nichols',
  'Jimmy Nichols',
  'Melanie Carbone',
  'Nancy Brown'],
 ['Bill Mack', 'Les Lieberman'],
 ['David Lyden', 'Patricia Sorenson'],
 ['Jimmy Cayne', 'Vince Tese', 'Pat Cayne'],
 ['Stuart Oran', 'Les Lieberman', 'Hilary Oran'],
 ['Vince Tese', 'Chuck Grodin'],
 ['Dwight Gooden', 'Les Lieberman'],
 ['Amy Cunningham-Bussel', 'Ray Mirra', 'Tyler Janovitz'],
 ['Dan Shedrick', 'Samara Heafitz'],
 ['Bart Scott', 'Mark Laplander'],
 ['Mitch Rubin',
  'Audra Zuckerman',
  'Michelle Smith',
  'Kenneth Mehlman',
  'John Hackett'],
 ['Audra Zuckerman', 'Michelle Smith', 'Julia Harquail'],
 ['Judy Poller', 'Rob Affuso'],
 ['Da

The end result is just a list of lists. Each component list contains only the names that appear in every photocaption of photos from every event before 12/1/14.

In [225]:
names = only_names_captions

print(len(names))

83915
