In [1]:
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession
import requests
import re
import unicodedata
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import json
import pprint
import pickle
import pandas as pd


import asyncio
if asyncio.get_event_loop().is_running(): # Only patch if needed (i.e. running in Notebook, Spyder, etc)
    import nest_asyncio
    nest_asyncio.apply()

# SoftwareTalks

In [2]:
url = 'https://www.softwaretalks.io/'
page = unicodedata.normalize('NFKD', requests.get(url).text)
doc = BeautifulSoup(page, 'html.parser')

In [3]:
conferences = doc.find_all(class_='conference-item')

In [5]:
software_talks_conferences = {}
for conference in conferences:
    try:
        a_tags = conference.find_all('a')
        conf_title = a_tags[0].text
        conf_url = a_tags[1].text.strip()
        if conf_url[-1] != '/':
            conf_url = f'{conf_url}/'
        if not 'www' in conf_url:
            conf_url = f'{conf_url.split("//")[0]}//www.{conf_url.split("//")[1]}'
    except IndexError:
        continue
    conf_info = conference.p.text.split('.', 1)
    date_info = conf_info[0].split(' ')
    start_date = datetime.strptime(date_info[1], '%d/%m/%Y')
    end_date = datetime.strptime(date_info[3], '%d/%m/%Y')
    
    location_info = conf_info[1].strip()
    online_text = 'Online'
    is_online =  True if online_text in location_info else None
    if is_online:
        location = location_info.split(online_text)[0]
    else:
        location = location_info
    
    city = None
    state = None
    country = None
    if location != '':
        location_details = location.split(',')
        city = location_details[0].strip()
        if len(location_details[1].strip()) == 2:
            state = location_details[1].strip()
        else:
            country = location_details[1].strip()
    if city == '':
        city = None
    if state == '':
        state = None
    if country == '':
        country = None
    if country == 'Czechia':
        country = 'Czech Republic'
    if country != 'United States': 
        continue
        
    conf_topics = []
        
    software_talks_conferences[f'{conf_title} -- {str(start_date).split(" ")[0]}'] = {
        'title': conf_title.strip(),
        'url': conf_url.strip(),
        'start': start_date,
        'end': end_date,
        'online': is_online,
        'topics': conf_topics,
        'country': country.strip() if country else country,
        'city': city.strip() if city else city,
        'state': state.strip() if state else state,
        'source': ['SoftwareTalks'],
    }

# Confs Tech

In [6]:
url = 'https://confs.tech/#'
asession = AsyncHTMLSession()
response = await asession.get(url)
await response.html.arender()
html = unicodedata.normalize('NFKD', response.html.html)
doc = BeautifulSoup(html, 'html.parser')

In [7]:
conferences = doc.find_all('li', class_='ConferenceItem_ConferenceItem__Hnn7O')
len(conferences)

313

In [8]:
confs_tech_conferences = {}
for conference in conferences:
    conf_title = unicodedata.normalize('NFKD', conference.a.text)
    conf_url = unicodedata.normalize('NFKD', conference.a.get('href'))
    if conf_url[-1] != '/':
        conf_url = f'{conf_url}/'
    if not 'www' in conf_url:
        conf_url = f'{conf_url.split("//")[0]}//www.{conf_url.split("//")[1]}'
    
    conf_info = conference.p.text.split('・')
    location_info = unicodedata.normalize('NFKD', conf_info[0])
    is_online = None
    if 'Online' in location_info:
        is_online = True
        location = location_info.split('Online')[0]
        if '&' in location:
            location = location.split('&')[0].strip()
    else:
        location = location_info
    city_country = location.split(',')
    city = None
    state = None
    country = None
    if len(city_country) == 2:
        city = city_country[0].strip()
        country = city_country[1].strip()
        state = None
    elif len(city_country) == 3:
        city = city_country[0].strip()
        state = city_country[1].strip()
        country = city_country[2].strip()
    if country == 'U.S.A.':
        country = 'United States'
    if country == 'U.K.':
        country = 'United Kingdom'
    
    if country != 'United States': 
        continue
        
    date_info = unicodedata.normalize('NFKD', conf_info[1])
    
    topics = conference.find_all('li', {'class': 'ConferenceItem_topic__3xZ8s'})
    conf_topics = []
    for topic in topics:
        conf_topics.append(unicodedata.normalize('NFKD', topic.text.split('#')[1]).lower())
    
    if '-' in date_info:
        start_date = datetime.strptime(f'{date_info.split("-")[0]} {datetime.now().year}', '%B %d %Y')
        end_date = date_info.split('-')[1].strip()
        regexp = re.compile(r'[a-zA-Z]+')
        if not regexp.search(end_date):
            month = regexp.search(date_info).group(0)
            end_date = f'{month} {end_date}'
        end_date = datetime.strptime(f'{end_date} {datetime.now().year}', '%B %d %Y')
    else:
        start_date = datetime.strptime(f'{date_info} {datetime.now().year}', '%B %d %Y')
        end_date = datetime.strptime(f'{date_info} {datetime.now().year}', '%B %d %Y')
    
    confs_tech_conferences[f'{conf_title} -- {str(start_date).split(" ")[0]}'] = {
        'title': conf_title.strip(),
        'url': conf_url.strip(),
        'start': start_date,
        'end': end_date,
        'online': is_online,
        'topics': conf_topics,
        'country': country.strip() if country else country,
        'city': city.strip() if city else city,
        'state': state.strip() if state else state,
        'source': ['Confs.Tech'],
    }

# Dev Events America

In [9]:
url = 'https://dev.events/AM'
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.get(url)
for _ in range(3):
    try:
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//a[contains(text(), "Show more")]')))
        print('Finished waiting')
        break
    except TimeoutException:
        print('Wait timeout exceeded')
    print('Trying again')
for i in range(100):
    try:
        driver.find_element(By.XPATH, '//a[contains(text(), "Show more")]').click()
        print('Clicked show more')
    except NoSuchElementException as e:
        print('Have entire page')
        break
    if i == 99:
        print('Tried 100 times')

  options.headless = True


Finished waiting
Clicked show more
Clicked show more
Clicked show more
Clicked show more
Have entire page


In [10]:
html = unicodedata.normalize('NFKD', driver.page_source)
doc = BeautifulSoup(html, 'html.parser')
conferences = doc.find_all('div', class_='event')
len(conferences)

126

In [11]:
dev_events_conferences = {}
for conference in conferences:
    if 'featured' in conference.get('class'):
        continue
    data = json.loads(conference.find('script', type='application/ld+json').text)
    conf_title = data['name']
    try:
        conf_url = data['organizer']['url']
    except:
        conf_url = data['offers'][0]['url']
    if conf_url[-1] != '/':
        conf_url = f'{conf_url}/'
    if not 'www' in conf_url:
        conf_url = f'{conf_url.split("//")[0]}//www.{conf_url.split("//")[1]}'
    start_date = datetime.strptime(data['startDate'].split('T')[0], '%Y-%m-%d')
    end_date = datetime.strptime(data['endDate'].split('T')[0], '%Y-%m-%d')
    location_string = unicodedata.normalize('NFKD', conference.h3.span.text.split('in')[1])
    city = data['location']['address']['addressLocality']
    country = data['location']['address']['addressRegion']
    if len(location_string.split(',')) > 1:
        state = location_string.split(',')[1].strip()
    else:
        state = None
    is_online = None
    
    conf_topics = [val.strip().lower() for val in unicodedata.normalize('NFKD', conference.h3.a.text).split('/')]
    
    if country != 'United States': 
        continue
    
    dev_events_conferences[f'{conf_title} -- {str(start_date).split(" ")[0]}'] = {
        'title': conf_title.strip(),
        'url': conf_url.strip(),
        'start': start_date,
        'end': end_date,
        'online': is_online,
        'topics': conf_topics,
        'country': country.strip() if country else country,
        'city': city.strip() if city else city,
        'state': state.strip() if state else state,
        'source': ['Dev.Events'],
    }

In [12]:
st_keys = list(software_talks_conferences.keys())
ct_keys = list(confs_tech_conferences.keys())
de_keys = list(dev_events_conferences.keys())

In [13]:
all_keys = []
all_keys.extend(st_keys)
all_keys.extend(ct_keys)
all_keys.extend(de_keys)
unique_keys = set(all_keys)

In [15]:
all_conferences = []
all_urls = []
for key in unique_keys:
    conference_info = None
    if key in st_keys:
        conference_info = software_talks_conferences[key]
    if key in ct_keys:
        conf =  confs_tech_conferences[key]
        if conference_info == None:
            conference_info = conf
        else:
            for k in conf:
                if conf[k] != conference_info[k]:
                    if (conference_info[k] == None or conference_info[k] == 'UNKNOWN') and conf[k] != None:
                        conference_info[k] = conf[k]
                    elif conference_info[k] != None and conf[k] == None:
                        continue
                    elif k == 'topics':
                        conference_info[k].extend(conf[k])
                        conference_info[k] = list(set(conference_info[k]))
                    elif k == 'url':
                        if conf[k] in conference_info[k]:
                            conference_info[k] = conf[k]
                        elif conference_info[k] in conf[k]:
                            continue
                    elif k == 'source':
                        conference_info[k].extend(conf[k])
                        conference_info[k] = list(set(conference_info[k]))
                    else:
                        print(f'Review: "{key}"\nDifferences for key: "{k}"\n"{conference_info[k]}" vs "{conf[k]}"\n')
    if key in de_keys:
        conf =  dev_events_conferences[key]
        if conference_info == None:
            conference_info = conf
        else:
            for k in conf:
                if conf[k] != conference_info[k]:
                    if (conference_info[k] == None or conference_info[k] == 'UNKNOWN') and conf[k] != None :
                        conference_info[k] = conf[k]
                    elif conference_info[k] != None and conf[k] == None:
                        continue
                    elif k == 'topics':
                        conference_info[k].extend(conf[k])
                        conference_info[k] = list(set(conference_info[k]))
                    elif k == 'url':
                        if conf[k] in conference_info[k]:
                            conference_info[k] = conf[k]
                        elif conference_info[k] in conf[k]:
                            continue
                    elif k == 'source':
                        conference_info[k].extend(conf[k])
                        conference_info[k] = list(set(conference_info[k]))
                    else:
                        print(f'Review: "{key}"\nDifferences for key: "{k}"\n"{conference_info[k]}" vs "{conf[k]}"\n')
    if not conference_info['url'] in all_urls:
        all_conferences.append(conference_info)
    else:
        print(conference_info['url'])
        similar_conference = next((x for x in all_conferences if x['url'] == conference_info['url']))
        if len(conference_info['title']) > len(similar_conference['title']):
            similar_conference['title'] = conference_info['title']
        similar_conference['topics'].extend(conference_info['topics'])
        similar_conference['topics'] = list(set(similar_conference['topics']))
        similar_conference['source'].extend(conference_info['source'])
        similar_conference['source'] = list(set(similar_conference['source']))
        if (similar_conference['city'] != conference_info['city']):
            print(f'Review: "{similar_conference["title"]}"\n"City" has 2 options\n"{similar_conference["city"]}" vs "{conference_info["city"]}"\n')
    all_urls.append(conference_info['url'])

                        
            

Review: "Devopsdays Denver -- 2023-04-23"
Differences for key: "end"
"2023-04-24 00:00:00" vs "2023-04-25 00:00:00"

https://www.usenix.org/conference/srecon23americas/
https://www.datascience.salon/miami/
https://www.reactmiami.com/
https://www.kcdc.info/
https://www.libertyjs.com/
https://www.qconnewyork.com/
Review: "Qcon New York"
"City" has 2 options
"New York" vs "Brooklyn"

https://www.vueconf.us/
https://www.kcdc.info/
https://www.uberconf.com/
https://www.hardwear.io/usa-2023/
https://www.2023.clojure-conj.org/
https://www.qconsf.com/
https://www.tek.phparch.com/
https://www.dataconnectconf.com/
https://www.smashingconf.com/sf-2023/
https://www.devnexus.com/
https://www.pydata.org/seattle2023/
https://www.laracon.us/
https://www.devopsdays.org/events/2023-washington-dc/welcome/
Review: "Devopsdays Washington, DC"
"City" has 2 options
"Washington" vs "Boise"

https://www.odsc.com/boston/


In [33]:
all_conferences

[{'title': 'SAFe Summit Nashville',
  'url': 'https://www.safesummit.com/',
  'start': datetime.datetime(2023, 8, 15, 0, 0),
  'end': datetime.datetime(2023, 8, 18, 0, 0),
  'online': None,
  'topics': ['agile'],
  'country': 'United States',
  'city': 'Nashville',
  'state': 'TN',
  'source': ['Dev.Events']},
 {'title': 'Lonestar Software Symposium : Austin',
  'url': 'https://www.nofluffjuststuff.com/austin/',
  'start': datetime.datetime(2023, 7, 14, 0, 0),
  'end': datetime.datetime(2023, 7, 15, 0, 0),
  'online': True,
  'topics': ['devops', 'java'],
  'country': 'United States',
  'city': 'Austin',
  'state': 'TX',
  'source': ['Confs.Tech']},
 {'title': 'Augmented Enterprise Summit',
  'url': 'https://www.augmentedenterprisesummit.com/',
  'start': datetime.datetime(2023, 10, 24, 0, 0),
  'end': datetime.datetime(2023, 10, 26, 0, 0),
  'online': None,
  'topics': ['ar', 'vr', 'xr'],
  'country': 'United States',
  'city': 'Houston',
  'state': 'TX',
  'source': ['Dev.Events']},


# Save Conferences as Pickle

In [24]:
with open('conferences.pickle', 'wb') as handle:
    pickle.dump(all_conferences, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Read Conference into dictionary

In [25]:
with open('conferences.pickle', 'rb') as handle:
    test = pickle.load(handle)

# Save Conferences as excel file

In [26]:
df = pd.DataFrame(all_conferences)
df = df.astype({'start': str, 'end': str})

# create an Excel writer object
writer = pd.ExcelWriter('conferences.xlsx')

# write the dataframe to the Excel sheet
df.to_excel(writer, index=False)

# save the Excel file
writer.save()

  writer.save()


In [27]:
df.head()

Unnamed: 0,title,url,start,end,online,topics,country,city,state,source
0,SAFe Summit Nashville,https://www.safesummit.com/,2023-08-15,2023-08-18,,[agile],United States,Nashville,TN,[Dev.Events]
1,Lonestar Software Symposium : Austin,https://www.nofluffjuststuff.com/austin/,2023-07-14,2023-07-15,True,"[devops, java]",United States,Austin,TX,[Confs.Tech]
2,Augmented Enterprise Summit,https://www.augmentedenterprisesummit.com/,2023-10-24,2023-10-26,,"[ar, vr, xr]",United States,Houston,TX,[Dev.Events]
3,UXDX USA 2023,https://www.uxdx.com/usa/2023/,2023-05-16,2023-05-18,,"[product, ux]",United States,New York,NY,[Dev.Events]
4,PAX East,https://www.east.paxsite.com/en-us.html/,2023-03-23,2023-03-26,,[game dev],United States,Boston,MA,[Dev.Events]


In [28]:
test = ['a', 'b', 'c']
d = [1, 2, 3]
test.extend(d)

In [29]:
test

['a', 'b', 'c', 1, 2, 3]