# GeoInt Conference  Analysis
The purpose of this notebook is to explore the GeoInt 2024 website and to perform some basic analytics.

The not book consists of several components:
* Web scrapping
* Data prep
* Data Visualization

## Web Scrapping
<p>Web scraping is a technique of extracting data from websites by simulating the behavior of a web browser. Web scraping can be useful for various purposes, such as market research, content analysis, or data journalism. However, web scraping also raises ethical and legal issues, especially regarding the privacy and copyright of the website owners and users. Therefore, it is important to follow the robots.txt file, which is a standard protocol that specifies which parts of a website can or cannot be scraped by automated agents.</p>

<p>The robot file for GeoInt can be found here: <a href='https://geoint24.mapyourshow.com/robots.txt'>robot.txt</a></p>

<p>Prior to GenAI web scraping required significantly more skill, time and external packages; however, now using language models you can either extract the data through prompting or have a language model write complex regex statements for you in seconds. The second method can have its advantages if you will be iterating over hundreds or thousands of pages sharing a standard format.</p>
<p>
Note: all scraping functionality in this notebook was create in whole or in part by various LLMs</p>.


### Imports and Helper Functions

In [12]:
import sys
import requests
import re
import pandas as pd
import os

def fetch_text(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.text    

# extract the session title from the text using regex
def extract_session_title(text):
    pattern = r'<title>(.*?)<\/title>'
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return None
    
def extract_speaker_name(html_content):
    # Regex pattern to find content within <h1> tags with specific classes
    pattern = r'<h1 class="[^"]*">\s*([^\n\r<]+)\s*</h1>'
    # Search for the pattern in the HTML content
    match = re.search(pattern, html_content)
    # Return the matched group if found, otherwise return None
    if match:
        return match.group(1)
    return None

def extract_job_title(html_content):
    # Regex pattern to find content within <div> tags with specific classes that likely contain job titles
    pattern = r'<div class="muted\s+mb3\s+f2\s+lh-title">\s*([^<]+)\s*</div>'
    # Search for the pattern in the HTML content
    match = re.search(pattern, html_content)
    # Return the matched group if found, otherwise return None
    if match:
        speaker_title = match.group(1)
        speaker_title = re.sub(r'\s+', ' ', speaker_title)
        
        
        return speaker_title
    return None

def extract_bio(html_content):
    # Regex pattern to find content within <p> tags with the class "o-DynamicContent text f5 lh-copy mb0"
    pattern = r'<p class="o-DynamicContent\s+text\s+f5\s+lh-copy\s+mb0">\s*([\s\S]*?)\s*</p>'
    # Search for the pattern in the HTML content
    match = re.search(pattern, html_content)
    # Return the matched group if found, otherwise return None
    if match:
        speaker_bio = match.group(1).strip()

        # remove newline characters
        speaker_bio = re.sub(r'\s+', ' ', speaker_bio)


        return speaker_bio
    return "Bio not found"


def extract_description(html_content):
    # Regex pattern to find content within the 'content' attribute of a <meta> tag with the name 'description'
    pattern = r'<meta name="description" content="([^"]+)"\s*/?>'
    # Search for the pattern in the HTML content
    match = re.search(pattern, html_content)
    # Return the matched group if found, otherwise return None
    if match:
        return match.group(1)
    return "Session description not found"


def extract_session_type(html_text):
    # Regex pattern to find the session type following the specific structure
    pattern = r'<span class="break-word\s+lh-list\s+muted\s+pr2\s+b">Type:</span>\s*<span class="break-word\s+lh-list">\s*([^<]+)\s*</span>'
    match = re.search(pattern, html_text)
    if match:
        return match.group(1).strip()  # Return the captured session type, stripping any extra whitespace
    return None



### Setup and Configuration

In [13]:
# Define the base urls
session_base_url = 'https://geoint24.mapyourshow.com/8_0/sessions/session-details.cfm?scheduleid='
speaker_base_url = 'https://geoint24.mapyourshow.com/8_0/sessions/speaker-details.cfm?speakerid='

# Define the delay if needed and wait time (Following robot.txt)
crawl_delay = 0

# Set the maximum allowed failed attempts
max_failed_attempts = 10
max_attempts = 1000

# Set the output path
output_base_path = '../data/geoint/'

# Check if a directory exists, if it does not exist, create it

if not os.path.exists(output_base_path):
    os.makedirs(output_base_path)

### Scrub Speaker Info

In [3]:
speaker_names = []
speaker_titles = []
speaker_bios = []
speaker_urls = []

i = 1  # Start with the first speaker
failed_attempts = 0
while failed_attempts < max_failed_attempts:
    url = speaker_base_url + str(i)

    text = fetch_text(url)
    
    # Extract details
    name = extract_speaker_name(text)
    title = extract_job_title(text)
    bio = extract_bio(text)

    # Check for failures in extraction
    if not title:
        failed_attempts += 1
        print(f"Failed to extract data for speaker ID {i}. Total failures: {failed_attempts}")
    else:
        failed_attempts = 0
        
        speaker_names.append(name)
        speaker_titles.append(title)
        speaker_bios.append(bio)
        speaker_urls.append(url)

    # used for debugging 
    #if i % 10 == 0:
        #print(i)
    
    i += 1  # Move to the next speaker ID

    if crawl_delay > 0:
        time.sleep(crawl_delay)

    # Optionally, you can add a break condition if you expect only a certain number of speakers
    if i > max_attempts:
         break


Failed to extract data for speaker ID 6. Total failures: 1
Failed to extract data for speaker ID 29. Total failures: 1
Failed to extract data for speaker ID 34. Total failures: 1
Failed to extract data for speaker ID 47. Total failures: 1
Failed to extract data for speaker ID 50. Total failures: 1
Failed to extract data for speaker ID 51. Total failures: 2
Failed to extract data for speaker ID 54. Total failures: 1
Failed to extract data for speaker ID 56. Total failures: 1
Failed to extract data for speaker ID 59. Total failures: 1
Failed to extract data for speaker ID 61. Total failures: 1
Failed to extract data for speaker ID 62. Total failures: 2
Failed to extract data for speaker ID 63. Total failures: 3
Failed to extract data for speaker ID 64. Total failures: 4
Failed to extract data for speaker ID 68. Total failures: 1
Failed to extract data for speaker ID 70. Total failures: 1
Failed to extract data for speaker ID 73. Total failures: 1
Failed to extract data for speaker ID 75.

In [8]:
# Create a dataframes with the extracted data
df_speakers = pd.DataFrame({'Name': speaker_names, 'Title': speaker_titles, 'Bio': speaker_bios,'url': speaker_urls})

# Clear variables
#del speaker_names, speaker_titles, speaker_bios

df_speakers.head()

Unnamed: 0,Name,Title,Bio,url
0,Robert Cardillo,"Chair, USGIF Board of Directors at USGIF",Robert Cardillo is the president of The Cardil...,https://geoint24.mapyourshow.com/8_0/sessions/...
1,Ronda Schrenk,Chief Executive Officer at USGIF,Ronda Schrenk is the Chief Executive Officer f...,https://geoint24.mapyourshow.com/8_0/sessions/...
2,Christy Monaco,Vice President of Programs at USGIF,,https://geoint24.mapyourshow.com/8_0/sessions/...
3,Tara Mott,Account Manager at Esri,,https://geoint24.mapyourshow.com/8_0/sessions/...
4,Jeff Dawley,Director of Intelligence Programs at Esri,Jeff Dawley currently serves as the Director o...,https://geoint24.mapyourshow.com/8_0/sessions/...


### Scrap Session Content

In [5]:
session_titles = []
session_descriptions = []
session_types = []
session_urls = []

i = 1  # Start with the first speaker
failed_attempts = 0
while failed_attempts < max_failed_attempts:
    url = session_base_url + str(i)

    text = fetch_text(url)
    
    # Extract details
    title = extract_session_title(text)
    description = extract_description(text)
    type = extract_session_type(text)


    # Check for failures in extraction
    if not type:
        failed_attempts += 1
        print(f"Failed to extract data for session ID {i}. Total failures: {failed_attempts}")
    else:
        failed_attempts = 0
        
        session_titles.append(title)
        session_descriptions.append(description)
        session_types.append(type)
        session_urls.append(url)

    # used for debugging 
    #if i % 10 == 0:
        #print(i)
        
    i += 1  # Move to the next speaker ID

    if crawl_delay > 0:
        time.sleep(crawl_delay)

    # Optionally, you can add a break condition if you expect only a certain number of speakers
    if i > max_attempts:
         break

Failed to extract data for session ID 4. Total failures: 1
Failed to extract data for session ID 5. Total failures: 2
10
Failed to extract data for session ID 16. Total failures: 1
Failed to extract data for session ID 17. Total failures: 2
Failed to extract data for session ID 18. Total failures: 3
20
Failed to extract data for session ID 21. Total failures: 1
Failed to extract data for session ID 22. Total failures: 2
Failed to extract data for session ID 23. Total failures: 3
Failed to extract data for session ID 26. Total failures: 1
30
40
50
Failed to extract data for session ID 59. Total failures: 1
60
70
Failed to extract data for session ID 79. Total failures: 1
80
Failed to extract data for session ID 90. Total failures: 1
90
Failed to extract data for session ID 92. Total failures: 1
100
110
Failed to extract data for session ID 117. Total failures: 1
120
Failed to extract data for session ID 128. Total failures: 1
Failed to extract data for session ID 130. Total failures: 1


In [9]:
# Create a dataframes with the extracted data
session_df = pd.DataFrame({'Title': session_titles, 'Description': session_descriptions, 'Type': session_types,'url': session_urls})
session_df.head()

Unnamed: 0,Title,Description,Type,url
0,GEOINT Foreword Welcome and Opening Remarks - ...,Join us as we kick off GEOINT Foreword with US...,Keynote Presentation,https://geoint24.mapyourshow.com/8_0/sessions/...
1,"Keynote&#x3a; Dare King, Chief Operating Offic...",Session description not found,Keynote Presentation,https://geoint24.mapyourshow.com/8_0/sessions/...
2,Panel&#x3a; Quantum Leap in Geospatial Intelli...,We’ll begin the day’s exploration of “Domain D...,Panel Discussion,https://geoint24.mapyourshow.com/8_0/sessions/...
3,Lunch &amp; Student Poster Session - GEOINT 20...,Session description not found,Networking Events,https://geoint24.mapyourshow.com/8_0/sessions/...
4,Panel&#x3a; PNT Beyond GPS - GEOINT 2024 Sympo...,The advent of the Global Positioning System ha...,Panel Discussion,https://geoint24.mapyourshow.com/8_0/sessions/...


In [16]:
# Save the dataframes
session_df.to_csv(output_base_path+'sessions.csv', index=False)
df_speakers.to_csv(output_base_path+'speakers.csv', index=False)

## Analysis and Visualization

### Imports and Helper Functions


In [18]:
!pip install wordcloud

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting wordcloud
  Obtaining dependency information for wordcloud from https://files.pythonhosted.org/packages/90/be/1a7a488f5edcfae6746ffb91e792a1795b6cc058364ea6888b3878d3476f/wordcloud-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading wordcloud-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading wordcloud-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.1/511.1 kB[0m [31m557.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: wordcloud
Successfully installed wordcloud-1.9.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34

In [20]:
from wordcloud import WordCloud

# Create a list of word from the list of descriptions
def extract_words(strings):
    words = []
    for string in strings:
        words.extend(string.split())

        # Remove non-alphabetic characters
        words = [re.sub(r'\W+', '', word) for word in words]

        # Remove empty strings
        words = [word for word in words if word]

        # Convert all words to lowercase
        words = [word.lower() for word in words]

    return words

def visualize_word_cloud(words):
    # Join all the strings in the list to a single string
    text = ' '.join(words)
    
    # Create a word cloud object
    wordcloud = WordCloud(width=1200, height=800, background_color='white').generate(text)
    
    # Display the word cloud using matplotlib
    plt.figure(figsize=(30, 15))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # Hide the axes
    plt.show()

### Pre-Processing

In [22]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

words = extract_words(session_df.Description)

removal_words = ['session','sessions','description','geoint','will','show','talk','presentation','attendees','attendee','attend','event',
                 'discuss','discussions','discussing','discussed','discusses','discussant','discussants','discussing','discus','discussed'
                 'lightning']

# Remove the words in the removal_words list from the words list
words = [word for word in words if word not in removal_words]

# Example usage:
visualize_word_cloud(words)

ImportError: The _imagingft C module is not installed