In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_date_me_directory():
    # Fetch the webpage
    url = 'https://dateme.directory/browse'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'id': 'docs'})
    
    if not table:
        print("Could not find the data table")
        return None

    # Prepare data storage
    data = []
    columns = ['Name', 'ProfileLink', 'Age', 'Gender', 'InterestedIn', 'Style', 
               'Location', 'LocationFlexibility', 'Contact', 'LastUpdated']

    # Extract table rows
    for row in table.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 9:
            continue

        # Extract name and link
        name_link = cells[0].find('a')
        name = name_link.text.strip() if name_link else cells[0].div.text.strip()
        profile_link = name_link['href'] if name_link else ''

        # Extract remaining data
        row_data = {
            'Name': name,
            'ProfileLink': profile_link,
            'Age': cells[1].div.text.strip(),
            'Gender': cells[2].div.text.strip(),
            'InterestedIn': ', '.join([span.text.strip() for span in cells[3].div.find_all('span')]) or cells[3].div.text.strip(),
            'Style': ', '.join([span.text.strip() for span in cells[4].div.find_all('span')]) or cells[4].div.text.strip(),
            'Location': ', '.join([span.text.strip() for span in cells[5].div.find_all('span')]) or cells[5].div.text.strip(),
            'LocationFlexibility': cells[6].div.text.strip(),
            'Contact': cells[7].div.text.strip(),
            'LastUpdated': cells[8].div.text.strip()
        }
        data.append(row_data)

    # Create DataFrame
    df = pd.DataFrame(data, columns=columns)
    return df

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urlparse

def extract_google_doc_text(url):
    """Extract text from a Google Docs URL"""
    try:
        # Convert to export URL
        doc_id = url.split('/d/')[1].split('/')[0]
        export_url = f'https://docs.google.com/document/d/{doc_id}/export?format=txt'
        
        response = requests.get(export_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }, timeout=10)
        
        if response.status_code == 200:
            return response.text
        return "Could not extract document content"
    
    except Exception as e:
        return f"Error extracting document: {str(e)}"

def extract_general_page_text(url):
    """Extract text from a general webpage"""
    try:
        response = requests.get(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }, timeout=10)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            return soup.get_text(separator='\n', strip=True)
        return "Could not extract page content"
    
    except Exception as e:
        return f"Error extracting page: {str(e)}"

def get_dating_data(df_row):
    link = df_row['ProfileLink']
    if not link:
        df_row['ProfileText'] = 'No link available'
        return df_row
    
    try:
        # Check if Google Doc
        if 'docs.google.com/document/' in link:
            text = extract_google_doc_text(link)
        else:
            text = extract_general_page_text(link)
        
        df_row['ProfileText'] = text
        time.sleep(1)  # Be polite with requests
        
    except Exception as e:
        df_row['ProfileText'] = f"Error processing link: {str(e)}"
    
    return df_row

# Modify your main block to use this:
if __name__ == "__main__":
    df = scrape_date_me_directory()
    if df is not None:
        print("Processing profile links...")
        # Process first 5 rows as example (remove [0:5] to process all)
        df = df.apply(get_dating_data, axis=1)
        
        print("\nSample results:")
        print(df[['Name', 'ProfileLink', 'ProfileText']].head())
        
        # To save full results:
        # df.to_csv('date_me_directory_with_text.csv', index=False)
    else:
        print("Failed to scrape data")

Processing profile links...

Sample results:
         Name                                        ProfileLink  \
0       Gabin  https://docs.google.com/document/d/1ymul6X3DQH...   
1   Malvin G.  https://docs.google.com/document/d/1zuWjBGbGVz...   
2      Callum  https://docs.google.com/document/d/16Li0PY9WG5...   
3  Ari Zerner                     https://arizerner.com/date-me/   
4      Aeneas  https://docs.google.com/document/d/e/2PACX-1vS...   

                                         ProfileText  
0  ﻿Hi!\r\nThis is a dating doc, I’m a 25 year ol...  
1                 Could not extract document content  
2  ﻿In as many words\r\n\r\n\r\nTo love and be lo...  
3  Ari's Date Me Doc | Ari Zerner's Demesne\nAri ...  
4                 Could not extract document content  


In [17]:
df.to_csv('date_me_directory_with_text.csv', index=False)

In [18]:
df

Unnamed: 0,Name,ProfileLink,Age,Gender,InterestedIn,Style,Location,LocationFlexibility,Contact,LastUpdated,ProfileText
0,Gabin,https://docs.google.com/document/d/1ymul6X3DQH...,25,M,"F, NB",Any,"Central Europe, South of France",Some,GabinDatingDoc@proton.me,2025-03-07,"﻿Hi!\r\nThis is a dating doc, I’m a 25 year ol..."
1,Malvin G.,https://docs.google.com/document/d/1zuWjBGbGVz...,34,M,F,mono,"San Francisco Bay Area, NYC, Central Europe, D...",Flexible,https://docs.google.com/forms/d/1-opeZgl3qdr4p...,2025-03-07,Could not extract document content
2,Callum,https://docs.google.com/document/d/16Li0PY9WG5...,40,M,F,mono,UK,Flexible,callumcallum109@outlook.com,2025-03-04,﻿In as many words\r\n\r\n\r\nTo love and be lo...
3,Ari Zerner,https://arizerner.com/date-me/,27,M,"F, NB",poly,"San Francisco Bay Area, Philadelphia",Some,ari@zerner.com,2025-02-24,Ari's Date Me Doc | Ari Zerner's Demesne\nAri ...
4,Aeneas,https://docs.google.com/document/d/e/2PACX-1vS...,29,M,F,mono,Flexible,Flexible,aokook@gmail.com,2025-02-20,Could not extract document content
...,...,...,...,...,...,...,...,...,...,...,...
477,Blaïse (bless),https://docs.google.com/document/d/1ndGs9G9SvG...,38,NB,M,mono,Ladysmith BC,,Academia_nut@shaw.ca,2022-10-14,﻿Dating Stephen Blaïse Saint Clare\r\n\r\n\r\n...
478,Brian Tomasik,https://briantomasik.com/my-dating-profile/,35,M,"F, NB",mono,NY,Flexible,,2022-10-01,My dating profile\nHome\n•\nWritings\nMy datin...
479,Wes,https://docs.google.com/document/d/1w2jH3CRis1...,39,M,F,poly,"Collingswood, NJ",,,2022-09-29,"﻿BESPOKE 1983 NONMONOGAMOUS DAD MODEL, AVAILAB..."
480,Mercer,https://docs.google.com/document/d/1FUcf4P06yV...,22,M,,,NYC,,,2022-09-29,Could not extract document content
