# Building the profile database with MongoDB

### Prerequisites

- MongoDB must be installed, your local database must be running, and the collections should already be created
- you should run this notebook in your virtual environment with all dependencies installed
- see the notebook `Interactive Guide from Scratch.ipynb` for instructions
- simply run all cells in order

## Add all persons to our collection

In [3]:
import json
from pymongo import MongoClient

In [5]:
# Create the MongoDB client (no arguments, so the client defaults are used)
client = MongoClient()

# Select the database this notebook works with
db = client.get_database('zhaw_matchmaking')

# Handle to the collection that holds the persons
persons_collection = db.get_collection('persons')

In [6]:
# Read the provided JSON which contains the persons.
# The encoding is passed explicitly so the file is decoded the same way on every
# platform instead of relying on the locale default
# (assumes the file is UTF-8, as JSON files normally are -- TODO confirm).
with open('combined.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Add the "status" field with an initial value of "pending" for profile retrieval
for person in data:
    person['status'] = 'pending'

# Add them to our db in a single bulk operation instead of one round trip per person.
# insert_many rejects an empty list, hence the guard.
# NOTE: running this cell again inserts every person a second time; clear the
# collection first if you need to start over.
if data:
    persons_collection.insert_many(data)

print("Done.")

Done.


In [13]:
# Compare the number of persons read from the JSON with the number of documents
# now stored in the collection
json_count = len(data)
db_count = persons_collection.count_documents({})
print(f"Persons in JSON: {json_count}\nEntries added to database: {db_count}")

Persons in JSON: 4803
Entries added to database: 4803


## Add the profile data to our collection

- iterate over each person in the `persons` collection, scrape the raw data, and insert it into the `profile_data` collection

In [17]:
import time
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

In [18]:
# Handle to the collection the raw profile data is written to
profile_data_collection = db.get_collection('profile_data')

### Function to retrieve a person's profile data

In [19]:
def get_raw_profile(url, timeout=30):
    """Download a profile page and return the text of its profile container.

    Args:
        url: Address of the profile page to fetch.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled connection would block the caller forever.

    Returns:
        The text content of the first ``<div class="zhaw-person">`` element
        on the page.

    Raises:
        Exception: If the request fails (including timeouts and non-200
            status codes), if the page has no ``zhaw-person`` container, or
            if anything else goes wrong while parsing. The original error is
            attached as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=timeout)

        # Raise an exception for non-200 status codes
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        profile = soup.find('div', class_='zhaw-person')

        if not profile:
            raise Exception(f"Profile data not found on the page.\nURL: {url}")

        return profile.get_text()

    except requests.exceptions.RequestException as e:
        # Network / HTTP level failure: re-raise with the URL for context
        raise Exception(f"Error while fetching data from {url}: {str(e)}") from e

    except Exception as e:
        # Anything else that goes wrong during scraping (including a missing profile container)
        raise Exception(f"Error during scraping from {url}: {str(e)}") from e

### Main loop
- this step takes quite a while (~2h)

In [None]:
# Throttling: after every REQUESTS_PER_PAUSE requests, pause for PAUSE_SECONDS
# to avoid overloading the server / getting blocked
REQUESTS_PER_PAUSE = 10
PAUSE_SECONDS = 1

# Total number of persons (also used by the result summary further down)
total_persons = persons_collection.count_documents({})

# Only choose persons with 'status': 'pending' (profile not yet retrieved).
# The matches are loaded into a list up front so that no server-side cursor is
# held open while the 'status' field it filters on is being updated.
pending_persons = list(persons_collection.find({'status': 'pending'}))

request_count = 0

# The bar counts the persons actually processed in this run, so it reaches 100%
# even when some persons were already handled earlier. The `with` block
# closes the bar even if the loop is interrupted.
with tqdm(total=len(pending_persons), unit="person") as progress_bar:
    for person in pending_persons:
        try:
            # Get the profile data
            raw_profile_data = get_raw_profile(person['profileURL'])

            # Insert raw data into the profile_data collection
            inserted_id = profile_data_collection.insert_one({
                "raw_data": raw_profile_data,
                "person_id": person["_id"]
            }).inserted_id

            # Store the reference to the raw data and mark the person as done in a
            # single update, so the reference is never saved while the status
            # still says "pending"
            persons_collection.update_one(
                {"_id": person["_id"]},
                {"$set": {"profile_data_id": inserted_id, "status": "success"}}
            )

        except Exception as e:
            # If there's an error, update the status to "error" and log the error.
            # Persons with status "error" are not picked up again by this loop.
            persons_collection.update_one(
                {"_id": person["_id"]},
                {"$set": {"status": "error"}}
            )
            # tqdm.write prints without breaking the progress bar; fall back to
            # the id so a missing 'name' cannot abort the whole run
            tqdm.write(f"Error processing {person.get('name', person['_id'])}: {str(e)}")

        finally:
            # Update the progress bar
            progress_bar.update(1)

        # Count every attempt (failed ones too) towards the throttle, so a
        # struggling server is not hit at full speed
        request_count += 1
        if request_count >= REQUESTS_PER_PAUSE:
            time.sleep(PAUSE_SECONDS)
            request_count = 0

In [28]:
# Count the persons again here so this cell does not depend on the long-running
# main loop cell having been executed in the current kernel session
total_persons = persons_collection.count_documents({})
num_successful = persons_collection.count_documents({'status': 'success'})

# Avoid a ZeroDivisionError when the collection is empty
success_rate = num_successful / total_persons * 100 if total_persons else 0.0

print(f"======\nResult\n======\n\nSuccessful: {success_rate:.2f}%\nMissing: {total_persons - num_successful} profiles")

Result

Successful: 91.19%
Missing: 423 profiles


## Once we're finished

In [None]:
# Close connection to the database.
# `close` is a method: it has to be called, merely referencing it does nothing.
client.close()