## Environment Setup & Dependencies

In [1]:
%pip install requests pandas openpyxl
import requests
import pandas as pd
import re
import os
import sqlite3
import time

Note: you may need to restart the kernel to use updated packages.


## BGS API Connection Discovery


In [2]:
discovery_url = "https://ogcapi.bgs.ac.uk/collections/world-mineral-statistics/items?limit=5"

try:
    response = requests.get(discovery_url)
    response.raise_for_status() # This will catch if the website is down
    
    data = response.json()
    # In OGC APIs, the data is inside 'features', and the actual columns are in 'properties'
    sample_data = [f['properties'] for f in data['features']]
    discovery_df = pd.DataFrame(sample_data)
    
    print("‚úÖ API Connection Successful!")
    print(f"\nTotal columns found: {len(discovery_df.columns)}")
    print(f"Column Names: {discovery_df.columns.tolist()}")
    
    # Show the data
    display(discovery_df.head(3))

except Exception as e:
    print(f"‚ùå Connection failed. Error: {e}")

‚úÖ API Connection Successful!

Total columns found: 22
Column Names: ['country_trans', 'bgs_sub_commodity_trans', 'concat_table_notes_code', 'country_iso2_code', 'erml_group', 'concat_table_notes_text', 'country_iso3_code', 'erml_commodity', 'concat_figure_notes_code', 'year', 'pole_of_inaccessibility_longitude', 'erml_sub_commodity', 'concat_figure_notes_text', 'pole_of_inaccessibility_latitude', 'cgi_commodity_url', 'yearbook_table_id', 'quantity', 'yearbook_table_trans', 'bgs_commodity_code', 'units', 'bgs_statistic_type_trans', 'bgs_commodity_trans']


Unnamed: 0,country_trans,bgs_sub_commodity_trans,concat_table_notes_code,country_iso2_code,erml_group,concat_table_notes_text,country_iso3_code,erml_commodity,concat_figure_notes_code,year,...,concat_figure_notes_text,pole_of_inaccessibility_latitude,cgi_commodity_url,yearbook_table_id,quantity,yearbook_table_trans,bgs_commodity_code,units,bgs_statistic_type_trans,bgs_commodity_trans
0,Burkina Faso,,,BF,Silver,,BFA,"Silver (mine production, metal content)",,2006-01-01T00:00:00,...,,12.591396,http://resource.geosciml.org/classifier/cgi/co...,128,0.0,Mine production of silver,1995,kilograms (metal content),Production,"silver, mine"
1,Burkina Faso,,,BF,Silver,,BFA,"Silver (mine production, metal content)",,2007-01-01T00:00:00,...,,12.591396,http://resource.geosciml.org/classifier/cgi/co...,128,100.0,Mine production of silver,1995,kilograms (metal content),Production,"silver, mine"
2,Burkina Faso,,,BF,Silver,,BFA,"Silver (mine production, metal content)",,2008-01-01T00:00:00,...,,12.591396,http://resource.geosciml.org/classifier/cgi/co...,128,800.0,Mine production of silver,1995,kilograms (metal content),Production,"silver, mine"


## Scanning Databade for number of Minerals

In [14]:
# This flips through every "page" of the BGS database
base_url = "https://ogcapi.bgs.ac.uk/collections/world-mineral-statistics/items"
all_minerals = set()
offset = 0
limit = 10000 

print("üöÄ Starting full database scan... mapping every mineral name.")

while True:
    params = {'f': 'json', 'limit': limit, 'offset': offset, 'bgs_statistic_type_trans' : 'Production'}
    try:
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            break
            
        data = response.json()
        features = data.get('features', [])
        
        if not features:
            break  # We reached the end of the database
            
        # Add every mineral name found in this batch to our set
        for f in features:
            name = f['properties'].get('erml_group')
            if name:
                all_minerals.add(name)
        
        print(f"Checking records {offset} to {offset + len(features)}...")
        offset += limit
        
    except Exception as e:
        print(f"Connection interrupted: {e}")
        break

# Sort them alphabetically for a clean presentation
final_list = sorted(list(all_minerals))

print("\n‚úÖ SCAN COMPLETE!")
print(f"Found {len(final_list)} unique Mineral Groups.")
print("-" * 30)
print(final_list)

üöÄ Starting full database scan... mapping every mineral name.
Checking records 0 to 10000...
Checking records 10000 to 20000...
Checking records 20000 to 30000...
Checking records 30000 to 40000...
Checking records 40000 to 50000...
Checking records 50000 to 60000...
Checking records 60000 to 70000...
Checking records 70000 to 80000...
Checking records 80000 to 90000...
Checking records 90000 to 100000...
Checking records 100000 to 110000...
Checking records 110000 to 120000...
Checking records 120000 to 130000...
Checking records 130000 to 130128...

‚úÖ SCAN COMPLETE!
Found 61 unique Mineral Groups.
------------------------------
['Aggregates and related materials', 'Antimony', 'Arsenic', 'Asbestos', 'Barytes', 'Bauxite, alumina and aluminium', "Bentonite and fuller's earth", 'Beryllium', 'Bismuth', 'Borates', 'Bromine', 'Cadmium', 'Cement', 'Chromium', 'Cobalt', 'Copper', 'Diamond', 'Diatomite', 'Feldspar', 'Fluorspar', 'Gallium', 'Germanium', 'Gold', 'Graphite', 'Gypsum', 'Indium

## Master ETL (Extract, Transform, Load)

In [19]:
%load_ext line_profiler
def build_master_database():
    db_name = "BGS_Full_Archive_Master.db"
    base_url = "https://ogcapi.bgs.ac.uk/collections/world-mineral-statistics/items"
    
    # Connect to the database file
    conn = sqlite3.connect(db_name)
    offset = 0
    limit = 5000 
    total_expected = 408480
    
    print(f"üì¶ Starting Master Harvest...")

    while True:
        try:
            params = {'f': 'json', 'limit': limit, 'offset': offset, 'bgs_statistic_type_trans' : 'Production'}
            # We already imported 'requests' in Block 1, so we just use it!
            response = requests.get(base_url, params=params, timeout=60)
            
            if response.status_code != 200:
                print(f"‚ö†Ô∏è Server Busy (Status {response.status_code}). Retrying in 5 seconds...")
                #time.sleep(5)
                continue

            data = response.json()
            features = data.get('features', [])
            
            if not features:
                break
            
            # Use 'pd' which we imported in Block 1
            df_batch = pd.DataFrame([f['properties'] for f in features])
            
            # Clean the year but keep original columns
            if 'year' in df_batch.columns:
                df_batch['year_clean'] = df_batch['year'].astype(str).str[:4]
            
            # Save batch to the SQL table
            df_batch.to_sql('FullMineralData', conn, if_exists='append', index=False)
            
            offset += len(features)
            print(f"‚úÖ Secured {offset} / {total_expected} rows...")
            
            #time.sleep(0.5) 
            
        except Exception as e:
            print(f"‚ö†Ô∏è Error at row {offset}: {e}")
            break

    # Add the "Speed Boost" indexes
    print("‚ö° Creating high-speed indexes...")
    conn.execute("CREATE INDEX idx_mineral_group ON FullMineralData(erml_group);")
    conn.execute("CREATE INDEX idx_year_clean ON FullMineralData(year_clean);")
    
    conn.close()
    print(f"üéä FINISHED! Your master tool is ready.")

# Run the function
%lprun -f build_master_database()

üì¶ Starting Master Harvest...
‚úÖ Secured 5000 / 408480 rows...
‚úÖ Secured 10000 / 408480 rows...
‚úÖ Secured 15000 / 408480 rows...
‚úÖ Secured 20000 / 408480 rows...
‚úÖ Secured 25000 / 408480 rows...
‚úÖ Secured 30000 / 408480 rows...
‚úÖ Secured 35000 / 408480 rows...
‚úÖ Secured 40000 / 408480 rows...
‚úÖ Secured 45000 / 408480 rows...
‚úÖ Secured 50000 / 408480 rows...
‚úÖ Secured 55000 / 408480 rows...
‚úÖ Secured 60000 / 408480 rows...
‚úÖ Secured 65000 / 408480 rows...
‚úÖ Secured 70000 / 408480 rows...
‚úÖ Secured 75000 / 408480 rows...
‚úÖ Secured 80000 / 408480 rows...
‚úÖ Secured 85000 / 408480 rows...
‚úÖ Secured 90000 / 408480 rows...
‚úÖ Secured 95000 / 408480 rows...
‚úÖ Secured 100000 / 408480 rows...
‚úÖ Secured 105000 / 408480 rows...
‚úÖ Secured 110000 / 408480 rows...
‚úÖ Secured 115000 / 408480 rows...
‚úÖ Secured 120000 / 408480 rows...
‚úÖ Secured 125000 / 408480 rows...
‚úÖ Secured 130000 / 408480 rows...
‚ö†Ô∏è Error at row 130000: table FullMineralData ha

  profile = LineProfiler(*funcs)


Timer unit: 1e-09 s

In [20]:
# 1. Connect to your database
conn = sqlite3.connect("BGS_Full_Archive_Master.db")

try:
    # 2. Add the missing 'shape' column manually so SQLite is ready for it
    print("üõ† Adding 'shape' column to table...")
    conn.execute("ALTER TABLE FullMineralData ADD COLUMN shape TEXT;")
    conn.commit()
except sqlite3.OperationalError:
    print("‚ÑπÔ∏è Column 'shape' already exists, moving to data retrieval.")

# 3. Fetch the final missing batch (from row 405,000 to the end)
print("üì• Fetching the final missing batch...")
base_url = "https://ogcapi.bgs.ac.uk/collections/world-mineral-statistics/items"
params = {'f': 'json', 'limit': 5000, 'offset': 405000}
response = requests.get(base_url, params=params, timeout=60)
data = response.json()
features = data.get('features', [])

if features:
    # Convert to DataFrame
    df_final = pd.DataFrame([f['properties'] for f in features])
    
    # Add our custom year_clean column
    if 'year' in df_final.columns:
        df_final['year_clean'] = df_final['year'].astype(str).str[:4]
    
    # Save the final batch
    df_final.to_sql('FullMineralData', conn, if_exists='append', index=False)
    print(f"‚úÖ Success! Added the final {len(features)} rows.")

# 4. Final Cleanup: Rename to the professional name
print("üè∑ Renaming table to 'BGS_Global'...")
conn.execute("ALTER TABLE FullMineralData RENAME TO BGS_Global;")

# 5. Final Row Count Check
cursor = conn.execute("SELECT COUNT(*) FROM BGS_Global;")
final_count = cursor.fetchone()[0]
print(f"üìä FINAL VERIFICATION: {final_count:,} rows secured in 'BGS_Global'.")

conn.close()

üõ† Adding 'shape' column to table...
üì• Fetching the final missing batch...
‚úÖ Success! Added the final 3480 rows.
üè∑ Renaming table to 'BGS_Global'...
üìä FINAL VERIFICATION: 133,480 rows secured in 'BGS_Global'.


In [12]:
import os
import gc

# 1. Manually trigger garbage collection to clear stray objects
gc.collect()

# 2. Try to close the specific variables if they exist in this session
try:
    if 'conn' in locals():
        conn.close()
        print("üîå Connection closed.")
except NameError:
    pass

# 3. Now try the rename again
old_name = "BGS_Full_Archive_Master.db"
new_name = "World_Mineral_Archive.db"

if os.path.exists(old_name):
    try:
        os.rename(old_name, new_name)
        print(f"‚úÖ Success! Renamed to: {new_name}")
    except PermissionError:
        print("‚ùå Still locked! Try clicking 'Kernel' -> 'Restart' in your notebook menu.")
else:
    print("üìÇ File already renamed or not found.")

üîå Connection closed.
‚úÖ Success! Renamed to: World_Mineral_Archive.db


In [13]:
import requests
import sqlite3

# Define the next starting point after your current data
next_offset = 408480 
api_url = "https://ogcapi.bgs.ac.uk/collections/world-mineral-statistics/items"

print(f"üîç Auditing BGS API for records beyond {next_offset:,}...")

try:
    # Request just 1 row at the next offset
    response = requests.get(api_url, params={'f': 'json', 'limit': 1, 'offset': next_offset}, timeout=30)
    data = response.json()
    extra_features = data.get('features', [])

    if not extra_features:
        print(f"üéØ MATCH! The API is empty at offset {next_offset:,}.")
        print(f"‚úÖ Your local database is 100% synchronized with the British Geological Survey.")
    else:
        print(f"‚ö†Ô∏è DISCREPANCY! The API found a record at {next_offset:,}.")
        print("This means the BGS database grew while we were harvesting. Run a final mini-harvest!")

except Exception as e:
    print(f"‚ùå Verification failed: {e}")

üîç Auditing BGS API for records beyond 408,480...
üéØ MATCH! The API is empty at offset 408,480.
‚úÖ Your local database is 100% synchronized with the British Geological Survey.
