<a href="https://colab.research.google.com/github/steffenvogler/LLM2GPS/blob/main/WIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
pip install --upgrade --quiet geopy google-api-python-client google-cloud-aiplatform langchain-core google-generativeai langchain-google-genai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/158.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m122.9/158.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# # Automatically restart kernel after installs so that your environment can access the new packages
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

## Setting everything up

In [14]:
import getpass
import pathlib
import textwrap

import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI

# Used to securely store your API key
from google.colab import userdata

from IPython.display import display
from IPython.display import Markdown

from geopy.geocoders import Nominatim
from geopy import distance
geolocator = Nominatim(user_agent="LLM2GPS") # Many other geocoders available via GeoPy;  https://geopy.readthedocs.io/en/stable/#module-geopy.geocoders

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

## Enter API Key generated in a private GCP account

In [2]:
import os

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API Key")

genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

Enter API Key··········


## See which models are available

In [15]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


## Selecting a model
Overview at https://console.cloud.google.com/vertex-ai/model-garden

In [18]:
# LLM model
llm = ChatGoogleGenerativeAI(
    model="gemini-1.0-pro",
    max_output_tokens=256,
    temperature=0.1,
    top_p=0.8,
    top_k=5,
    verbose=True,
)

## Sanity test

In [19]:
result = llm.invoke("Tell a dad joke that contains the word dinosaur. Later provide German translation")
to_markdown(result.content)

> **Dad joke:**
> Why did the dinosaur cross the road?
> 
> To get to the other s-ide!
> 
> **German translation:**
> **Papa-Witz:**
> Warum hat der Dinosaurier die Straße überquert?
> 
> Um auf die andere S-eite zu kommen!

In [20]:
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.prompts import PromptTemplate

In [86]:
template = """Your job is to guess the geographic loaction based on the description and keywords.
% USER input
{user_input}

YOUR RESPONSE:
"""
prompt_template = PromptTemplate(input_variables=["user_input"], template=template)

# Holds my 'location' chain
location_chain = LLMChain(llm=llm, prompt=prompt_template)

template = """Given a location, provide a structured output that starts with the continent, then country, federal state, then administrative district or city (if applicable). Everything separated by a whitespace. Only provide a single line of output. Never output any additional text. If there are more than one places that match the description or if you are not sure, add an asterisk sign ad the end of the response.
% USER geo
{user_geo}

YOUR RESPONSE:
"""
prompt_template = PromptTemplate(input_variables=["user_geo"], template=template)

# Holds my 'json' chain
geo_chain = LLMChain(llm=llm, prompt=prompt_template)

overall_chain = SimpleSequentialChain(chains=[location_chain, geo_chain], verbose=False)

## Orchestrating the LLM query and responses

In [77]:
def response_processing(location, i):
  if (location is None):
    #print (i)
    if i == 4:
      return
    #print("No location found")
    if i == 3:
      improved_query = query
    elif i == 2:
      improved_query = str(overall_chain.invoke(query))
    else:
      improved_query = str(overall_chain.run(query) + " " + query)
    location = geolocator.geocode(improved_query)
    i = i + 1
    response_processing(location,i)
  else:
    # try:
    #   print(improved_query)
    # except:
    #   print(query)
    print("GPS-coordinates: ({},{})".format(location.latitude, location.longitude))
    print("Address according to geocoding service: {}".format(location.address))
    xy0 = (location.raw['boundingbox'][0], location.raw['boundingbox'][2])
    xy1 = (location.raw['boundingbox'][1], location.raw['boundingbox'][3])
    print("Great circle distance in km: {}".format(round(distance.great_circle(xy0, xy1).km,2)))
    east_west_dist = haversine(location.raw['boundingbox'][0], location.raw['boundingbox'][2], location.raw['boundingbox'][0], location.raw['boundingbox'][3])
    north_south_dist = haversine(location.raw['boundingbox'][0], location.raw['boundingbox'][2], location.raw['boundingbox'][1], location.raw['boundingbox'][2])
    print("north_south_distance: {} and east-west-distance: {} (all in km)".format(north_south_dist, east_west_dist))


In [78]:
import math

def haversine(lat1, lon1, lat2, lon2):
  """Calculates the great circle distance between two points on a sphere.

  Args:
    lat1: The latitude of the first point in degrees.
    lon1: The longitude of the first point in degrees.
    lat2: The latitude of the second point in degrees.
    lon2: The longitude of the second point in degrees.

  Returns:
    The great circle distance between the two points in kilometers.
  """

  R = 6371  # Earth's radius in kilometers

  dlat = math.radians(float(lat2) - float(lat1))
  dlon = math.radians(float(lon2) - float(lon1))
  a = math.sin(dlat / 2) ** 2 + math.cos(math.radians(float(lat1))) * math.cos(math.radians(float(lat2))) * math.sin(dlon / 2) ** 2
  c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
  d = R * c

  return d

In [139]:
query = "Spreewald" # @param ["Berlin", "Copenhagen Zoo", "Spreewald", "Gjoa Haven", "Mt. Cook", "Zugspitze", "Sandwich island", "the capital of the country where Kiwis live"] {allow-input: true}

In [140]:
improved_query = str(query + " " + overall_chain.run(query))

location = geolocator.geocode(improved_query)
print("original query: {}; LLM-improved query: {}".format(query,improved_query))

i = 0

response_processing(location, i)

original query: Spreewald; LLM-improved query: Spreewald Europe Germany
GPS-coordinates: (51.9532976,13.884878363643075)
Address according to geocoding service: Spreewald, Dahme-Spreewald, Brandenburg, Deutschland
Great circle distance in km: 52.8
north_south_distance: 41.025568336377674 and east-west-distance: 33.376934661327994 (all in km)


## Now same concept used for enriching a public dataset
Download zip file from https://doi.org/10.15468/dl.6qs64w

In [133]:
import requests, zipfile, io
zip_file_url = "https://occurrence-download.gbif.org/occurrence/download/request/0032930-231002084531237.zip"

def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

download_url(zip_file_url,'sample_data/test.zip')
!unzip -o 'sample_data/test.zip'

Archive:  sample_data/test.zip
  inflating: 0032930-231002084531237.csv  


In [110]:
import pandas as pd

In [113]:
df = pd.read_csv("0032930-231002084531237.csv", sep='\t')

  df = pd.read_csv("0032930-231002084531237.csv", sep='\t')


In [116]:
df.columns

Index(['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'species', 'infraspecificEpithet',
       'taxonRank', 'scientificName', 'verbatimScientificName',
       'verbatimScientificNameAuthorship', 'countryCode', 'locality',
       'stateProvince', 'occurrenceStatus', 'individualCount',
       'publishingOrgKey', 'decimalLatitude', 'decimalLongitude',
       'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation',
       'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day',
       'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord',
       'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber',
       'identifiedBy', 'dateIdentified', 'license', 'rightsHolder',
       'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted',
       'mediaType', 'issue'],
      dtype='object')

In [129]:
locality_list = list(set(df.locality.to_list()))

Assumption is that LLM can translate between human natural language and description that can be understood by Geocoding Service

In [130]:
locality_list[:10]

['Río Atabapo bei San Fernando',
 'Ferchensee, Mittenwald',
 'Cadix',
 'Sümpfe 10 km se. von Rovinji',
 'Housten',
 "Z.-Pazifik,, kleine Inselgruppe südl. Hawai'i Inseln, ,",
 'Rutuno, Oriente',
 'Ych-tan (heute: ... )',
 'Arussi Gallaland, Webi Shebelli, (Shebelli River)',
 'Bilck a.d. Trebesnica']

##Pseudocode below

In [None]:
for each row:
  # create new column and fill with new AI-derived data
  df["AI_derived_geotag"] = {function using LLM plus geocoding}