<a href="https://colab.research.google.com/github/segmue/GIR_Project/blob/main/Irchel_Geoparser_ACLED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import pandas as pd
import os
import pickle

# Mount Google Drive into the Colab filesystem at /content/drive. The mount is
# requested with force_remount=True on every run of this cell.
drive.mount('/content/drive', force_remount = True)

# Path to the folder containing the CSV files. The pickle helpers in this
# notebook read and write their .pkl files in this same folder.
data_folder = '/content/drive/MyDrive/GEO871'  # Replace with the actual path

def load_files(data_folder):
  """Read every ``.csv`` file found directly inside ``data_folder``.

  Args:
    data_folder: Directory whose entries are scanned (non-recursively).

  Returns:
    A dict mapping each CSV filename to the DataFrame parsed from it. Files
    that fail to parse are reported with a printed message and left out.
  """
  loaded = {}
  for filename in os.listdir(data_folder):
      # Guard clause: anything that is not a CSV is ignored outright.
      if not filename.endswith(".csv"):
          continue
      try:
          loaded[filename] = pd.read_csv(os.path.join(data_folder, filename))
          print(f"Successfully loaded '{filename}' into a DataFrame.")
      except Exception as e:
          # Best effort: one unreadable file must not stop the remaining loads.
          print(f"Error loading '{filename}': {e}")
  return loaded


def split_batches(df, batchsize):
  """Split a DataFrame into consecutive row batches.

  Args:
    df: The DataFrame to split.
    batchsize: Maximum number of rows per batch; must be positive.

  Returns:
    A dict mapping the batch number (0, 1, 2, ...) to a DataFrame holding that
    batch's rows in their original order. The last batch may be shorter than
    ``batchsize``. An empty ``df`` yields an empty dict.

  Raises:
    ValueError: If ``batchsize`` is not positive.
  """
  if batchsize <= 0:
    raise ValueError(f"batchsize must be a positive integer, got {batchsize!r}")
  # Each batch is an independent copy. A bare .iloc slice stays tied to the
  # parent frame, so adding columns to it later raises SettingWithCopyWarning.
  return {
      batch_num: df.iloc[start:start + batchsize].copy()
      for batch_num, start in enumerate(range(0, len(df), batchsize))
  }

def write_df_to_drive(df, filename, folder=None):
  """Writes a DataFrame to a pickle file in the Google Drive data folder.

  The pickle is first written to a temporary ``<filename>.tmp`` file next to
  the target and then renamed over it, so an interrupted or failed write does
  not leave a truncated ``.pkl`` file under the final name.

  Failures are reported with a printed message rather than raised.

  Args:
    df: The DataFrame to write.
    filename: The name of the pickle file (including the .pkl extension).
    folder: Directory to write into. Defaults to the module-level
      ``data_folder`` when omitted.
  """
  if folder is None:
    folder = data_folder
  filepath = os.path.join(folder, filename)
  tmp_path = filepath + ".tmp"
  try:
    with open(tmp_path, 'wb') as f:
      pickle.dump(df, f)
    # Publish the finished file under its final name in one step.
    os.replace(tmp_path, filepath)
    print(f"DataFrame successfully written to '{filepath}'")
  except Exception as e:
    print(f"Error writing DataFrame to '{filepath}': {e}")
    # Remove a half-written temp file; a failed cleanup must not hide the
    # error that was just reported.
    try:
      if os.path.exists(tmp_path):
        os.remove(tmp_path)
    except OSError:
      pass

Mounted at /content/drive


In [2]:
%%capture
# Install the pinned geoparser release, then download the spaCy model and the
# GeoNames gazetteer data named in the Geoparser(...) call of this notebook.
# The %%capture magic on the first line keeps this cell's output out of the notebook.
!pip install geoparser==0.1.8
!python -m spacy download en_core_web_sm
!python -m geoparser download geonames

In [3]:

from geoparser import Geoparser
# Shared Geoparser instance used by geoparse_df_notes and the scratch cells at
# the end of the notebook. It is configured with a spaCy model, a
# sentence-transformer model and the GeoNames gazetteer.
geo = Geoparser(spacy_model='en_core_web_sm', transformer_model='dguzh/geo-all-MiniLM-L6-v2', gazetteer='geonames')

def _location_coords(location):
    """Return ``(latitude, longitude)`` for a location, or ``(None, None)``.

    ``(None, None)`` is returned when the location is empty/missing or does
    not offer a dict-style ``.get``.
    """
    if not location:
        return None, None
    try:
        return location.get("latitude", None), location.get("longitude", None)
    except AttributeError:
        return None, None


def geoparse_df_notes(df):
    """Geoparse the "notes" column of ``df`` and attach the results as columns.

    All notes are sent to the module-level ``geo`` parser in a single call.
    Three columns are then added to ``df`` in place, each holding one list per
    row with one entry per recognised toponym:

    * ``geoparser_locations``: the ``location`` value of each toponym.
    * ``geoparser_toponyms``: the toponym text (``str(toponym)``).
    * ``geoparser_res``: single-key dicts of the form
      ``{toponym_text: {"latitude": ..., "longitude": ...}}``; coordinates are
      ``None`` when the toponym has no usable location.

    Args:
        df: DataFrame with a "notes" column. It is modified in place.

    Returns:
        The same ``df`` object, with the three columns added.
    """
    # Hand the parser text only: missing notes (None/NaN, e.g. empty CSV cells)
    # become "" and any other non-string value is converted with str().
    notes = [
        note if isinstance(note, str) else ("" if pd.isna(note) else str(note))
        for note in df["notes"]
    ]
    parsed_rows = geo.parse(notes)

    toponyms_col = []
    locations_col = []
    res_list_col = []

    for parsed_row in parsed_rows:
        toponyms = []
        locations = []
        res_list = []

        for t in parsed_row.toponyms:
            name = str(t)
            t_loc = t.location
            t_lat, t_long = _location_coords(t_loc)

            toponyms.append(name)
            locations.append(t_loc)
            res_list.append({name: {"latitude": t_lat, "longitude": t_long}})

        toponyms_col.append(toponyms)
        locations_col.append(locations)
        res_list_col.append(res_list)

    # One list per input row, in the same order as the rows of df.
    df["geoparser_locations"] = locations_col
    df["geoparser_toponyms"] = toponyms_col
    df["geoparser_res"] = res_list_col

    return df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [89]:

import pandas as pd
import os
import re

def _batch_sort_key(name):
    """Sort key ordering ``..._<n>.pkl`` files by their numeric batch number.

    Files without a numeric suffix sort after all numbered ones, by name.
    """
    match = re.search(r"_(\d+)\.pkl$", name)
    if match:
        return (0, int(match.group(1)), name)
    return (1, 0, name)


def load_df_from_drive(file_pattern, folder=None):
    """Loads all .pkl files matching a pattern from Google Drive into a DataFrame.

    Files whose name ends in "final.pkl" are skipped. The remaining matches are
    loaded in ascending batch-number order (``_2`` before ``_10``), so the
    concatenated rows follow the order in which the batches were created
    instead of the arbitrary order in which the directory is listed.

    Args:
        file_pattern: The file pattern to match (e.g., "Europe-Central-Asia_2018-2024_Nov22_").
            It is applied with ``re.search``, i.e. interpreted as a regular expression.
        folder: Directory to read from. Defaults to the module-level
            ``data_folder`` when omitted.

    Returns:
        A pandas DataFrame containing the concatenated data from all matching files, or None if no files match or an error occurs.
    """
    if folder is None:
        folder = data_folder

    matching = sorted(
        (
            name for name in os.listdir(folder)
            if not name.endswith("final.pkl")
            and name.endswith(".pkl")
            and re.search(file_pattern, name)
        ),
        key=_batch_sort_key,
    )

    all_dfs = []
    for filename in matching:
        filepath = os.path.join(folder, filename)
        try:
            # NOTE: pickle.load can execute arbitrary code; only point this at
            # files this notebook wrote itself.
            with open(filepath, 'rb') as f:
                all_dfs.append(pickle.load(f))
            print(f"Loaded: {filename}")
        except (OSError, pickle.UnpicklingError, EOFError) as e:
            # EOFError covers empty/truncated pickles left by interrupted writes.
            print(f"Error loading {filename}: {e}")
            return None

    if not all_dfs:
        print(f"No files found matching the pattern '{file_pattern}'")
        return None

    combined_df = pd.concat(all_dfs, ignore_index=True)
    return combined_df


import pandas as pd

def generate_geoparser_res(row):
    """Build a ``{toponym: {"latitude": ..., "longitude": ...}}`` dict for one row.

    Args:
        row: Mapping-like row (e.g. the Series passed by ``DataFrame.apply``
            with ``axis=1``) providing "geoparser_toponyms" and
            "geoparser_locations".

    Returns:
        A dict keyed by toponym. A toponym is included only when its location
        is non-empty and contains both "latitude" and "longitude". If the same
        toponym occurs more than once, the last occurrence wins. An empty dict
        is returned when the toponyms or locations are missing or not
        list-like.
    """
    geoparser_locations = row.get("geoparser_locations", None)
    geoparser_toponyms = row.get("geoparser_toponyms", None)

    # Initialize result dictionary
    res = {}

    # Handle cases where geoparser_toponyms is not valid
    if not isinstance(geoparser_toponyms, (list, tuple)):
        return res

    # Locations are either a plain list/tuple or a list wrapper exposing the
    # underlying list as `.data` (UserList-style). Accept both.
    if isinstance(geoparser_locations, (list, tuple)):
        locations = geoparser_locations
    else:
        locations = getattr(geoparser_locations, "data", None)
    if not isinstance(locations, (list, tuple)):
        return res

    # zip stops at the shorter sequence, so a row with fewer locations than
    # toponyms yields the pairs that exist instead of raising IndexError.
    for toponym, location_data in zip(geoparser_toponyms, locations):
        # Add latitude and longitude if valid
        if location_data and "latitude" in location_data and "longitude" in location_data:
            res[toponym] = {
                "latitude": location_data["latitude"],
                "longitude": location_data["longitude"]
            }

    return res



In [None]:
# Merge all per-batch pickles into one frame, collapse the per-toponym results
# into a single "geoparser_res" dict per row, and store the final pickle.
# (These statements were previously pasted twice, repeating the whole
# load/transform/write pass for no change in the result.)
df = load_df_from_drive("Europe-Central-Asia_2018-2024_Nov22_")
if df is None:
    # The loader returns None when nothing matched or a file could not be read.
    raise RuntimeError("No batch pickles could be loaded; nothing to finalize.")
df["geoparser_res"] = df.apply(generate_geoparser_res, axis=1)
df = df.drop(columns=["geoparser_locations", "geoparser_toponyms"])
write_df_to_drive(df, "Europe-Central-Asia_2018-2024_Nov22_final.pkl")

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_final.pkl'


In [4]:
# Load every CSV in data_folder; `data` maps each filename to its DataFrame.
data = load_files(data_folder)


Successfully loaded 'Europe-Central-Asia_2018-2024_Nov22.csv' into a DataFrame.


In [5]:
# Replace each loaded DataFrame in `data` with its {batch_num: DataFrame} dict
# of 10,000-row batches.
for filename, df in data.items():
  # Entries that are already batch dicts are left alone, so re-running this
  # cell does not try to split a dict.
  if isinstance(df, dict):
    continue
  batches = split_batches(df, 10000)
  data[filename] = batches

In [None]:
# Geoparse every batch and store each result as its own pickle in data_folder.
# A batch whose output file already exists is skipped, so an interrupted run
# resumes where it stopped without editing a hard-coded batch number.
for filename, df in data.items():
  for batch_num, batch_df in df.items():
    write_filename = os.path.splitext(filename)[0] + "_" + str(batch_num) + ".pkl"
    write_path = os.path.join(data_folder, write_filename)
    if os.path.exists(write_path):
      print(f"Skipping '{filename}' batch {batch_num}")
      continue
    result = geoparse_df_notes(batch_df)
    write_df_to_drive(result, write_filename)
    # write_df_to_drive reports failures by printing instead of raising, so
    # confirm the file is really there before announcing success.
    if os.path.exists(write_path):
      print(f"Successfully wrote '{write_filename}'")


Skipping 'Europe-Central-Asia_2018-2024_Nov22.csv' batch 0
Skipping 'Europe-Central-Asia_2018-2024_Nov22.csv' batch 1
Skipping 'Europe-Central-Asia_2018-2024_Nov22.csv' batch 2
Skipping 'Europe-Central-Asia_2018-2024_Nov22.csv' batch 3
Skipping 'Europe-Central-Asia_2018-2024_Nov22.csv' batch 4
Skipping 'Europe-Central-Asia_2018-2024_Nov22.csv' batch 5
Skipping 'Europe-Central-Asia_2018-2024_Nov22.csv' batch 6
Skipping 'Europe-Central-Asia_2018-2024_Nov22.csv' batch 7
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/4331 [00:00<?, ?it/s]

Batches:   0%|          | 0/1450 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_8.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_8.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/2675 [00:00<?, ?it/s]

Batches:   0%|          | 0/1440 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_9.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_9.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3754 [00:00<?, ?it/s]

Batches:   0%|          | 0/1548 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_10.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_10.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3418 [00:00<?, ?it/s]

Batches:   0%|          | 0/1517 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_11.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_11.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3789 [00:00<?, ?it/s]

Batches:   0%|          | 0/1444 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_12.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_12.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3491 [00:00<?, ?it/s]

Batches:   0%|          | 0/1572 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_13.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_13.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/4055 [00:00<?, ?it/s]

Batches:   0%|          | 0/1467 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_14.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_14.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/4913 [00:00<?, ?it/s]

Batches:   0%|          | 0/1452 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_15.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_15.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3908 [00:00<?, ?it/s]

Batches:   0%|          | 0/1492 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_16.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_16.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3688 [00:00<?, ?it/s]

Batches:   0%|          | 0/1595 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_17.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_17.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3830 [00:00<?, ?it/s]

Batches:   0%|          | 0/1451 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_18.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_18.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3892 [00:00<?, ?it/s]

Batches:   0%|          | 0/1459 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_19.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_19.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/4310 [00:00<?, ?it/s]

Batches:   0%|          | 0/1514 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_20.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_20.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3095 [00:00<?, ?it/s]

Batches:   0%|          | 0/1182 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_21.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_21.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/3968 [00:00<?, ?it/s]

Batches:   0%|          | 0/1173 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_22.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_22.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/4195 [00:00<?, ?it/s]

Batches:   0%|          | 0/1352 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_23.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_23.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/5420 [00:00<?, ?it/s]

Batches:   0%|          | 0/1689 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_24.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_24.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/5325 [00:00<?, ?it/s]

Batches:   0%|          | 0/1827 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_25.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_25.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/4802 [00:00<?, ?it/s]

Batches:   0%|          | 0/1418 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_26.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_26.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/5951 [00:00<?, ?it/s]

Batches:   0%|          | 0/1577 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_27.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_27.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/4886 [00:00<?, ?it/s]

Batches:   0%|          | 0/1565 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_28.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_28.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/4853 [00:00<?, ?it/s]

Batches:   0%|          | 0/1645 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_29.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_29.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/5207 [00:00<?, ?it/s]

Batches:   0%|          | 0/1489 [00:00<?, ?it/s]

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_30.pkl'
Successfully wrote 'Europe-Central-Asia_2018-2024_Nov22_30.pkl'
Toponym Recognition...


Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/4671 [00:00<?, ?it/s]

In [None]:
# Two short sample sentences for a quick manual check of the geoparser.
strs = ["my first reference is Zurich, in Switzerland", "my second is katwa in Kongo"]

In [None]:
# Parse both sample sentences in one call; the result is iterated once per
# input text in the cells that follow.
parsed_text = geo.parse(strs)

In [None]:
# geo.parse returns one parsed document per input text, so toponyms and their
# locations are collected per document rather than read off the list itself.
a = [[str(t) for t in doc.toponyms] for doc in parsed_text]
b = [[t.location for t in doc.toponyms] for doc in parsed_text]

In [None]:
# Print each parsed result, one per input sentence.
for r in parsed_text:
  print(r)