<a href="https://colab.research.google.com/github/segmue/GIR_Project/blob/main/Irchel_Geoparser_ACLED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [127]:
# prompt: Mount google drive and load all csvs in the folder GEO876/data/ into panda dataframes

from google.colab import drive
import pandas as pd
import os
import pickle

drive.mount('/content/drive', force_remount = True)

# Path to the folder containing the CSV files
data_folder = '/content/drive/MyDrive/GEO871'  # Replace with the actual path

def load_files(data_folder):
# List to store dataframes
  dataframes = {}
  # Iterate through files in the directory
  for filename in os.listdir(data_folder):
      if filename.endswith(".csv"):
          filepath = os.path.join(data_folder, filename)
          try:
              # Read the CSV file into a pandas DataFrame
              df = pd.read_csv(filepath)
              dataframes[filename] = df
              print(f"Successfully loaded '{filename}' into a DataFrame.")
          except Exception as e:
              print(f"Error loading '{filename}': {e}")
  return dataframes


def split_batches(df, batchsize):
  batches = {}
  num_batches = (len(df) + batchsize - 1) // batchsize  # Calculate the number of batches
  for i in range(num_batches):
      start_index = i * batchsize
      end_index = min((i + 1) * batchsize, len(df))  # Handle the last batch
      batches[i] = df.iloc[start_index:end_index]
  return batches

def write_df_to_drive(df, filename):
  """Writes a DataFrame to a pickle file in the Google Drive data folder.

  Args:
    df: The DataFrame to write.
    filename: The name of the pickle file (including the .pkl extension).
  """
  filepath = os.path.join(data_folder, filename)
  try:
    with open(filepath, 'wb') as f:
      pickle.dump(df, f)
    print(f"DataFrame successfully written to '{filepath}'")
  except Exception as e:
    print(f"Error writing DataFrame to '{filepath}': {e}")

Mounted at /content/drive


In [128]:
%%capture
!pip install geoparser==0.1.8
!python -m spacy download en_core_web_sm
!python -m geoparser download geonames

In [129]:

from geoparser import Geoparser
geo = Geoparser(spacy_model='en_core_web_sm', transformer_model='dguzh/geo-all-MiniLM-L6-v2', gazetteer='geonames')

def geoparse_df_notes(df):
    # Initialize lists to store results
    toponyms_col = []
    locations_col = []
    res_list_col = []

    # Process each row individually, somehow the geoparser doesn't work very well otherwise (e.g. list of strings)
    for _, row in df.iterrows():
        toponyms = []
        locations = []
        res_list = []

        # Parse the "notes" column for the current row
        parsed_row = geo.parse([row["notes"]])[0]

        for t in parsed_row.toponyms:
          toponyms.append(str(t))
          t_loc = t.location
          locations.append(t_loc)

          t_lat = t.location.get("latitude", None)
          t_long = t.location.get("longitude", None)

          subres = {str(t): {"latitude": t_lat, "longitude": t_long}}
          res_list.append(subres)

        # # Extract toponyms
        # entry_toponyms = [str(a) for a in parsed_row.toponyms


        # # Extract toponyms
        # entry_toponyms = [str(a) for a in parsed_row.toponyms]
        # toponyms.append(entry_toponyms)

        # # Extract location data
        # locdata = parsed_row.locations.data
        # locations.append(locdata)

        # Build res dictionary
        res = {}
        for i, toponym in enumerate(entry_toponyms):
            if locdata and i < len(locdata) and locdata[i] is not None:
                subres = {toponym: {"latitude": locdata[i].latitude, "longitude": locdata[i].longitude}}
            else:
                subres = {toponym: {"latitude": None, "longitude": None}}
            res.update(subres)
        res_list.append(res)

    # Add results to the DataFrame
    df["geoparser_locations"] = locations
    df["geoparser_toponyms"] = toponyms
    df["geoparser_res"] = res_list

    return df

In [89]:

import pandas as pd
import os
import re

def load_df_from_drive(file_pattern):
    """Loads all .pkl files matching a pattern from Google Drive into a DataFrame.

    Args:
        file_pattern: The file pattern to match (e.g., "Europe-Central-Asia_2018-2024_Nov22_").

    Returns:
        A pandas DataFrame containing the concatenated data from all matching files, or None if no files match or an error occurs.
    """
    all_dfs = []
    for filename in os.listdir(data_folder):
      if filename.endswith("final.pkl"):
        continue

      if filename.endswith(".pkl") and re.search(file_pattern, filename):
            filepath = os.path.join(data_folder, filename)
            try:
                with open(filepath, 'rb') as f:
                    df = pickle.load(f)
                    all_dfs.append(df)
                    print(f"Loaded: {filename}")
            except (FileNotFoundError, pickle.UnpicklingError) as e:
                print(f"Error loading {filename}: {e}")
                return None

    if not all_dfs:
        print(f"No files found matching the pattern '{file_pattern}'")
        return None

    combined_df = pd.concat(all_dfs, ignore_index=True)
    return combined_df


import pandas as pd

def generate_geoparser_res(row):
    geoparser_locations = row.get("geoparser_locations", None)
    geoparser_toponyms = row.get("geoparser_toponyms", None)

    # Initialize result dictionary
    res = {}

    # Handle cases where geoparser_toponyms is not valid
    if not isinstance(geoparser_toponyms, (list, tuple)):
        return res

    # Process each toponym and location data
    for i, toponym in enumerate(geoparser_toponyms):
      location_data = geoparser_locations.data[i]

        # Add latitude and longitude if valid
      if location_data and "latitude" in location_data and "longitude" in location_data:
            subres = {toponym: {
                "latitude": location_data["latitude"],
                "longitude": location_data["longitude"]
            }}
            res.update(subres)

    return res



In [None]:
df = load_df_from_drive("Europe-Central-Asia_2018-2024_Nov22_")
df["geoparser_res"] = df.apply(generate_geoparser_res, axis=1)
df = df.drop(columns=["geoparser_locations", "geoparser_toponyms"])
write_df_to_drive(df, "Europe-Central-Asia_2018-2024_Nov22_final.pkl")


df = load_df_from_drive("Europe-Central-Asia_2018-2024_Nov22_")
df["geoparser_res"] = df.apply(generate_geoparser_res, axis=1)
df = df.drop(columns=["geoparser_locations", "geoparser_toponyms"])
write_df_to_drive(df, "Europe-Central-Asia_2018-2024_Nov22_final.pkl")

DataFrame successfully written to '/content/drive/MyDrive/GEO871/Europe-Central-Asia_2018-2024_Nov22_final.pkl'


In [130]:
data = load_files(data_folder)


Successfully loaded 'Europe-Central-Asia_2018-2024_Nov22.csv' into a DataFrame.


In [131]:
for filename, df in data.items():
  batches = split_batches(df, 10000)
  data[filename] = batches

In [132]:
for filename, df in data.items():
  for batch_num, batch_df in df.items():
    if filename == 'Europe-Central-Asia_2018-2024_Nov22.csv' and batch_num <= -1:
      print(f"Skipping '{filename}' batch {batch_num}")
      continue
    result = geoparse_df_notes(batch_df)
    write_filename = filename[:-4] + "_" + str(batch_num) + ".pkl"
    write_df_to_drive(result, write_filename)
    print(f"Successfully wrote '{write_filename}'")


Toponym Recognition...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

AttributeError: 'dict' object has no attribute 'latitude'

In [None]:
strs = ["my first reference is Zurich, in Switzerland", "my second is katwa in Kongo"]

In [None]:
parsed_text = geo.parse(strs)

In [None]:
a = [str(a) for a in parsed_text.toponyms]
b = parsed_text.locations

In [None]:
for r in parsed_text:
  print(r)