<a href="https://colab.research.google.com/github/segmue/GIR_Project/blob/main/Irchel_Geoparser_ACLED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
import pandas as pd
import os
import pickle

# Mount Google Drive into the Colab filesystem; force_remount=True requests a
# fresh mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

# Drive folder that holds the input CSV files (the pickled results are written
# to the same folder). Replace with the actual path.
data_folder = '/content/drive/MyDrive/GEO871'

def load_files(data_folder):
    """Load every CSV file in a folder into a pandas DataFrame.

    Files are visited in sorted name order so that the returned mapping, and
    any processing that iterates over it, is reproducible between runs
    (``os.listdir`` on its own returns entries in arbitrary order).

    Args:
        data_folder: Path of the directory to scan (not recursive).

    Returns:
        A dict mapping each file name ending in ".csv" to its DataFrame.
        Files that fail to load are reported on stdout and left out.
    """
    dataframes = {}
    for filename in sorted(os.listdir(data_folder)):
        if not filename.endswith(".csv"):
            continue
        filepath = os.path.join(data_folder, filename)
        try:
            dataframes[filename] = pd.read_csv(filepath)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Error loading '{filename}': {e}")
        else:
            print(f"Successfully loaded '{filename}' into a DataFrame.")
    return dataframes


def split_batches(df, batchsize):
    """Split a DataFrame into consecutive row batches.

    Args:
        df: The DataFrame to split.
        batchsize: Maximum number of rows per batch; must be positive.

    Returns:
        A dict mapping batch index (0, 1, ...) to the ``iloc`` slice of ``df``
        holding that batch's rows. The last batch may be shorter than
        ``batchsize``. An empty DataFrame yields an empty dict.

    Raises:
        ValueError: If ``batchsize`` is zero or negative.
    """
    if batchsize <= 0:
        raise ValueError(f"batchsize must be positive, got {batchsize}")
    total_rows = len(df)
    # iloc slicing clamps at the end of the frame, so the final (possibly
    # short) batch needs no explicit min() on its end index.
    return {
        batch_index: df.iloc[start:start + batchsize]
        for batch_index, start in enumerate(range(0, total_rows, batchsize))
    }

def write_df_to_drive(df, filename):
    """Pickle a DataFrame into the Google Drive data folder.

    The target path is the module-level ``data_folder`` joined with
    ``filename``. Failures are reported on stdout rather than raised, so the
    caller gets no signal about whether the write succeeded.

    Args:
        df: The DataFrame to serialize.
        filename: Name of the pickle file to create (including ".pkl").
    """
    target_path = os.path.join(data_folder, filename)
    try:
        with open(target_path, 'wb') as out_file:
            pickle.dump(df, out_file)
        print(f"DataFrame successfully written to '{target_path}'")
    except Exception as err:
        print(f"Error writing DataFrame to '{target_path}': {err}")

Mounted at /content/drive


In [None]:
%%capture
# Install the pinned geoparser release, then download the resources that the
# parser is later configured with: the spaCy model "en_core_web_sm" and the
# "geonames" gazetteer. %%capture keeps the installer output out of the cell.
!pip install geoparser==0.1.8
!python -m spacy download en_core_web_sm
!python -m geoparser download geonames

In [None]:

from geoparser import Geoparser
# Shared parser instance; geoparse_df_notes reads it as a module-level global.
geo = Geoparser(
    spacy_model='en_core_web_sm',
    transformer_model='dguzh/geo-all-MiniLM-L6-v2',
    gazetteer='geonames',
)

def geoparse_df_notes(df):
    """Geoparse the "notes" column of a DataFrame and attach the results.

    All notes are handed to the module-level ``geo`` parser in a single call.
    Three list-valued columns are produced, one list entry per toponym found
    in the row's note:

    * ``geoparser_locations``: the ``location`` attribute of each toponym.
    * ``geoparser_toponyms``: ``str()`` of each toponym.
    * ``geoparser_res``: a ``{toponym: {"latitude": ..., "longitude": ...}}``
      dict per toponym; coordinates are ``None`` when the toponym has no
      location or the location does not provide them.

    Args:
        df: DataFrame with a "notes" column.

    Returns:
        A copy of ``df`` with the three columns added. The input frame is not
        modified, so passing an ``iloc`` slice of a larger frame neither
        triggers pandas' SettingWithCopyWarning nor alters the caller's data.
    """
    # A missing note is read from CSV as NaN (a float), not text. Substitute
    # an empty string so the parser only ever receives str values.
    notes = ["" if pd.isna(note) else str(note) for note in df["notes"]]

    # Parse the whole column in one call rather than row by row.
    parsed_rows = geo.parse(notes)

    toponyms_col = []
    locations_col = []
    res_list_col = []

    for parsed_row in parsed_rows:
        toponyms = []
        locations = []
        res_list = []

        for toponym in parsed_row.toponyms:
            name = str(toponym)
            location = toponym.location
            toponyms.append(name)
            locations.append(location)

            try:
                latitude = location.get("latitude") if location else None
                longitude = location.get("longitude") if location else None
            except AttributeError:
                # Location object without a dict-style .get(): no coordinates.
                latitude = None
                longitude = None

            res_list.append(
                {name: {"latitude": latitude, "longitude": longitude}}
            )

        toponyms_col.append(toponyms)
        locations_col.append(locations)
        res_list_col.append(res_list)

    result = df.copy()
    result["geoparser_locations"] = locations_col
    result["geoparser_toponyms"] = toponyms_col
    result["geoparser_res"] = res_list_col
    return result

In [None]:
# Read every CSV in the Drive data folder; `data` maps file name -> DataFrame.
data = load_files(data_folder)


In [None]:
# Rows per batch; each batch is geoparsed and pickled on its own.
BATCH_SIZE = 10000

# Replace every loaded DataFrame in `data` with its dict of row batches
# (batch index -> DataFrame slice). Entries that are no longer DataFrames have
# already been batched and are left alone, so re-running this cell is safe.
for filename, df in data.items():
    if isinstance(df, pd.DataFrame):
        data[filename] = split_batches(df, BATCH_SIZE)

In [None]:
# Geoparse every batch and pickle the result to Drive as
# "<csv name without its 4-character extension>_<batch index>.pkl".
for filename, batches in data.items():
    for batch_num, batch_df in batches.items():
        # This file is excluded from the run. The commented-out tail of the
        # condition would restrict the skip to batches up to index 30.
        if filename == 'Europe-Central-Asia_2018-2024_Nov22.csv':  # and int(batch_num) <= 30:
            print(f"Skipping '{filename}' batch {batch_num}")
            continue
        write_filename = f"{filename[:-4]}_{batch_num}.pkl"
        write_df_to_drive(geoparse_df_notes(batch_df), write_filename)
        # NOTE(review): printed unconditionally; write_df_to_drive reports its
        # own errors and does not signal failure to this loop.
        print(f"Successfully wrote '{write_filename}'")
