In [11]:
import pandas as pd
import numpy as np
import re

file_path = 'PDC Crew Movements - sorted by disemb.csv'
df = pd.read_csv(file_path)
df.rename(columns={'Last Name ': 'Last Name'}, inplace=True)

In [12]:
def extract_date(text):
    # Helper function to extract date in the format dd/mm/yyyy
    match = re.search(r'(\d{2}/\d{2}/\d{4})', text)
    if match:
        return pd.to_datetime(match.group(1), dayfirst=True)
    return None

def extract_disembark_loc(text):
    # Helper function to extract the disembark location from text and clean it up
    match = re.search(r'débarqué[es]? à ([A-Za-zÀ-ÿ\'-]+(?: [A-Za-zÀ-ÿ\'-]+)*)', text, flags=re.IGNORECASE)
    if match:
        place_name = match.group(1)
        # Remove trailing non-place words like "le", "la", "du", etc.
        place_name = re.sub(r'\b(?:le|la|du|en|aux)\b.*$', '', place_name, flags=re.IGNORECASE).strip()
        return place_name
    return None

def split_remarks(remarks):
    # Split remarks based on 'rembarqué' keyword to handle each segment separately
    parts = re.split(r'(rembarqué[es]?)', remarks, flags=re.IGNORECASE)
    segments = [parts[0]]  # First part before 'rembarqué'
    
    for i in range(1, len(parts) - 1, 2):
        segments.append(parts[i] + parts[i + 1])  # Combine 'rembarqué' with the following text
    
    return segments

def process_rembarque(df):
    new_rows = []

    # Filter rows that contain the "rembarqué" keyword in the Remarks column
    rembarque_rows = df[df['Remarks'].str.contains('rembarqué', na=False, case=False)]

    for index, row in rembarque_rows.iterrows():
        segments = split_remarks(row['Remarks'])

        # Process the first segment (before the first rembarqué)
        first_segment = segments[0]
        first_embark_date = row['Emb_date']  # Use original embark date
        first_disembark_date = extract_date(re.search(r'débarqué[es]?.*', first_segment, flags=re.IGNORECASE).group()) if re.search(r'débarqué[es]?.*', first_segment, flags=re.IGNORECASE) else None
        first_disembark_loc = extract_disembark_loc(first_segment)  # Extract disembark location

        original_row = row.copy()
        original_row['Emb_date'] = first_embark_date  # Keep original embark date
        original_row['Disemb_date'] = first_disembark_date if first_disembark_date else row['Disemb_date']
        original_row['Disemb_loc'] = first_disembark_loc if first_disembark_loc else row['Disemb_loc']
        original_row['Remarks'] = first_segment.strip()
        new_rows.append(original_row)

        previous_disembark_date = first_disembark_date

        # Process subsequent segments starting with "rembarqué"
        for i, segment in enumerate(segments[1:], start=1):
            # Handle "rembarqué le dit jour et an"
            if 'rembarqué le dit jour et an' in segment.lower():
                embark_date = previous_disembark_date
            else:
                embark_date = extract_date(re.search(r'rembarqué[es]?.*', segment, flags=re.IGNORECASE).group()) if re.search(r'rembarqué[es]?.*', segment, flags=re.IGNORECASE) else None

            disembark_date = extract_date(re.search(r'débarqué[es]?.*', segment, flags=re.IGNORECASE).group()) if re.search(r'débarqué[es]?.*', segment, flags=re.IGNORECASE) else None
            disembark_loc = extract_disembark_loc(segment)  # Extract disembark location

            new_row = row.copy()
            new_row['Emb_date'] = embark_date
            new_row['Disemb_date'] = disembark_date if disembark_date else row['Disemb_date']
            
            # Only the last segment should keep the original Disemb_loc; others should extract it
            if i == len(segments) - 1:
                new_row['Disemb_loc'] = row['Disemb_loc']
            else:
                new_row['Disemb_loc'] = disembark_loc if disembark_loc else row['Disemb_loc']

            new_row['Remarks'] = segment.strip()
            new_rows.append(new_row)

            # Update the previous disembark date
            previous_disembark_date = disembark_date if disembark_date else previous_disembark_date

    # Append the processed rows back to the original dataframe
    non_rembarque_rows = df[~df.index.isin(rembarque_rows.index)]
    new_df = pd.concat([non_rembarque_rows, pd.DataFrame(new_rows)], ignore_index=True)

    # Sort the dataframe by Last Name and Embark Date
    new_df = new_df.sort_values(by=['Last Name', 'Emb_date'])

    return new_df

# Apply the function and save the result
processed_df = process_rembarque(df)

df = processed_df
df

Unnamed: 0,n,Last Name,First Name,Function,Remarks,Age,Wage,Emb_date,Disemb_date,Disemb_loc,Fate,Left La Corunna by barque,Travel expenses
242,315.0,ABAS,,matelot lascar,supplément à Chandernagor du 01/02/1755 \n mor...,25.0,14.8,,08/09/1756,ration,supplément à Chandernagor du 01/02/1755 \n mor...,,
241,310.0,AGOUSTIN,Colas,matelot lascar,supplément à Chandernagor du 01/02/1755 \n dés...,35.0,14.8,,08/09/1756,ration,supplément à Chandernagor du 01/02/1755 \n dés...,,
265,363.0,AIGNAN DE LA MOTHE,Pierre,pilote passager,embarqué à l'île de France le 26/11/1755 \n dé...,,,26/11/1755,08/09/1756,La Corogne,,,
103,261.0,ALBRONS DIT SANSCHAGRIN,Claude,soldat passager,remplacement à l'armement du 09/03/1754 \n déb...,,7.1,09/03/1754,02/09/1754,pondichéry,,,
337,290.0,ANDERICTZ,Evert,matelot,remplacement à Cajory le 28/02/1755 \n déserté...,,21,28/02/1755,28/07/1756,corogne,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,119.0,,Joseph,valet [domestique],embarqué à l'armement \n débarqué à Chandernag...,25.0,20,09/03/1754,1755-01-22 00:00:00,Chandernagor,,,
295,362.0,,Jasmin,domestique passager,embarqué à l'île de France le 26/11/1755 \n dé...,,,26/11/1755,10/04/1756,la corogne,,,
297,367.0,,Devlia,domestique passager,embarqué à l'île de France le 26/11/1755 \n dé...,,,26/11/1755,12/04/1756,la corogne,,,
349,,,,,,,,,,,,,


In [13]:
df.to_csv('splitted.csv')

In [278]:
def extract_embark_loc(text):
    # Function to extract the embark location from text
    if isinstance(text, str):
        if "fait la campagne de" in text:
            match = re.search(r'\b(?:a|à) fait la campagne de ([\w\s\'-îÎ]+?) à', text, flags=re.IGNORECASE)
            if match:
                return match.group(1).strip()
        match = re.search(r'\bembarqué[es]? à ([\w\s\'-îÎ]+?)(?: le|\n|,| ---|$)', text, flags=re.IGNORECASE)
        if match:
            return match.group(1).strip()
    return None

def extract_disembark_loc(text):
    # Function to extract the disembark location from text
    if isinstance(text, str):
        match = re.search(r'\bdébarqué[es]?.*?à ([\w\s\'-îÎ]+?)(?: le|\n|,| ---|$)', text, flags=re.IGNORECASE)
        if match:
            return match.group(1).strip()
    return None

def process_reembark(df):
    # Step 1: Extract embark location from remarks
    df['emb_loc'] = df['Remarks'].apply(extract_embark_loc)
    
    # Step 2: Extract disembark location from remarks if disembark location is NaN
    df['Disemb_loc'] = df.apply(lambda row: extract_disembark_loc(row['Remarks']) if pd.isna(row['Disemb_loc']) else row['Disemb_loc'], axis=1)

    # Step 3: Convert all dates to a standard string format
    df['Emb_date'] = pd.to_datetime(df['Emb_date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df['Disemb_date'] = pd.to_datetime(df['Disemb_date'], errors='coerce').dt.strftime('%Y-%m-%d')

    # Step 4: Identify rows with the "rembarqué" keyword and where emb_loc is NaN
    reembark_rows = df[(df['Remarks'].str.contains('rembarqué', na=False, case=False)) & (df['emb_loc'].isna())]

    # Step 5: Process each row with the "rembarqué" keyword individually
    for index, row in reembark_rows.iterrows():
        # Find rows with matching 'Last Name', 'First Name', and 'Function', handling NaN values
        matching_rows = df[
            match_or_nan(df['Last Name'], row['Last Name']) &
            match_or_nan(df['First Name'], row['First Name']) &
            match_or_nan(df['Function'], row['Function'])
        ]

        # Sort the matching rows by Emb_date
        sorted_group = matching_rows.sort_values(by=['Emb_date', 'Disemb_date'])

        # Iterate through the sorted group to assign emb_loc based on previous Disemb_loc
        previous_disemb_loc = None
        for i, (sorted_index, sorted_row) in enumerate(sorted_group.iterrows()):
            if sorted_index == index:
                if previous_disemb_loc:
                    df.at[sorted_index, 'emb_loc'] = previous_disemb_loc

            # Update previous_disemb_loc for the next row in the sequence
            previous_disemb_loc = sorted_row['Disemb_loc'] 
            
    # Step 6: Fill disemb_loc based on next emb_loc if disemb_loc is NaN
    for _, group in df.groupby(['Last Name', 'First Name', 'Function'], dropna=False):
        sorted_group = group.sort_values(by=['Emb_date', 'Disemb_date'])
        for i, (index, row) in enumerate(sorted_group.iterrows()):
            if pd.isna(row['Disemb_loc']) and i < len(sorted_group) - 1:
                next_row = sorted_group.iloc[i + 1]
                df.at[index, 'Disemb_loc'] = next_row['emb_loc']

    return df

def match_or_nan(a, b):
    # Function to check if two values are equal or both are NaN
    return (a == b) | (pd.isna(a) & pd.isna(b))

# Example DataFrame (assuming df is the DataFrame containing your data)
# Apply the function to the dataframe
processed_df = process_reembark(df)

  df['Emb_date'] = pd.to_datetime(df['Emb_date'], errors='coerce').dt.strftime('%Y-%m-%d')
  df['Disemb_date'] = pd.to_datetime(df['Disemb_date'], errors='coerce').dt.strftime('%Y-%m-%d')
