In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from dateutil.parser import parse

In [2]:
# Read each source table, drop the listed columns and rename its generic
# `id` column to a table-specific name.
df_r = (
    pd.read_csv('record.csv', low_memory=False)
    .drop(columns=[
        'file_name', 'age', 'build', 'complexion', 'conscription', 'wage',
        'f_n', 'zot_title', 'date_of_birth', 'date_of_death',
        'medical_issues', 'passage_paid_by', 'days_on_voyage', 'earnings',
    ])
    .rename(columns={'id': 'record_id'})
)
df_p = (
    pd.read_csv('person.csv', low_memory=False)
    .drop(columns=[
        'alternate_last_name', 'alternate_first_name', 'parents', 'gender',
        'families_groups',
    ])
    .rename(columns={'id': 'person_id'})
)
df_v = (
    pd.read_csv('voyage.csv', low_memory=False)
    .drop(columns=[
        'date', 'notes', 'route_class', 'fate', 'call_number', 'operations',
        'ship_notes', 'ships_encountered', 'route_class_1',
    ])
    .rename(columns={'id': 'voyage_id'})
)
df_s = (
    pd.read_csv('ship.csv', low_memory=False)
    .drop(columns=['ship_built_date', 'ship_built_place', 'ship_type', 'tonnage', 'guns'])
    .rename(columns={'id': 'ship_id'})
)
df_f = (
    pd.read_csv('functions.csv', low_memory=False)
    .drop(columns=['func_class_1', 'func_class_2'])
    .rename(columns={'id': 'function_id'})
)

# Chain inner joins: records -> persons -> voyages -> ships -> functions.
df1 = df_r.merge(df_p, on='person_index', how='inner')
df2 = df1.merge(df_v, on='voyage_index', how='inner')
df3 = df2.merge(df_s, on='ship_index', how='inner')

# Drop the join-key columns from the fully merged frame.
# NOTE(review): 'record_index' is listed twice and 'person_index' is not
# listed at all -- possibly a typo for 'person_index'; confirm before changing.
df = (
    df3.merge(df_f, on='function_index', how='inner')
    .drop(columns=['record_index', 'record_index', 'ship_index', 'voyage_index', 'function_index'])
)

In [3]:
# Show the first row (by position) of the trimmed record table.
df_r.iloc[0]

record_id                          3
n                                3.0
remarks           a fait la campagne
record_index                       2
emb_date                  21/03/1722
emb_loc                     Lorient 
emb_class                      301.0
disemb_date               16/12/1722
disemb_loc                   Lorient
disemb_class                   301.0
person_index                       2
origin_index                       2
function_index                     2
ship_index                         0
voyage_index                       0
Name: 0, dtype: object

In [11]:
# Render the record table as the cell output.
df_r

Unnamed: 0,record_id,n,remarks,record_index,emb_date,emb_loc,emb_class,disemb_date,disemb_loc,disemb_class,person_index,origin_index,function_index,ship_index,voyage_index
0,3,3.0,a fait la campagne,2,21/03/1722,Lorient,301.0,16/12/1722,Lorient,301.0,2,2,2,0,0
1,4,4.0,a fait la campagne,3,21/03/1722,Lorient,301.0,16/12/1722,Lorient,301.0,3,3,2,0,0
2,6,6.0,a fait la campagne,5,21/03/1722,Lorient,301.0,16/12/1722,Lorient,301.0,5,1,2,0,0
3,7,7.0,a fait la campagne,6,21/03/1722,Lorient,301.0,16/12/1722,Lorient,301.0,6,1,2,0,0
4,8,8.0,a fait la campagne,7,21/03/1722,Lorient,301.0,16/12/1722,Lorient,301.0,7,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218567,91214,735.0,embarqué à l'île de France ou à l'île Bourbon ...,91213,,,,08/02/1757,Lorient,301.0,215658,0,194,500,2695
218568,91216,737.0,embarqué à l'île de France ou à l'île Bourbon ...,91215,,,,08/02/1757,Lorient,301.0,215659,0,606,500,2695
218569,91224,745.0,embarqué à l'île de France ou à l'île Bourbon ...,91223,,,,08/02/1757,Lorient,301.0,215660,0,27,500,2695
218570,91225,746.0,embarqué à l'île de France ou à l'île Bourbon ...,91224,,,,08/02/1757,Lorient,301.0,215661,0,27,500,2695


In [20]:
# Select every record row whose person_index equals 215663.
df_r[df_r['person_index']==215663]

Unnamed: 0,record_id,n,remarks,record_index,emb_date,emb_loc,emb_class,disemb_date,disemb_loc,disemb_class,person_index,origin_index,function_index,ship_index,voyage_index
218565,90548,69.0,a fait la campagne --- habitué à Lorient,90547,28/11/1754,Lorient,301.0,08/02/1757,Lorient,301.0,215663,22,36,500,2695


In [17]:
# Reorder the records by record_id. The index is not reset here, so each row
# keeps its previous label: from this point on a row's label (.loc / .index)
# and its position (.iloc) are no longer guaranteed to match.
df_r = df_r.sort_values(by='record_id')

In [None]:
# Locate person 215663 by *position* rather than by index label. Once df_r has
# been re-sorted without resetting its index, labels no longer line up with
# row positions, so "label + 1" is neither a valid bounds check against
# len(df_r) nor the row that actually follows.
matching_positions = np.flatnonzero((df_r['person_index'] == 215663).to_numpy())

# Index labels of the matching rows (same rows the boolean-mask lookup selects).
current_index = df_r.index[matching_positions]

# If a match exists and it is not the last row, remember the position of the
# row that follows the first match; read it with df_r.iloc[next_row_index].
if matching_positions.size and matching_positions[0] + 1 < len(df_r):
    next_row_index = int(matching_positions[0]) + 1

In [16]:
# Fetch the row at integer position 218565 (iloc is positional, not label-based;
# the row's label appears as `Name` in the output).
df_r.iloc[218565]

record_id                                        218566
n                                                  38.0
remarks           embarqué à l'armement \n débarqué à ?
record_index                                     218565
emb_date                                     18/12/1742
emb_loc                                            Port
emb_class                                         301.0
disemb_date                                  05/01/1743
disemb_loc                                            ?
disemb_class                                       None
person_index                                       2850
origin_index                                        207
function_index                                       82
ship_index                                           14
voyage_index                                        104
Name: 33359, dtype: object

In [19]:
# Index label(s) of the record rows belonging to person_index 215663.
df_r[df_r['person_index']==215663].index

Int64Index([218565], dtype='int64')

In [30]:
def combine_remarks(series):
    """Merge the remarks of one group into a single string.

    Missing entries are discarded, each remaining value is converted to text,
    and the pieces are joined with a newline surrounded by single spaces.
    Returns an empty string when nothing remains.
    """
    kept = series.dropna()
    as_text = kept.astype(str)
    separator = ' \n '
    return separator.join(as_text)

def valid_date(dates):
    """Pick one representative date value from a group of raw date entries.

    Parameters
    ----------
    dates : iterable
        Raw values of a date column for one group (e.g. the Series that
        ``groupby(...).agg`` passes in).

    Returns
    -------
    The first candidate that ``dateutil`` can parse (day-first), returned
    unchanged; otherwise the first candidate even though it did not parse;
    otherwise ``None`` when there is no candidate at all.
    """
    # A candidate is any truthy value whose text contains at least one digit.
    # NaN is dropped here as well, because str(nan) == 'nan' has no digits.
    date_candidates = [date for date in dates if date and any(char.isdigit() for char in str(date))]
    for date in date_candidates:
        try:
            # Parse the text form so a non-string candidate cannot raise on type alone.
            parse(str(date), dayfirst=True)
        except (ValueError, OverflowError, TypeError):
            # dateutil raises OverflowError for out-of-range numbers in addition
            # to ValueError; one malformed cell must not abort the aggregation.
            continue
        return date  # first successfully parsed candidate
    # Nothing parsed: fall back to the first date-like value, if any.
    return date_candidates[0] if date_candidates else None

def resolve_location(locations):
    """Return the first usable location from a group, or ``None``.

    A value is skipped when it is missing (``None`` / NaN), falsy, or is blank
    or the placeholder ``'?'`` once surrounding whitespace is ignored. The
    chosen value is returned exactly as stored (no stripping).

    Parameters
    ----------
    locations : iterable
        Raw values of a location column for one group.
    """
    for loc in locations:
        # NaN is truthy and != '?', so it must be rejected explicitly;
        # otherwise a missing value could win over a real location later on.
        if pd.isna(loc) or not loc:
            continue
        # Compare on the stripped text so padded placeholders such as '? ' are
        # also treated as unknown.
        if str(loc).strip() in ('', '?'):
            continue
        return loc
    return None

# Column -> reducer mapping: remarks are concatenated, dates go through
# valid_date, locations go through resolve_location.
aggregations = {'remarks': combine_remarks}
aggregations.update({col: valid_date for col in ('emb_date', 'disemb_date')})
aggregations.update({col: resolve_location for col in ('emb_loc', 'disemb_loc')})

# Collapse rows sharing the same person, voyage, ship and n into one row each,
# keeping the grouping keys as ordinary columns.
result = (
    df_r
    .groupby(['person_index', 'voyage_index', 'ship_index', 'n'], as_index=False)
    .agg(aggregations)
)

In [31]:
# Render the aggregated table as the cell output.
result

Unnamed: 0,person_index,voyage_index,ship_index,n,remarks,emb_date,disemb_date,emb_loc,disemb_loc
0,0,0,0,1.0,embarqué à l'armement \n débarqué à ? \n a fai...,21/03/1722,16/12/1722,Lorient,Lorient
1,1,0,0,2.0,a fait la campagne \n embarqué à l'armement \n...,21/03/1722,16/12/1722,Lorient,Lorient
2,2,0,0,3.0,a fait la campagne \n embarqué à l'armement \n...,21/03/1722,16/12/1722,Lorient,Lorient
3,3,0,0,4.0,a fait la campagne \n embarqué à l'armement \n...,21/03/1722,16/12/1722,Lorient,Lorient
4,4,0,0,5.0,embarqué à l'armement \n débarqué à ? \n a fai...,21/03/1722,16/12/1722,Lorient,Lorient
...,...,...,...,...,...,...,...,...,...
218556,215659,2695,500,737.0,embarqué à l'île de France ou à l'île Bourbon ...,,08/02/1757,,Lorient
218557,215660,2695,500,745.0,embarqué à l'île de France ou à l'île Bourbon ...,,08/02/1757,,Lorient
218558,215661,2695,500,746.0,embarqué à l'île de France ou à l'île Bourbon ...,,08/02/1757,,Lorient
218559,215662,2695,500,749.0,embarqué à l'île de France ou à l'île Bourbon ...,,08/02/1757,,Lorient


In [39]:
# First aggregated row for person_index 1.
result[result['person_index']==1].iloc[0]

person_index                                                    1
voyage_index                                                    0
ship_index                                                      0
n                                                             2.0
remarks         a fait la campagne \n embarqué à l'armement \n...
emb_date                                               21/03/1722
disemb_date                                            16/12/1722
emb_loc                                                  Lorient 
disemb_loc                                                Lorient
Name: 1, dtype: object

In [38]:
# Second raw record row (position 1) for person_index 1.
df_r[df_r['person_index']==1].iloc[1]

record_id                                        215666
n                                                   2.0
remarks           embarqué à l'armement \n débarqué à ?
record_index                                     215665
emb_date                                     21/03/1722
emb_loc                                        Lorient 
emb_class                                         301.0
disemb_date                                  16/12/1722
disemb_loc                                            ?
disemb_class                                       None
person_index                                          1
origin_index                                          1
function_index                                        1
ship_index                                            0
voyage_index                                          0
Name: 494, dtype: object

In [None]:
temp = list(set(df_r['person_index']))
for i in temp:
    df = df_r[df_r['person_index']==i]
    if len(df) == 2:
        if df.iloc[0]['n'] == df.iloc[1]['n'] and df.iloc[0]['ship_index'] == df.iloc[1]['ship_index'] and df.iloc[0]['voyage_index'] == df.iloc[1]['voyage_index']: