In [133]:
import re

import pandas as pd

# Read the crawled records, discard the saved index column, then remove
# columns and rows that contain no data at all.
file_path = 'crawled_start.csv'
df = (
    pd.read_csv(file_path)
    .drop('Unnamed: 0', axis=1)
    .dropna(how='all', axis=1)
    .dropna(how='all', axis=0)
)

# Trim surrounding whitespace in every object-dtype column.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].str.strip()

In [134]:
def match_or_nan(a, b):
    """Element-wise test that `a` and `b` are equal or are both missing.

    A plain `==` is never true for two NaN values, so the "both missing" case
    is checked separately and OR-ed into the result.
    """
    both_missing = pd.isna(a) & pd.isna(b)
    same_value = a == b
    return same_value | both_missing

# A well-formed date is two digits, two digits and four digits separated by '/'.
_DATE_PATTERN = re.compile(r'^\d{2}/\d{2}/\d{4}$')


def _is_malformed_date(value):
    """Return True when `value` is present but is not text matching _DATE_PATTERN.

    Missing values (NaN/None) are not considered malformed. A present value
    that is not a string (for example a number) is reported as malformed
    instead of raising.
    """
    if pd.isna(value):
        return False
    return not (isinstance(value, str) and _DATE_PATTERN.match(value))


def check_issues(row):
    """List the fields of one record that look corrupted.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Emb_date', 'Disemb_date', 'Emb_loc' and 'Disemb_loc'.

    Returns
    -------
    list of str
        Any of 'emb_date', 'disemb_date' (value present but malformed) and
        'emb_loc', 'disemb_loc' (value missing), in that order.
    """
    issues = []
    # The regex is applied directly to the scalar: wrapping each value in a
    # Series and calling the deprecated Series.bool() is unnecessary.
    if _is_malformed_date(row['Emb_date']):
        issues.append('emb_date')
    if _is_malformed_date(row['Disemb_date']):
        issues.append('disemb_date')
    if pd.isna(row['Emb_loc']):
        issues.append('emb_loc')
    if pd.isna(row['Disemb_loc']):
        issues.append('disemb_loc')
    return issues

In [135]:
# Textual layout of the Emb_date / Disemb_date columns (day/month/year).
DATE_FORMAT = '%d/%m/%Y'


def _sort_chronologically(frame):
    """Return a copy of `frame` ordered by Emb_date, then Disemb_date.

    Both columns are parsed with DATE_FORMAT for the comparison only; the
    stored values are left untouched. Comparing the raw 'dd/mm/yyyy' text would
    order rows by day-of-month first (e.g. '01/02/1755' before '09/03/1754').
    Values that cannot be parsed become NaT and sort after every real date.
    """
    return frame.sort_values(
        by=['Emb_date', 'Disemb_date'],
        key=lambda dates: pd.to_datetime(dates, format=DATE_FORMAT, errors='coerce'),
        na_position='last',
    )


# Records whose remark mentions "rembarqué" but that have no embarkation place.
reembark_rows = df[
    df['Remarks'].str.contains('rembarqué', na=False, case=False)
    & df['Emb_loc'].isna()
]

# Give each such record the Disemb_loc of the record that precedes it in the
# same person's chronological history.
for reembark_index, reembark_row in reembark_rows.iterrows():
    # Same 'Last Name', 'First Name' and 'Function'; NaN matches NaN.
    same_person = df[
        match_or_nan(df['Last Name'], reembark_row['Last Name'])
        & match_or_nan(df['First Name'], reembark_row['First Name'])
        & match_or_nan(df['Function'], reembark_row['Function'])
    ]
    voyages = _sort_chronologically(same_person)

    previous_disemb_loc = None
    for voyage_index, voyage in voyages.iterrows():
        if voyage_index == reembark_index:
            # Only copy a real value: NaN is truthy, so it is tested explicitly.
            if pd.notna(previous_disemb_loc) and previous_disemb_loc:
                df.at[reembark_index, 'Emb_loc'] = previous_disemb_loc
            break
        # Remember this record's Disemb_loc for the next one in the sequence.
        previous_disemb_loc = voyage['Disemb_loc']

# Fill a missing Disemb_loc with the Emb_loc of the same person's next record.
for _, person_rows in df.groupby(['Last Name', 'First Name', 'Function'], dropna=False):
    voyages = _sort_chronologically(person_rows)
    # shift(-1) lines each record up with its successor; the last record of a
    # group gets NaN and is therefore never filled.
    next_emb_loc = voyages['Emb_loc'].shift(-1)
    fills = next_emb_loc[voyages['Disemb_loc'].isna() & next_emb_loc.notna()]
    if not fills.empty:
        df.loc[fills.index, 'Disemb_loc'] = fills

In [136]:
# Run check_issues on every record and keep the resulting list of flags per row.
df['issued'] = df.apply(check_issues, axis='columns')

In [137]:
# Disemb_loc values that this notebook treats as invalid even though they are present.
INVALID_DISEMB_LOCS = ['ration', 'mer et mort noyé']

# The 'issued' cells hold list objects, so appending mutates the frame in place.
for issues in df.loc[df['Disemb_loc'].isin(INVALID_DISEMB_LOCS), 'issued']:
    # Skip rows that already carry the flag so re-running the cell does not
    # append duplicates.
    if 'disemb_loc' not in issues:
        issues.append('disemb_loc')

In [138]:
# Save the flagged records (the index is written as the first column).
df.to_csv(path_or_buf='flag_corrupted.csv')

In [122]:
# Display only the records that received at least one flag.
df.loc[df['issued'].apply(lambda flags: len(flags) > 0)]

Unnamed: 0,n,Last Name,First Name,Function,Remarks,Age,Wage,Emb_date,Disemb_date,Disemb_loc,Fate,Left La Corunna by barque,Travel expenses,Emb_loc,emb_class,disemb_class,issued
1,310.0,AGOUSTIN,Colas,matelot lascar,supplément à Chandernagor du 01/02/1755 \n dés...,35.0,14.8,01/02/1755,19/08/1755,ration,supplément à Chandernagor du 01/02/1755 \n dés...,,,Chandernagor,302.0,304.0,[disemb_loc]
10,266.0,AUDRAIN,Pierre,passager clandestin,trouvé caché à bord après le départ de Lorient...,16.0,,[nan],05/09/1754,Pondichéry,trouvé caché à bord après le départ de Lorient...,,,Lorient,304.0,303.0,[emb_date]
48,264.0,BUCK,Lovell,passager clandestin,trouvé caché à bord après le départ de Lorient...,24.0,,[nan],10/09/1754,Pondichéry,,,,Lorient,304.0,302.0,[emb_date]
52,185.0,CALIBRE DIT PIERROT,Pierre,soldat passager,resté à terre malade au départ de Lorient --- ...,,7.1,[nan],[nan],Lorient,resté à terre malade au départ de Lorient --- ...,,,,308.0,308.0,"[emb_date, disemb_date, emb_loc]"
92,254.0,DALINOT,Simon,soldat passager,déserté au départ de Lorient --- soldat allema...,,,[nan],08/09/1756,Lorient,never boarded!,,,,,304.0,"[emb_date, emb_loc]"
93,1.0,DARGY DE LA CHÂTRE,Christophe,capitaine [officier],à fait la campagne de Lorient à La Corogne ---...,,200,[nan],09/10/1756,la Corogne,,,,Lorient,301.0,301.0,[emb_date]
192,311.0,JANY,Ram,matelot lascar,embarqué en supplément à ? le 10/08/1755 \n dé...,,14.8,10/08/1755,26/10/1755,Port-Louis île de France,embarqué en supplément à ? le 10/08/1755 \n dé...,,,,301.0,302.0,[emb_loc]
205,137.0,KERDANIEL,,enseigne passager,resté à terre au départ de Lorient --- passage...,,,09/03/1754,09/03/1754,Lorient,,,,,308.0,308.0,[emb_loc]
210,121.0,L'HELIEVE,Gilles,mousse,embarqué à l'armement \n tombé à la mer et mor...,16.0,7,09/03/1754,31/05/1754,mer et mort noyé,[filled correctly!],,,Lorient,301.0,305.0,[disemb_loc]
225,267.0,LAVIGNE,Jean François Honoré,pilotin passager,embarqué par erreur à l'armement \n débarqué à...,15.0,?,[nan],09/03/1754,Pondichéry,embarqué par erreur à l'armement \n débarqué à...,,,,301.0,302.0,"[emb_date, emb_loc]"
