# In [1]:
import pandas as pd
import re

def process_toi_text(df, toi_text):
    """Return the rows of ``df`` belonging to one TOI, ordered by text position.

    Rows are selected when their ``TOI`` value contains ``toi_text``
    (``Series.str.contains`` semantics: a regex/substring match, rows with a
    missing ``TOI`` never match).  The selected rows are returned ordered by
    ``Sentence_index`` and, within each sentence, by ``Word_index`` and then
    ``Character_index`` (missing values sort last).  Rows whose
    ``Sentence_index`` is missing are not part of any group and are dropped.

    Args:
        df: DataFrame with at least the columns ``TOI``, ``Sentence_index``,
            ``Word_index`` and ``Character_index``.
        toi_text: Pattern looked up in the ``TOI`` column.

    Returns:
        A new DataFrame with the matching rows in the order described above,
        keeping their original index labels.  An empty DataFrame with the
        same columns is returned when nothing matches or when no matching
        row has a ``Sentence_index``.
    """
    # Filter rows containing the specified TOI_Text
    mask = df['TOI'].str.contains(toi_text, na=False)
    filtered_df = df[mask].copy()

    # pd.concat() raises ValueError on an empty list of frames, so return the
    # (empty, same-columned) selection directly when nothing matched.
    if filtered_df.empty:
        return filtered_df

    # The three partitions below are pre-ordered before being stacked.  The
    # final ordering is imposed by the per-sentence sort further down; the
    # pre-ordering is kept because it can still decide the relative order of
    # rows that tie on every later sort key.

    # Rows without a Character_index, ordered by sentence
    none_char_index = filtered_df[filtered_df['Character_index'].isna()].sort_values('Sentence_index')

    # Rows of the first word that do carry a Character_index, ordered by sentence
    word_index_1 = filtered_df[(filtered_df['Word_index'] == 1) & filtered_df['Character_index'].notna()].sort_values('Sentence_index')

    # Everything else, ordered by sentence and then character position
    remaining = filtered_df[~filtered_df.index.isin(none_char_index.index) & ~filtered_df.index.isin(word_index_1.index)]
    remaining_sorted = remaining.sort_values(['Sentence_index', 'Character_index'])

    # Combine sorted dataframes
    combined_result = pd.concat([none_char_index, word_index_1, remaining_sorted])

    # Group by Sentence_index and sort Word_index within each group
    combined_result = combined_result.sort_values('Sentence_index')
    sorted_groups = [
        group.sort_values(['Word_index', 'Character_index'])
        for _, group in combined_result.groupby('Sentence_index')
    ]

    # groupby() skips rows with a missing Sentence_index; if that leaves no
    # group at all there is nothing to concatenate.
    if not sorted_groups:
        return filtered_df.iloc[0:0]

    return pd.concat(sorted_groups)

def main(input_file, output_file):
    """Reorder the TOI_Text rows of a TSV export and write them as CSV.

    The input is read as a tab-separated UTF-8 file.  Every distinct ``TOI``
    value starting with ``TOI_Text<number>`` is processed with
    ``process_toi_text`` in ascending numeric order, the results are stacked,
    rows whose ``Text_unit_type`` is ``"Sentence"`` are moved to the top, and
    the result is written to ``output_file`` as CSV (UTF-8 with BOM, without
    the index column).

    Args:
        input_file: Path of the TSV file to read.
        output_file: Path of the CSV file to write.

    Raises:
        ValueError: If the ``TOI`` column holds no ``TOI_Text<number>`` value.
    """
    # Read TSV file
    df = pd.read_csv(input_file, sep='\t', encoding='utf-8')

    toi_pattern = re.compile(r'TOI_Text(\d+)')

    def numeric_order(name):
        # Sort by the embedded number so that TOI_Text2 precedes TOI_Text10;
        # the full name breaks ties between names sharing a number.
        return int(toi_pattern.match(str(name)).group(1)), str(name)

    # Process each TOI_Text in numeric order
    toi_texts = sorted(
        (value for value in df['TOI'].unique() if toi_pattern.match(str(value))),
        key=numeric_order,
    )
    if not toi_texts:
        # pd.concat() on an empty list would fail with an unhelpful message.
        raise ValueError(f"No TOI_Text entries found in the 'TOI' column of {input_file!r}")

    # process_toi_text() selects rows by substring match, so "TOI_Text1" would
    # also pick up the rows of "TOI_Text10" and duplicate them in the output.
    # Passing only the rows whose TOI equals the current name avoids that.
    result_dfs = [
        process_toi_text(df[df['TOI'] == toi_text], toi_text)
        for toi_text in toi_texts
    ]

    # Combine all processed dataframes
    final_result = pd.concat(result_dfs)

    # Move rows with "Text_unit_type" == "Sentence" to the top
    is_sentence = final_result['Text_unit_type'] == 'Sentence'
    final_result = pd.concat([final_result[is_sentence], final_result[~is_sentence]])

    # Write to CSV file; utf-8-sig adds a BOM to the output
    final_result.to_csv(output_file, index=False, encoding='utf-8-sig')

if __name__ == "__main__":
    # Raw string: the backslashes of the Windows path must stay literal, and
    # sequences such as "\K" are invalid escapes in a normal string literal.
    input_file = r"E:\実験\実験結果\卒研B/27. 文章42\Kaidhil_文章42.tsv"  # path of the TSV file
    output_file = "Kaidhil_文章42.csv"  # path of the CSV file to write
    main(input_file, output_file)
