In [None]:
import pandas as pd
from glob import glob
import os
from typing import List

# Define the read file function
def read_tsv(filename: str, columns_to_extract: List[int], skip_first_line: bool = False, file_encoding: str = "utf8") -> List[List[str]]:
    """Read selected columns from a tab-separated file.

    Args:
        filename: Path of the TSV file to read.
        columns_to_extract: Zero-based indices of the fields to keep from
            each row, in the order they should appear in the result.
        skip_first_line: When True, the first line (typically a header) is
            discarded before any rows are collected.
        file_encoding: Text encoding used to open the file.

    Returns:
        One list of field strings per non-empty line. An empty list is
        returned (after printing a message) when the file does not exist.

    Raises:
        IndexError: If a non-empty line has no field at one of the
            requested indices.
    """
    extracted_data = []

    try:
        with open(filename, "r", encoding=file_encoding) as tsv_file:
            if skip_first_line:
                next(tsv_file, None)

            for line in tsv_file:
                # Remove only the line terminator. A plain strip() would also
                # eat leading/trailing tabs, dropping empty first/last fields
                # and shifting every requested column index.
                line = line.rstrip("\r\n")
                if not line:
                    # Blank lines (e.g. a trailing newline at EOF) are not rows.
                    continue

                line_list = line.split("\t")
                extracted_data.append(
                    [line_list[column_index] for column_index in columns_to_extract]
                )

    except FileNotFoundError:
        # Best-effort contract: report the problem and return no rows.
        print(f"File '{filename}' non-existent.")

    return extracted_data

# Define data processing functions
def filter_sequence(raw):
    """Keep only the records whose first element is an acceptable sequence.

    A record is kept when its first element:
      * is between 10 and 24 items long (inclusive),
      * consists solely of the 20 single-letter codes in the alphabet below,
      * starts with "C" and ends with "F".

    Args:
        raw: Iterable of records; only element 0 of each record is inspected.

    Returns:
        A new list holding the records that passed, in their original order.
    """
    valid_residues = frozenset("ARNDCEQGHILKMFPSTWYV")

    kept = []
    for record in raw:
        sequence = record[0]

        # Length window first, so the index checks below are always safe.
        if not 10 <= len(sequence) <= 24:
            continue
        if any(residue not in valid_residues for residue in sequence):
            continue
        if sequence[0].upper() != "C" or sequence[-1].upper() != "F":
            continue

        kept.append(record)

    return kept


# TODO: fill in both paths before running this script.
source_dir = ""  # Original file path (directory containing the .tsv files)
output = ""  # Save the path to the processed file (output directory)

# Fail early with a clear message instead of a confusing error further down
# when the placeholders above were left empty.
if not source_dir or not output:
    raise ValueError("Set 'source_dir' and 'output' to real paths before running.")

# exist_ok=True: nothing happens when the directory is already present.
os.makedirs(output, exist_ok=True)
    
# Processing data
TOP_N = 100  # keep at most this many of the most abundant sequences per file
MIN_ROWS = 10  # outputs with fewer data rows than this are deleted again

# NOTE(review): not read anywhere in this section; kept in case later code
# relies on it.
info_index = [0, 1]

# os.path.join makes the pattern work whether or not source_dir ends with a
# path separator. The matches are sorted because glob returns them in
# arbitrary order, which would make the P_<n>.tsv numbering unstable.
file_list = sorted(glob(os.path.join(source_dir, "*.tsv")))
for i, f in enumerate(file_list):
    # low_memory=False: parse the whole file at once rather than in chunks,
    # which avoids mixed-type inference within a column.
    df = pd.read_table(f, low_memory=False)
    df = df[['amino_acid', 'frequency']]
    df.columns = ["TCR", "Abundance"]
    df = df.dropna()
    df = df.sort_values(by="Abundance", ascending=False)
    df.reset_index(inplace=True, drop=True)

    raw_file = df.values.tolist()
    processed_file = filter_sequence(raw_file)
    # Final ranking: compare abundances numerically and keep the top entries.
    output_file = sorted(processed_file, key=lambda x: float(x[1]), reverse=True)[:TOP_N]

    output_file_path = os.path.join(output, f"P_{i+1}.tsv")

    with open(output_file_path, "w", encoding="utf8") as output_f:
        output_f.write("TCR\tAbundance\n")
        output_f.writelines(f"{tcr[0]}\t{tcr[1]}\n" for tcr in output_file)

    print("Saved processed file paths: " + output_file_path)

    # Delete non-compliant files. The file holds one header line plus one line
    # per row, so the row count is checked directly instead of re-reading it.
    if len(output_file) < MIN_ROWS:
        os.remove(output_file_path)
        print(f"Deleted file due to insufficient data: {output_file_path}")