In [1]:
import pandas as pd
from glob import glob
import os
from typing import List

# Read file
def read_tsv(filename: str, columns_to_extract: List[int], skip_first_line: bool = False, file_encoding: str = "utf8") -> List[List[str]]:
    extracted_data = []

    try:
        with open(filename, "r", encoding=file_encoding) as tsv_file:
            if skip_first_line:
                tsv_file.readline()

            line = tsv_file.readline()

            while line:
                line_list = line.strip().split("\t")
                extracted_row = [line_list[column_index] for column_index in columns_to_extract]
                extracted_data.append(extracted_row)
                line = tsv_file.readline()

    except FileNotFoundError:
        print(f"file '{filename}' non-existent.")

    return extracted_data

# Processing data
def filter_sequence(raw):
    aa_set = set(["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"])

    result = [seq for seq in raw if (
        10 <= len(seq[0]) <= 24 and
        all(aa in aa_set for aa in seq[0]) and
        seq[0][0].upper() == "C" and
        seq[0][-1].upper() == "F"
    )]

    return result

source_dir = "../1/"
output =  "../Data/RA/"
if not os.path.exists(output):
    os.makedirs(output)

file_list = glob(source_dir + "*.tsv")
for i, f in enumerate(file_list):
    df = pd.read_table(f)
    df = df[['amino_acid', 'frequency']]
    df.columns = ["TCR", "Abundance"]
    df = df.dropna()
    df = df.sort_values(by="Abundance", ascending=False).iloc[:, :]
    df.reset_index(inplace=True, drop=True)
    info_index = [0, 1]
    raw_file = df.values.tolist()
    processed_file = filter_sequence(raw_file)
    output_file = sorted(processed_file, key=lambda x: float(x[1]), reverse=True)[:100]
    output_file_path = os.path.join(output, f"P_{i+1}.tsv")
    with open(output_file_path, "w", encoding="utf8") as output_f:
        output_f.write("TCR\tAbundance\n")
        for tcr in output_file:
            output_f.write("{0}\t{1}\n".format(tcr[0], tcr[1]))
    print("Saved processed documents: " + output_file_path)


Saved processed documents: ../Data/RA/P_1.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_2.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_3.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_4.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_5.tsv
Saved processed documents: ../Data/RA/P_6.tsv
Saved processed documents: ../Data/RA/P_7.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_8.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_9.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_10.tsv
Saved processed documents: ../Data/RA/P_11.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_12.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_13.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_14.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_15.tsv
Saved processed documents: ../Data/RA/P_16.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_17.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_18.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_19.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_20.tsv
Saved processed documents: ../Data/RA/P_21.tsv
Saved processed documents: ../Data/RA/P_22.tsv
Saved processed documents: ../Data/RA/P_23.tsv
Saved processed documents: ../Data/RA/P_24.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_25.tsv
Saved processed documents: ../Data/RA/P_26.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_27.tsv
Saved processed documents: ../Data/RA/P_28.tsv
Saved processed documents: ../Data/RA/P_29.tsv
Saved processed documents: ../Data/RA/P_30.tsv
Saved processed documents: ../Data/RA/P_31.tsv
Saved processed documents: ../Data/RA/P_32.tsv
Saved processed documents: ../Data/RA/P_33.tsv
Saved processed documents: ../Data/RA/P_34.tsv
Saved processed documents: ../Data/RA/P_35.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_36.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_37.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_38.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_39.tsv
Saved processed documents: ../Data/RA/P_40.tsv
Saved processed documents: ../Data/RA/P_41.tsv
Saved processed documents: ../Data/RA/P_42.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_43.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_44.tsv
Saved processed documents: ../Data/RA/P_45.tsv
Saved processed documents: ../Data/RA/P_46.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_47.tsv
Saved processed documents: ../Data/RA/P_48.tsv
Saved processed documents: ../Data/RA/P_49.tsv
Saved processed documents: ../Data/RA/P_50.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_51.tsv
Saved processed documents: ../Data/RA/P_52.tsv
Saved processed documents: ../Data/RA/P_53.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_54.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_55.tsv
Saved processed documents: ../Data/RA/P_56.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_57.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_58.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_59.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_60.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_61.tsv
Saved processed documents: ../Data/RA/P_62.tsv
Saved processed documents: ../Data/RA/P_63.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_64.tsv
Saved processed documents: ../Data/RA/P_65.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_66.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_67.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_68.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_69.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_70.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_71.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_72.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_73.tsv
Saved processed documents: ../Data/RA/P_74.tsv
Saved processed documents: ../Data/RA/P_75.tsv
Saved processed documents: ../Data/RA/P_76.tsv
Saved processed documents: ../Data/RA/P_77.tsv
Saved processed documents: ../Data/RA/P_78.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_79.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_80.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_81.tsv
Saved processed documents: ../Data/RA/P_82.tsv
Saved processed documents: ../Data/RA/P_83.tsv
Saved processed documents: ../Data/RA/P_84.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_85.tsv
Saved processed documents: ../Data/RA/P_86.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_87.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_88.tsv
Saved processed documents: ../Data/RA/P_89.tsv
Saved processed documents: ../Data/RA/P_90.tsv


  df = pd.read_table(f)


Saved processed documents: ../Data/RA/P_91.tsv
Saved processed documents: ../Data/RA/P_92.tsv
Saved processed documents: ../Data/RA/P_93.tsv
Saved processed documents: ../Data/RA/P_94.tsv
