In [None]:
from Bio import SeqIO
import pandas as pd
from tqdm import tqdm
import os
import shutil
import numpy as np

In [None]:
# Parse a pandas df: select columns, turn start/end into window coordinates, add pos, and drop variants whose start is less than frame/2 - 1 bp from the chromosome start
# drop_nan_strategy = "any"  - drops rows that have NaN in either allele column. This is for Zero Shot
# drop_nan_strategy = "both" - drops rows only if both allele columns are NaN; this is for Fine Tuning
def prepare_df_for_PlantCad_zeroshot(inp_df, ref_col, alt_col, frame=512, drop_nan_strategy="any"):
    """Build window coordinates of width `frame` around each variant.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Must contain the columns "chr", "start", "end", `ref_col` and `alt_col`.
        The input frame is not modified.
    ref_col, alt_col : str
        Names of the columns holding the reference / alternative allele.
    frame : int, default 512
        Window width. The window starts `frame // 2 - 1` before the variant
        start and ends `frame // 2 + 1` after it.
    drop_nan_strategy : {"any", "both"}, default "any"
        "any" drops a row when either allele is NaN; "both" drops a row only
        when both alleles are NaN.

    Returns
    -------
    pandas.DataFrame
        Columns "chr", "start", "end", "pos", "ref", "alt", keeping the index
        labels of the surviving input rows. "pos" is the input start + 1.

    Raises
    ------
    ValueError
        If `drop_nan_strategy` is neither "any" nor "both".
    """
    variants = inp_df.copy()[["chr", "start", "end", ref_col, alt_col]]

    # Missing-allele handling.
    if drop_nan_strategy == "both":
        has_some_allele = variants[ref_col].notna() | variants[alt_col].notna()
        variants = variants[has_some_allele]
    elif drop_nan_strategy == "any":
        variants = variants.dropna(subset=[ref_col, alt_col], how="any")
    else:
        raise ValueError("Invalid drop_nan_strategy. Choose 'any' or 'both'.")

    # Variants too close to the chromosome start cannot get a full left flank.
    left_flank = frame // 2 - 1
    variants = variants[variants["start"] >= left_flank]

    variant_start = variants["start"]
    window_columns = {
        "chr": variants["chr"],
        "start": variant_start - left_flank,
        "end": variant_start + 1 + (frame // 2),
        "pos": variant_start + 1,
        "ref": variants[ref_col],
        "alt": variants[alt_col],
    }
    return pd.DataFrame(window_columns)


In [None]:
def add_seqs_to_df_input_coordinates(genome_fasta, df_zero_shot_input_coordinates):
    """Attach the genomic sequence of each window as a new "sequences" column.

    Parameters
    ----------
    genome_fasta : str
        Path to the genome FASTA file; record ids are matched against "chr".
    df_zero_shot_input_coordinates : pandas.DataFrame
        Must contain "chr", "start" and "end". The slice taken is
        sequence[start:end] (Python slicing). The input frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with an upper-cased "sequences" column. Rows whose
        chromosome is absent from the FASTA, or whose window falls outside the
        chromosome (start < 0 or end > length), are dropped.
    """
    # Read the whole genome into memory: record id -> sequence
    genome = {record.id: record.seq for record in SeqIO.parse(genome_fasta, "fasta")}

    def extract_sequence(row):
        chrom_seq = genome.get(row['chr'])
        if chrom_seq is None:
            # Chromosome not present in the FASTA
            return None
        start = row['start']
        end = row['end']
        # A negative start would be interpreted as "count from the end" by
        # Python slicing and silently yield a wrong (usually empty) sequence.
        if start < 0 or end > len(chrom_seq):
            return None
        return str(chrom_seq[start:end]).upper()

    # Work on a copy so the caller's DataFrame does not gain a column as a side effect
    result = df_zero_shot_input_coordinates.copy()

    if result.empty:
        # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
        # assigned to a single column; add the empty column directly instead.
        result['sequences'] = pd.Series(dtype=object, index=result.index)
        return result

    # Add progress bar to the DataFrame processing
    tqdm.pandas(desc="Adding sequences")
    result['sequences'] = result.progress_apply(extract_sequence, axis=1)

    # Drop rows for which no sequence could be extracted
    return result.dropna(subset=['sequences'])

In [None]:
# change 255 to ref state
def correct_sequence_ref(inp_df, frame=512):
    """Overwrite the variant base of every sequence with the "ref" allele.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Must contain the columns "sequences" and "ref". Not modified.
    frame : int, default 512
        Window width the sequences were extracted with. The variant sits at
        index `frame // 2 - 1` of the window (255 for the default frame), which
        is the offset used by prepare_df_for_PlantCad_zeroshot.

    Returns
    -------
    pandas.DataFrame
        A copy of `inp_df` with the corrected "sequences" column.
    """
    df = inp_df.copy()
    if df.empty:
        # Nothing to correct; apply(axis=1) on an empty frame would not return a Series.
        return df

    variant_index = frame // 2 - 1

    def modify_sequence(row):
        # Replace the single character at the variant position with the ref allele
        sequence = row["sequences"]
        return sequence[:variant_index] + row["ref"] + sequence[variant_index + 1:]

    tqdm.pandas(desc="Correcting sequences")
    df["sequences"] = df.progress_apply(modify_sequence, axis=1)
    return df

In [None]:
# add column "label". ref = 0, alt = 1
def add_label_column(df):
    """Expand the variants into labelled rows (ref = 0, alt = 1).

    Rows with only an alt allele get label 1, rows with only a ref allele get
    label 0, and rows with both alleles are emitted twice (once per label).
    Rows where both alleles are NaN do not appear in the result.

    The output is ordered alt-only, ref-only, both (label 1), both (label 0)
    and carries a fresh 0..n-1 index. The input frame is left untouched.
    """
    has_ref = df["ref"].notna()
    has_alt = df["alt"].notna()

    # Rows carrying both alleles contribute one example per label.
    both_alleles = df[has_alt & has_ref]

    labelled_parts = [
        df[has_alt & ~has_ref].assign(label=1),   # alt only
        df[has_ref & ~has_alt].assign(label=0),   # ref only
        both_alleles.assign(label=1),
        both_alleles.assign(label=0),
    ]
    return pd.concat(labelled_parts, ignore_index=True)

In [None]:
def correct_sequence(df_fine_tune_input, frame=512):
    """Write the allele selected by "label" into the variant position of each sequence.

    Parameters
    ----------
    df_fine_tune_input : pandas.DataFrame
        Must contain "sequences", "label", "ref" and "alt". Not modified.
    frame : int, default 512
        Window width the sequences were extracted with. The variant sits at
        index `frame // 2 - 1` of the window (255 for the default frame), which
        is the offset used by prepare_df_for_PlantCad_zeroshot.

    Returns
    -------
    pandas.DataFrame
        A copy of the input in which each sequence carries the "alt" allele
        when label == 1 and the "ref" allele otherwise.
    """
    df = df_fine_tune_input.copy()
    if df.empty:
        # Nothing to correct; apply(axis=1) on an empty frame would not return a Series.
        return df

    variant_index = frame // 2 - 1

    def modify_sequence(row):
        # Pick the allele according to the label, then splice it into the sequence
        char_to_replace = row["alt"] if row["label"] == 1 else row["ref"]
        sequence = row["sequences"]
        return sequence[:variant_index] + char_to_replace + sequence[variant_index + 1:]

    # Apply row by row with a progress bar
    tqdm.pandas(desc="Correcting sequences")
    df["sequences"] = df.progress_apply(modify_sequence, axis=1)
    return df

# Create files

In [None]:
# Prepare the fine-tuning dataset from population data (run on real data to find deleterious mutations).
# Reads a gzipped, tab-separated allele table and the genome FASTA, and writes a TSV with the
# columns chrom, pos, label, sequences. A one-line description is appended to the README log.
allele_dataset = '/home/labs/alevy/omerbar/validations/AT/vcfs/AT_clean_vcf_chr_Chr1.bed.gz'
genome_fasta = "/home/labs/alevy/omerbar/backups/TAIR/A_thaliana.fa"
output_file = "/home/labs/alevy/petrzhu/AI_workshop/PlantCaduceus/datasets/TAIR/TAIR10_FT_pop_data_ref_alt.txt"
# NOTE(review): the file name is spelled "REAMDE.txt" (also in the split cell) - confirm whether "README.txt" was intended.
README = "/home/labs/alevy/petrzhu/AI_workshop/PlantCaduceus/datasets/TAIR/REAMDE.txt"

ref_col = 'ref_allele' #label = 0
alt_col = 'alt_allele' #label = 1
    
# Use zero shot format to be changed next
print('dataset loading')
# The listed na_values are parsed as NaN so that missing alleles can be detected downstream.
df_allele_dataset = pd.read_csv(allele_dataset, sep="\t", header=0, compression='gzip', na_values=["NA", "null", ".", "-", "n/a", "N/A", "NaN"])
# The header names the chromosome column '#chr'; the helper functions expect 'chr'.
df_allele_dataset = df_allele_dataset.rename(columns={'#chr': 'chr'})

print('change coordinates')
# "both": keep a variant as long as at least one of its two alleles is present.
df_zero_shot_input_coordinates = prepare_df_for_PlantCad_zeroshot(df_allele_dataset, ref_col, alt_col, drop_nan_strategy="both") 
print('adding seqs')
df_zero_shot_input = add_seqs_to_df_input_coordinates(genome_fasta, df_zero_shot_input_coordinates)
# change zero shot to fine tune format
print('adding labels')
df_fine_tune_input = add_label_column(df_zero_shot_input)
print('correct sequences')
df_fine_tune_input = correct_sequence(df_fine_tune_input)
# Keep only the columns needed for fine-tuning and rename chr -> chrom.
df_fine_tune_input = df_fine_tune_input.drop(columns=["start", "end", "ref", "alt"])
df_fine_tune_input.rename(columns={"chr": "chrom"}, inplace=True)
print('inputing labels')
# Fix the output column order.
df_fine_tune_input = df_fine_tune_input[["chrom", "pos", "label", "sequences"]]
print('saving file')
df_fine_tune_input.to_csv(output_file, index=False, sep="\t")

# Append a provenance line for this output file to the README log (IPython shell escape).
readme_text = f"{output_file}: ref_col = {ref_col}, alt_col = {alt_col}. ref_allele = label 0, alt_allele = label 1.\n"
! echo "{readme_text}" >> {README}

In [None]:
## Split train, val, test

In [None]:
from sklearn.model_selection import train_test_split

# Dataset to split. NOTE(review): this is not the file written by the cell above
# (TAIR10_FT_pop_data_ref_alt.txt) - presumably produced by another run; confirm.
input_txt = "/home/labs/alevy/petrzhu/AI_workshop/PlantCaduceus/datasets/TAIR/TAIR10_FT_neutral_vs_simulated.txt"
input_df= pd.read_csv(input_txt, sep="\t", header=0, na_values=["NA", "null", ".", "-", "n/a", "N/A", "NaN"]) # , compression='gzip'
# Chr1 is held out entirely as the test set; all other chromosomes go to train/validation.
test_df = input_df[input_df['chrom'] == 'Chr1']
not_test_df = input_df[input_df['chrom'] != 'Chr1']


In [None]:
# Split by chromosome ranges: cut each chromosome into parts (e.g. 10) and assign the parts to train and val randomly.
# This avoids plain random shuffling of rows, because neighbouring rows may have overlapping regions.

# Each chromosome is cut into fragments (a fragment may hold a different number of alleles - open question: how appropriate is that?)
n_segments = 100

def assign_segments(group, n_segments):
    """Label every row of one chromosome with the positional segment it falls in.

    The range [min(pos), max(pos)] is cut into `n_segments` intervals of equal
    width (the last one is extended by 1 so that max(pos) is included) and each
    row receives the 1-based number of its interval in a new "segment" column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single chromosome; must contain a "pos" column. Not modified.
    n_segments : int
        Number of segments, at least 1.

    Returns
    -------
    pandas.DataFrame
        A copy of `group` with an ordered categorical "segment" column whose
        categories are 1..n_segments.

    Raises
    ------
    ValueError
        If `n_segments` is smaller than 1.
    """
    if n_segments < 1:
        raise ValueError("n_segments must be at least 1")

    # Work on a copy: writing into the frame handed over by groupby would
    # mutate the caller's data.
    result = group.copy()
    labels = list(range(1, n_segments + 1))  # segment numbers

    min_pos = result['pos'].min()
    max_pos = result['pos'].max()

    if not (max_pos > min_pos):
        # All positions identical (or no rows): the bin edges would not be
        # increasing and pd.cut would raise, so put everything in segment 1.
        result['segment'] = pd.Categorical(
            [labels[0]] * len(result), categories=labels, ordered=True
        )
        return result

    segment_length = (max_pos - min_pos) / n_segments

    # Left edges of the intervals, plus the closing right edge
    bins = [min_pos + i * segment_length for i in range(n_segments)]
    bins.append(max_pos + 1)

    # include_lowest=True keeps min_pos itself inside the first interval
    result['segment'] = pd.cut(result['pos'], bins=bins, labels=labels, include_lowest=True)
    return result

# Assign segments one chromosome at a time. An explicit loop over the groups is used
# instead of groupby(...).apply(...): apply() operating on the grouping column is
# deprecated since pandas 2.2, and 'chrom' must stay in the result for the split below.
# sort_index() puts the rows back into the index order of not_test_df.
not_test_df_seg = pd.concat(
    [assign_segments(chrom_rows, n_segments=n_segments) for _, chrom_rows in not_test_df.groupby('chrom')]
).sort_index()

# not_test_df_seg['segment'].value_counts()


In [None]:
# Split the data: 80% train, 20% validation

#train_df, val_df = train_test_split(not_test_df, test_size=0.2, random_state=42)

from sklearn.model_selection import train_test_split

# Split the data into train and test within each chromosome
def split_data_by_chrom_and_segment(df, test_size=0.2, random_state=None):
    """Split rows into two sets by whole segments, separately for each chromosome.

    For every chromosome its distinct segments are divided with
    sklearn's train_test_split; all rows of a segment end up on the same side.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "chrom" and "segment".
    test_size : float, default 0.2
        Passed to train_test_split (share of segments that go to the second set).
    random_state : int or None, default None
        Seed for a reproducible split. With None a per-chromosome seed is drawn
        from NumPy's global random state, so the result is only reproducible
        if that global state was seeded by the caller.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The train rows and the test rows, each with a fresh 0..n-1 index.
    """
    # One seed per chromosome: from a dedicated generator when a seed is given,
    # otherwise from the global NumPy state.
    if random_state is None:
        draw_seed = np.random.randint
    else:
        draw_seed = np.random.RandomState(random_state).randint

    train_list = []
    test_list = []

    for _, group in df.groupby('chrom'):
        # Distinct segments of the current chromosome
        segments = group[['chrom', 'segment']].drop_duplicates()

        # Divide the segments (not the rows) between the two sets
        train_segments, test_segments = train_test_split(
            segments,
            test_size=test_size,
            random_state=draw_seed(0, 10000)
        )

        # Select the rows that belong to the chosen segments
        train_list.append(group.merge(train_segments, on=['chrom', 'segment']))
        test_list.append(group.merge(test_segments, on=['chrom', 'segment']))

    # Combine the per-chromosome results
    train = pd.concat(train_list, ignore_index=True)
    test = pd.concat(test_list, ignore_index=True)

    return train, test

# Apply the function: segments of each non-test chromosome are divided between train and validation
train_df, val_df = split_data_by_chrom_and_segment(not_test_df_seg)
#segments_by_chrom = val_df.groupby('chrom')['segment'].unique()
#segments_by_chrom - inspect how the segments were split
# Drop the helper column segment
train_df = train_df.drop(columns=['segment'])
val_df = val_df.drop(columns=['segment'])


In [None]:
# Write the three splits next to the input file, with suffixes _test / _train / _val.
output_test = input_txt.replace(".txt", "_test.txt")
test_df.to_csv(output_test, index=False, sep="\t")

output_train = input_txt.replace(".txt", "_train.txt")
train_df.to_csv(output_train, index=False, sep="\t")

output_val = input_txt.replace(".txt", "_val.txt")
val_df.to_csv(output_val, index=False, sep="\t")

# NOTE(review): the file name is spelled "REAMDE.txt" - confirm whether "README.txt" was intended.
README = "/home/labs/alevy/petrzhu/AI_workshop/PlantCaduceus/datasets/TAIR/REAMDE.txt"

# Append a description of the split to the README log (IPython shell escape).
readme_text = f"*****split******\n dataset {input_txt} was splitted in test (Chr1), train and val (Chr2345, 80/20) with use of 100 segments.\n*******"
! echo "{readme_text}" >> {README}

## Split by chunks for downstream parallel analysis

In [None]:
import os
import shutil

# Re-derive the split file names and reload the splits from disk, so this cell does not
# depend on the variables of the cells above.
input_txt = "/home/labs/alevy/petrzhu/AI_workshop/PlantCaduceus/datasets/TAIR/TAIR10_FT_neutral_vs_simulated.txt"

output_test = input_txt.replace(".txt", "_test.txt")
output_train = input_txt.replace(".txt", "_train.txt")
output_val = input_txt.replace(".txt", "_val.txt")
# NOTE(review): input_df is not used further down in this cell - reading the full dataset here may be unnecessary.
input_df= pd.read_csv(input_txt, sep="\t", header=0, na_values=["NA", "null", ".", "-", "n/a", "N/A", "NaN"]) # , compression='gzip'
test_df= pd.read_csv(output_test, sep="\t", header=0, na_values=["NA", "null", ".", "-", "n/a", "N/A", "NaN"]) # , compression='gzip'
train_df= pd.read_csv(output_train, sep="\t", header=0, na_values=["NA", "null", ".", "-", "n/a", "N/A", "NaN"]) # , compression='gzip'
val_df= pd.read_csv(output_val, sep="\t", header=0, na_values=["NA", "null", ".", "-", "n/a", "N/A", "NaN"]) # , compression='gzip'

# Chunk size (rows per chunk file)
chunk_size = 100000

# Replace output_val / val_df with the split(s) to be chunked (e.g. test, train or val)
for name_df, df in zip([output_val], [val_df]):
    # Chunks go to a folder named after the split file: <name>_chunks
    output_folder = name_df.replace(".txt", "_chunks")
    # An existing folder is removed first, so chunks of a previous run never mix with new ones
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
        print(f"The content of the {output_folder} was deleted")
    os.makedirs(output_folder, exist_ok=True)
    chunk_count = 0

    # Report the workload before the loop starts
    print(f"Splitting {len(df)} rows into chunks of size {chunk_size}")
    for i, chunk_start in enumerate(range(0, len(df), chunk_size)):
        # Consecutive row slices; the last chunk may be shorter than chunk_size
        chunk = df.iloc[chunk_start:chunk_start + chunk_size]
        chunk_file = os.path.join(output_folder, f"chunk_{i+1}.tsv")
        chunk.to_csv(chunk_file, sep="\t", index=False)
        chunk_count += 1
        # Print progress every 10 chunks
        if chunk_count % 10 == 0:
            print(f"{chunk_count} chunks saved...")
    print(f"All chunks have been saved to the folder: {output_folder}. Total chunks = {chunk_count}")


# NOTE: this cell seems to hang for a while, but it does get the job done