# Script to quantify the minimum variations count of the Triticum monococcum accessions by window, from IBSpy combined tables, by chromosome

In [1]:
import pandas as pd
import numpy as np

## Functions

In [2]:
def select_genotypes(df, sample_list):
    """Return the window coordinates plus the requested sample columns.

    Args:
        df: Table holding 'seqname', 'start' and 'end' columns alongside
            one column per sample.
        sample_list: Names of the sample columns to keep. Names that are
            not columns of ``df`` are ignored by ``DataFrame.filter``.

    Returns:
        A new DataFrame whose first three columns are 'seqname', 'start'
        and 'end', followed by the selected sample columns.
    """
    coordinate_columns = ['seqname', 'start', 'end']
    # Park the coordinates in the index so that the column filter only
    # ever sees sample columns; they are restored as columns afterwards.
    indexed = df.set_index(coordinate_columns)
    selected = indexed.filter(items=sample_list, axis=1)
    return selected.reset_index()

def file_to_list(file):
    """Read the 'genotype' column of a delimited text file into a list.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.
            The file must have a header row containing a 'genotype'
            column; it is parsed with the default comma separator.

    Returns:
        The values of the 'genotype' column as a plain Python list.
    """
    genotype_column = pd.read_csv(file)['genotype']
    return genotype_column.tolist()

def get_min(df, list_1, list_2):
    """Add a per-window minimum over one sample set to another sample set.

    Args:
        df: Table with 'seqname', 'start', 'end' and one column per sample.
        list_1: Sample names over which the row-wise minimum is taken
            (the monococcum accessions in this notebook).
        list_2: Sample names to keep in the returned table.

    Returns:
        A DataFrame with the three coordinate columns, then a
        'monococcum_min' column holding the row-wise minimum across the
        ``list_1`` samples, then the ``list_2`` sample columns.
    """
    out_df = select_genotypes(df, list_2)
    # Skip the three leading coordinate columns so that only sample
    # values take part in the minimum.
    monoc_values = select_genotypes(df, list_1).iloc[:, 3:]
    row_minimum = monoc_values.min(axis=1)
    # Position 3 places the new column right after seqname/start/end.
    out_df.insert(3, 'monococcum_min', row_minimum)
    return out_df

##  Arguments

In [68]:
# Window size; it is part of the input table file name ('..._{window}w.tsv.gz').
window = 50000
# Transformation function; not referenced by the cells visible in this notebook.
function = np.sqrt
# Reference name used as prefix of the input table and of the output files.
reference = 'arinaLrFor'
# Directory holding the IBSpy combined tables.
data_path = './IBSpy_tables/'

## Input IBSpy variations table data

In [69]:
# Load the tab-separated combined table for the chosen reference and
# window size; pandas infers the gzip compression from the '.gz' suffix.
in_file = pd.read_csv(
    f'{data_path}/{reference}_combined_queries_{window}w.tsv.gz',
    sep='\t',
)
in_file

Unnamed: 0,seqname,start,end,TA10573,TA582,TA571,TA757,TA443,TA394,TA10418,...,arina-pg,chinese-pg,jagger-pg,julius-pg,lancer-pg,landmark-pg,mace-pg,mattis-pg,norin61-pg,stanley-pg
0,chr1A,1,50000,404,356,364,409,416,367,367,...,0,429,433,392,427,430,458,4,421,428
1,chr1A,50001,100000,316,296,292,320,302,290,290,...,0,424,436,402,428,430,433,2,438,428
2,chr1A,100001,150000,300,531,528,538,551,535,530,...,0,582,586,569,582,588,569,2,603,581
3,chr1A,150001,200000,325,480,486,481,463,488,462,...,0,521,560,518,548,535,543,1,556,532
4,chr1A,200001,250000,101,106,90,95,103,103,99,...,0,178,217,215,227,232,213,19,217,269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293188,chrUn,464450001,464500000,571,589,585,578,581,592,568,...,0,66,123,67,109,101,111,87,84,101
293189,chrUn,464500001,464550000,486,480,481,476,488,489,475,...,0,96,138,93,124,139,103,57,146,81
293190,chrUn,464550001,464600000,513,522,494,493,496,512,500,...,0,80,90,37,111,139,110,68,106,160
293191,chrUn,464600001,464650000,501,514,486,502,482,500,479,...,0,122,122,96,118,98,117,50,137,92


## Filter by sub-genome A

In [70]:
# Keep only the rows whose sequence name contains an 'A' (sub-genome A);
# the copy detaches the result from in_file.
by_genome = in_file.loc[in_file['seqname'].str.contains('A')].copy()
by_genome

Unnamed: 0,seqname,start,end,TA10573,TA582,TA571,TA757,TA443,TA394,TA10418,...,arina-pg,chinese-pg,jagger-pg,julius-pg,lancer-pg,landmark-pg,mace-pg,mattis-pg,norin61-pg,stanley-pg
0,chr1A,1,50000,404,356,364,409,416,367,367,...,0,429,433,392,427,430,458,4,421,428
1,chr1A,50001,100000,316,296,292,320,302,290,290,...,0,424,436,402,428,430,433,2,438,428
2,chr1A,100001,150000,300,531,528,538,551,535,530,...,0,582,586,569,582,588,569,2,603,581
3,chr1A,150001,200000,325,480,486,481,463,488,462,...,0,521,560,518,548,535,543,1,556,532
4,chr1A,200001,250000,101,106,90,95,103,103,99,...,0,178,217,215,227,232,213,19,217,269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251499,chr7A,756100001,756150000,243,273,264,295,290,261,250,...,0,214,154,215,9,214,13,170,151,220
251500,chr7A,756150001,756200000,319,307,311,321,323,321,315,...,0,276,239,274,6,275,9,336,239,274
251501,chr7A,756200001,756250000,405,327,353,351,377,379,337,...,0,442,451,442,9,435,19,441,453,433
251502,chr7A,756250001,756300000,477,462,465,406,416,417,435,...,0,416,392,422,5,425,8,423,394,423


## Get monococcum minimum variations count by window

In [71]:
# Samples over which the per-window minimum is computed.
monoc_list = file_to_list('monococcum_genotypes.tsv')
# Samples to keep as columns in the resulting table.
genotypes_list = file_to_list('genotypes.tsv')
min_df = get_min(df=by_genome, list_1=monoc_list, list_2=genotypes_list)
min_df

Unnamed: 0,seqname,start,end,monococcum_min,TA10573,TA582,TA571,TA757,TA443,TA394,...,arina-pg,chinese-pg,jagger-pg,julius-pg,lancer-pg,landmark-pg,mace-pg,mattis-pg,norin61-pg,stanley-pg
0,chr1A,1,50000,350,404,356,364,409,416,367,...,0,429,433,392,427,430,458,4,421,428
1,chr1A,50001,100000,271,316,296,292,320,302,290,...,0,424,436,402,428,430,433,2,438,428
2,chr1A,100001,150000,257,300,531,528,538,551,535,...,0,582,586,569,582,588,569,2,603,581
3,chr1A,150001,200000,258,325,480,486,481,463,488,...,0,521,560,518,548,535,543,1,556,532
4,chr1A,200001,250000,79,101,106,90,95,103,103,...,0,178,217,215,227,232,213,19,217,269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99398,chr7A,756100001,756150000,219,243,273,264,295,290,261,...,0,214,154,215,9,214,13,170,151,220
99399,chr7A,756150001,756200000,253,319,307,311,321,323,321,...,0,276,239,274,6,275,9,336,239,274
99400,chr7A,756200001,756250000,237,405,327,353,351,377,379,...,0,442,451,442,9,435,19,441,453,433
99401,chr7A,756250001,756300000,234,477,462,465,406,416,417,...,0,416,392,422,5,425,8,423,394,423


## Split by chromosome & save tables

In [67]:
# Write one gzip-compressed, tab-separated table per sub-genome A chromosome.
chromosomes = ('chr1A', 'chr2A', 'chr3A', 'chr4A', 'chr5A', 'chr6A', 'chr7A')
for chromosome in chromosomes:
    # Exact match on the sequence name: str.contains() performs a regex
    # substring search, which would also pull in any sequence whose name
    # merely contains the chromosome name.
    chrom_ind = min_df[min_df['seqname'] == chromosome].copy()
    # index=False keeps the DataFrame row index out of the output file.
    chrom_ind.to_csv(f'{reference}_{chromosome}_variations_all_genotypes.tsv.gz', sep='\t', index=False)