# Script to quantify the minimum variations count across the Triticum monococcum accessions, by window, from IBSpy combined tables, split by chromosome

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns

## Functions

In [28]:
def select_genotypes(df, sample_list):
    """Return the window coordinates plus the requested genotype columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing 'seqname', 'start' and 'end' columns alongside
        one column per genotype.
    sample_list : list
        Column labels to keep. Labels that are not columns of ``df`` are
        dropped by ``DataFrame.filter`` without raising.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh integer index: 'seqname', 'start', 'end'
        first, followed by the retained genotype columns.
    """
    coordinate_cols = ['seqname', 'start', 'end']
    # Move the coordinates into the index so that filter() only ever
    # considers the remaining (genotype) columns.
    indexed = df.set_index(coordinate_cols)
    kept = indexed.filter(items=sample_list, axis=1)
    # Bring the coordinates back as the three leading columns.
    return kept.reset_index()

def file_to_list(file):
    """Read the 'genotype' column of a delimited text file into a list.

    Parameters
    ----------
    file : str or file-like
        Anything ``pandas.read_csv`` accepts. It is parsed with the
        default (comma) separator and must have a 'genotype' header.

    Returns
    -------
    list
        Values of the 'genotype' column, in file order.
    """
    genotype_column = pd.read_csv(file)['genotype']
    return genotype_column.tolist()

def get_min(df, list_1, list_2):
    """Add the per-row minimum over one genotype group to another selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Table accepted by ``select_genotypes``.
    list_1 : list
        Genotype columns whose row-wise minimum is computed.
    list_2 : list
        Genotype columns kept in the returned table.

    Returns
    -------
    pandas.DataFrame
        ``select_genotypes(df, list_2)`` with an extra 'monoccoccum_min'
        column inserted at position 3, holding the row-wise minimum of
        the ``list_1`` columns.
    """
    result = select_genotypes(df, list_2)
    # Skip the first three columns of the selection so that only the
    # genotype values take part in the minimum.
    group_values = select_genotypes(df, list_1).iloc[:, 3:]
    row_minimum = group_values.min(axis=1)
    result.insert(3, 'monoccoccum_min', row_minimum)
    return result

##  Arguments

In [25]:
# Window size; selects the '{window}w' combined table that is loaded below.
window = 50000
# NOTE(review): not referenced by any code visible in this notebook.
function = np.sqrt
# Reference name; used in the input table name and in the output file names.
reference = 'arinalrfor'
# Directory containing the combined IBSpy tables.
data_path ='./IBSpy_tables/'

## Input IBSpy variations table data

In [26]:
# Load the gzipped, tab-separated combined table for the chosen reference
# and window size.
table_path = f'{data_path}/{reference}_combined_queries_{window}w.tsv.gz'
in_file = pd.read_csv(table_path, sep='\t')
in_file

Unnamed: 0,seqname,start,end,TA10573,TA582,TA571,TA757,TA443,TA394,TA10418,...,mace-pg,mattis-pg,norin61-pg,spelt-pg,stanley-pg,claire2,paragon2,robigus2,weebil2,cadenza2
0,chr1A,1,50000,404,356,364,409,416,367,367,...,458,4,421,459,428,377,389,374,452,425
1,chr1A,50001,100000,316,296,292,320,302,290,290,...,433,2,438,435,428,420,401,423,450,427
2,chr1A,100001,150000,300,531,528,538,551,535,530,...,569,2,603,588,581,559,565,582,575,555
3,chr1A,150001,200000,325,480,486,481,463,488,462,...,543,1,556,582,532,524,543,542,542,525
4,chr1A,200001,250000,101,106,90,95,103,103,99,...,213,19,217,245,269,189,181,187,208,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293188,chrUn,464450001,464500000,571,589,585,578,581,592,568,...,111,87,84,148,101,46,37,40,21,64
293189,chrUn,464500001,464550000,486,480,481,476,488,489,475,...,103,57,146,117,81,24,62,55,66,50
293190,chrUn,464550001,464600000,513,522,494,493,496,512,500,...,110,68,106,128,160,8,38,19,37,72
293191,chrUn,464600001,464650000,501,514,486,502,482,500,479,...,117,50,137,138,92,78,42,100,50,68


## Filter by sub-genome A

In [20]:
# Keep only the rows whose sequence name contains 'A'; copy so that later
# changes do not touch the full table.
a_genome_rows = in_file['seqname'].str.contains('A')
by_genome = in_file[a_genome_rows].copy()
by_genome

Unnamed: 0,seqname,start,end,TA10573,TA582,TA571,TA757,TA443,TA394,TA10418,...,mace-pg,mattis-pg,norin61-pg,spelt-pg,stanley-pg,claire2,paragon2,robigus2,weebil2,cadenza2
0,chr1A,1,50000,404,356,364,409,416,367,367,...,458,4,421,459,428,377,389,374,452,425
1,chr1A,50001,100000,316,296,292,320,302,290,290,...,433,2,438,435,428,420,401,423,450,427
2,chr1A,100001,150000,300,531,528,538,551,535,530,...,569,2,603,588,581,559,565,582,575,555
3,chr1A,150001,200000,325,480,486,481,463,488,462,...,543,1,556,582,532,524,543,542,542,525
4,chr1A,200001,250000,101,106,90,95,103,103,99,...,213,19,217,245,269,189,181,187,208,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251499,chr7A,756100001,756150000,243,273,264,295,290,261,250,...,13,170,151,144,220,5,149,5,209,210
251500,chr7A,756150001,756200000,319,307,311,321,323,321,315,...,9,336,239,271,274,21,273,23,285,272
251501,chr7A,756200001,756250000,405,327,353,351,377,379,337,...,19,441,453,440,433,25,448,16,437,447
251502,chr7A,756250001,756300000,477,462,465,406,416,417,435,...,8,423,394,421,423,3,427,2,434,419


## Get monococcum minimum variations count by window

In [22]:
# Two genotype name lists: the columns to keep in the output, and the
# monococcum columns whose per-window minimum is added to it.
genotypes_list = file_to_list('genotypes.tsv')
monoc_list = file_to_list('monococcum_genotypes.tsv')
min_df = get_min(df=by_genome, list_1=monoc_list, list_2=genotypes_list)
min_df

Unnamed: 0,seqname,start,end,monoccoccum_min,TA10573,TA582,TA571,TA757,TA443,TA394,...,mace-pg,mattis-pg,norin61-pg,spelt-pg,stanley-pg,claire2,paragon2,robigus2,weebil2,cadenza2
0,chr1A,1,50000,350,404,356,364,409,416,367,...,458,4,421,459,428,377,389,374,452,425
1,chr1A,50001,100000,271,316,296,292,320,302,290,...,433,2,438,435,428,420,401,423,450,427
2,chr1A,100001,150000,257,300,531,528,538,551,535,...,569,2,603,588,581,559,565,582,575,555
3,chr1A,150001,200000,258,325,480,486,481,463,488,...,543,1,556,582,532,524,543,542,542,525
4,chr1A,200001,250000,79,101,106,90,95,103,103,...,213,19,217,245,269,189,181,187,208,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99398,chr7A,756100001,756150000,219,243,273,264,295,290,261,...,13,170,151,144,220,5,149,5,209,210
99399,chr7A,756150001,756200000,253,319,307,311,321,323,321,...,9,336,239,271,274,21,273,23,285,272
99400,chr7A,756200001,756250000,237,405,327,353,351,377,379,...,19,441,453,440,433,25,448,16,437,447
99401,chr7A,756250001,756300000,234,477,462,465,406,416,417,...,8,423,394,421,423,3,427,2,434,419


## Split by chromosome & save tables

In [23]:
# Write one tab-separated table per chromosome; each file name combines the
# reference name and the chromosome name.
chromosomes = (
    'chr1A',
    'chr2A',
    'chr3A',
    'chr4A',
    'chr5A',
    'chr6A',
    'chr7A',
)
for chromosome in chromosomes:
    # Pattern match on the sequence name, not an exact comparison.
    on_chromosome = min_df['seqname'].str.contains(chromosome)
    out_name = f'{reference}_{chromosome}_variations_all_genotypes.tsv'
    chrom_ind = min_df.loc[on_chromosome].copy()
    chrom_ind.to_csv(out_name, sep='\t', index=False)