In [1]:
from linkage_map.multi_family_mapping_functions import *

In [2]:
import collections
import os.path

import numpy as np
import pandas as pd

### Write a file listing all identified paralogs

In [3]:
# Paths to the three per-family .stats files (families 08, 01 and 09).
stats_file_1, stats_file_2, stats_file_3 = (
    os.path.join('linkage_map', stats_name)
    for stats_name in ('chum_08.stats', 'chum_01.stats', 'chum_09.stats')
)

# Output path for the combined list of paralog locus IDs.
paralogs_file = os.path.join('linkage_map', 'chum_paralogs.txt')

In [4]:
# Collect every locus whose third whitespace-separated field in any of the
# three stats files is something other than 'AA_xx' or 'AB'; those loci are
# recorded as paralogs. The first field of each line is the locus ID.
non_paralog_categories = ('AA_xx', 'AB')
paralogs = set()
for stats_file in [stats_file_1, stats_file_2, stats_file_3]:
    with open(stats_file) as INFILE:
        for line in INFILE:
            fields = line.split()  # split once per line; split() already ignores surrounding whitespace
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no locus; skip them
                # instead of failing with an IndexError on fields[2].
                continue
            if fields[2] not in non_paralog_categories:
                paralogs.add(fields[0])

# Locus IDs are converted to int so that they sort numerically rather than
# lexically; a non-numeric ID raises ValueError here.
paralogs = sorted(int(xx) for xx in paralogs)

# One locus ID per line; `paralogs` is already sorted, so it is written as-is.
with open(paralogs_file, 'w') as OUTFILE:
    for xx in paralogs:
        OUTFILE.write('{}\n'.format(xx))

### Import genotype data from MST map input files:

In [5]:
# Paths to the three per-family MSTmap input files (families 08, 01 and 09).
linkage_map_file_1, linkage_map_file_2, linkage_map_file_3 = (
    os.path.join('linkage_map', mstmap_name)
    for mstmap_name in ('chum_08_mstmap.txt', 'chum_01_mstmap.txt', 'chum_09_mstmap.txt')
)

In [6]:
# Parse each family's MSTmap input file. import_MSTmap's result is unpacked
# into two values per family: the individuals and the genotypes at each locus.
individuals_08, genotypes_at_locus_08 = import_MSTmap(linkage_map_file_1)
individuals_01, genotypes_at_locus_01 = import_MSTmap(linkage_map_file_2)
individuals_09, genotypes_at_locus_09 = import_MSTmap(linkage_map_file_3)

# Build one object per family with prep_data_pandas. These are later passed to
# rename_loci_by_family, which requires pandas DataFrames.
# Presumably rows are individuals and columns are loci — TODO confirm.
my_pd_genos_08 = prep_data_pandas(individuals_08, genotypes_at_locus_08)
my_pd_genos_01 = prep_data_pandas(individuals_01, genotypes_at_locus_01)
my_pd_genos_09  = prep_data_pandas(individuals_09,  genotypes_at_locus_09)

# NOTE(review): elsewhere in this notebook prepare_matrix's result is unpacked
# into two values (all_my_data, loci_all), so fam_08 / fam_01 / fam_09 presumably
# each hold that whole two-item result rather than a single matrix — confirm.
# These three names are not referenced again in the visible cells.
fam_08 = prepare_matrix(my_pd_genos_08)
fam_01 = prepare_matrix(my_pd_genos_01)
fam_09 = prepare_matrix(my_pd_genos_09)

In [7]:
def rename_loci_by_family(paralogs_file, fam_names, families):
    """Return copies of the family genotype frames with family-tagged paralog names.

    For every column name, the last three characters are dropped to form the
    base locus name and the last two characters are kept as a copy tag
    (e.g. '10068_x2' -> base '10068', tag 'x2'). If the base name is listed in
    `paralogs_file`, the column becomes '<base>_<family name>_<tag>';
    otherwise it becomes just '<base>'. Genotype values are not changed.

    The input DataFrames are left untouched (renaming is done on copies), so
    calling this function again with the same inputs gives the same result
    instead of stripping a further three characters from every column name.

    Parameters
    ----------
    paralogs_file : str
        Path to a text file with one paralog base locus name per line.
    fam_names : list of str
        One label per family, in the same order as `families`.
    families : sequence of pandas.DataFrame
        Genotype frames whose column names are locus names.

    Returns
    -------
    list of pandas.DataFrame
        Renamed copies of `families`, in the same order.

    Raises
    ------
    ValueError
        If an entry of `families` is not a DataFrame, if `fam_names` is not a
        list, or if `fam_names` and `families` differ in length.
    """
    for afam in families:
        if not isinstance(afam, pd.DataFrame):
            raise ValueError("families should be a pandas DataFrame")
    # Check the type before calling len(), so a non-list gets this message
    # rather than a length complaint (or a TypeError from len()).
    if not isinstance(fam_names, list):
        raise ValueError("names should be a list")
    if len(fam_names) != len(families):
        raise ValueError("names and families should have the same length")

    # A set gives constant-time membership tests; the file is read as text, so
    # entries are compared as strings against the column base names.
    with open(paralogs_file) as INFILE:
        paralogs = {line.strip() for line in INFILE if line.strip()}

    renamed_families = []
    for fam_name, afam in zip(fam_names, families):
        new_locus_names = []
        for old_name in afam.columns.values.tolist():
            base_name, copy_tag = old_name[:-3], old_name[-2:]
            if base_name in paralogs:
                new_locus_names.append("{}_{}_{}".format(base_name, fam_name, copy_tag))
            else:
                # NOTE(review): two non-paralog columns sharing a base name would
                # end up with identical names here — confirm this cannot occur.
                new_locus_names.append(base_name)
        renamed = afam.copy()
        renamed.columns = new_locus_names
        renamed_families.append(renamed)
    return renamed_families

In [8]:
# Tag paralog loci with their family of origin; one renamed frame per family,
# returned in the same order as the inputs.
renamed_08, renamed_01, renamed_09 = rename_loci_by_family(
    paralogs_file=paralogs_file,
    fam_names=['chum_08', 'chum_01', 'chum_09'],
    families=[my_pd_genos_08, my_pd_genos_01, my_pd_genos_09],
)

In [9]:
# Preview the first rows of the renamed family-08 frame to check the new column names.
renamed_08.head()

Unnamed: 0,10001,10003_chum_08_x1,10020,10029,10039,10043,1004_chum_08_x1,10061,10068_chum_08_x1,10068_chum_08_x2,...,Oke_lactb2-71,Oke_mgll-49,Oke_nc2b-148,Oke_pnrc2-78,Oke_sylc-90,Oke_thic-84,Oke_txnrd1-74,Oke_u0602-244,Oke_u217-172,Oke_zn593-152
CMUW10X08H_0002,1,2,2,1,1,2,1,2,1,2,...,2,1,2,2,1,2,2,1,1,2
CMUW10X08H_0003,2,1,2,2,1,1,1,2,2,2,...,2,2,2,2,1,1,2,2,1,1
CMUW10X08H_0005,2,2,1,1,1,2,1,2,1,2,...,1,2,1,2,1,2,2,2,2,2
CMUW10X08H_0006,2,2,2,1,2,1,1,2,2,1,...,1,2,1,2,2,2,1,1,2,2
CMUW10X08H_0011,1,1,1,2,2,1,2,2,1,1,...,1,1,2,1,2,1,2,2,2,2


In [10]:
# TODO: move this transpose-and-merge sequence (up to the construction of `bb`) into a function.
# Transpose each renamed frame so that its former column labels become the index,
# which is what the index-based merges below join on.
renamed_08t = renamed_08.T
renamed_01t = renamed_01.T
renamed_09t = renamed_09.T

In [11]:
# Outer-join families 08 and 01 on their index, keeping rows present in either frame.
aa = renamed_08t.merge(renamed_01t, how='outer', left_index=True, right_index=True)

In [13]:
# Outer-join family 09 onto the 08+01 merge, again on the index.
bb = aa.merge(renamed_09t, how='outer', left_index=True, right_index=True)
# TODO: end of the section that should be moved into a function.

In [14]:
# Transpose the merged frame back and hand it to prepare_matrix, whose result
# is unpacked into two values; loci_all is later passed as the `loci` argument
# of write_LEPmap.
all_my_data, loci_all = prepare_matrix(bb.T)

In [17]:
def write_LEPmap(families, family_names, loci, genotypes, output_filename):
    """Write a tab-separated pedigree/genotype file for the given families.

    The file starts with a header line ('#family', 'name', 'sire', 'dam',
    'sex', 'blank', then one column per locus). For each family it then
    contains a dam line (genotype '1 1' at every locus), a sire line
    (genotype '1 2' at every locus), and one line per individual holding that
    individual's row of `genotypes`.

    Parameters
    ----------
    families : sequence of iterables
        For each family, the individual IDs; each ID must be a row label of
        `genotypes`.
    family_names : sequence of str
        Family label for each entry of `families`, matched by position.
    loci : iterable
        Locus names for the header. Any iterable is accepted (list, tuple,
        pandas Index, numpy array) and names are converted with str().
    genotypes : pandas.DataFrame
        Genotype strings indexed by individual ID. Values are written in the
        frame's own column order.
        NOTE(review): nothing here checks that this column order (or count)
        matches `loci` — confirm the caller keeps them aligned.
    output_filename : str
        Path of the file to create or overwrite.

    Raises
    ------
    IndexError
        If `family_names` has fewer entries than `families`.
    """
    # Materialise once: a plain list of strings is needed for list
    # concatenation and str.join, and its length sizes the parental rows.
    loci = [str(locus) for locus in loci]
    dam_genotypes = ['1 1'] * len(loci)
    sire_genotypes = ['1 2'] * len(loci)

    with open(output_filename, 'w') as OUTFILE:
        header = "\t".join(["#family", 'name', 'sire', 'dam', 'sex', 'blank'] + loci) + "\n"
        OUTFILE.write(header)
        for fam_idx, fam in enumerate(families):
            fam_name = family_names[fam_idx]
            dam_id = fam_name + "_Dam"
            sire_id = fam_name + "_Sire"
            # Parents have no recorded parents ('0', '0'); sex is '2' for the dam and '1' for the sire.
            OUTFILE.write("\t".join([fam_name, dam_id, '0', '0', '2', '0'] + dam_genotypes) + "\n")
            OUTFILE.write("\t".join([fam_name, sire_id, '0', '0', '1', '0'] + sire_genotypes) + "\n")
            for ind in fam:
                ind_info = "\t".join([fam_name, ind, sire_id, dam_id, '0', '0'])
                ind_genotypes = genotypes.loc[ind]
                OUTFILE.write(ind_info + "\t" + "\t".join([str(xx) for xx in ind_genotypes]) + "\n")

In [16]:
# Individuals of each family, in the order 08, 01, 09.
fams = [
    individuals_08,
    individuals_01,
    individuals_09,
]
# Destination of the combined LEPmap-format file.
LEPmap_filename = os.path.join(
    'linkage_map',
    'LEPmap',
    'with_paralogs',
    'all_loci.lepmap2',
)

In [None]:
# Recode genotype values into the strings written to the LEPmap file:
# missing (NaN) and 0 -> '0 0', 1 -> '1 1', 2 -> '1 2'.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
my_genotypes = bb.transpose().replace(
    to_replace=[np.nan, 0, 1, 2],
    value=['0 0', '0 0', '1 1', '1 2'],
)

In [19]:
# Dimensions of the recoded genotype table (rows, columns).
my_genotypes.shape

(240, 12603)

In [26]:
# Read a LEPmap file back from disk and report its dimensions.
# NOTE(review): this path ends in '.lepmap', whereas LEPmap_filename ends in
# '.lepmap2', so this may be inspecting a file from a different run — confirm
# which file is intended. The recorded column count also does not equal
# my_genotypes' column count plus the six leading pedigree columns.
pd.read_csv("linkage_map/LEPmap/with_paralogs/all_loci.lepmap", sep = '\t').shape

(246, 12522)

In [18]:
# Write all three families, with their recoded genotypes, to the LEPmap file.
write_LEPmap(
    families=fams,
    family_names=["fam_08", "fam_01", "fam_09"],
    loci=loci_all,
    genotypes=my_genotypes,
    output_filename=LEPmap_filename,
)