In [2]:
import simulated_data as sim
import pandas as pd

### TODO
* Add a function to load a saved genome
* Make it possible to save to a different directory

## Frequency Datasets
### Create a simulated dataset

In [2]:
# Simulate a frequency table and preview it; the recorded output has a POS
# column plus one A/C/G/T column per population suffix (0 and 1).
# set seed > 0 to enforce seeding
# set save=True to save result to file
data = sim.make_fake_frequencies(2, 5000, seed=531, save=True)
data.head()

100%|██████████| 5000/5000 [00:20<00:00, 248.60it/s]


Unnamed: 0,POS,A0,C0,G0,T0,A1,C1,G1,T1
0,0,0.0,0.5567,0.4433,0.0,0.0,0.6487,0.3513,0.0
1,1,0.4619,0.0,0.0,0.5381,0.288,0.0,0.0,0.712
2,2,0.0,0.8425,0.1575,0.0,0.0,0.3347,0.6653,0.0
3,3,0.2744,0.0,0.7256,0.0,0.9542,0.0,0.0458,0.0
4,4,0.0,0.0,0.3972,0.6028,0.0,0.0,0.4926,0.5074


### Load an existing dataset

In [3]:
# NOTE(review): in the recorded run the loader reported that
# Data/simData_N2_P5000_seed531.tsv does not exist and returned None, so the
# .shape access below raised AttributeError. A later cell reads the same file
# name from ../../simulated_files/ — confirm which directory the loader uses.
data1 = sim.load_fake_frequencies(2,5000,531)
data1.shape

File: Data/simData_N2_P5000_seed531.tsv does not exist, please generate it first.


AttributeError: 'NoneType' object has no attribute 'shape'

### Convert between formats
1. Collapse population

In [None]:
# Reshape the wide table into long form: the recorded output of this same call
# further down shows A, C, G, T columns plus a POP column.
# if column names are in format e.g. EUR_A use pop_is_suffix=False
data_long = sim.freq_collapse_pop_to_column(data, pop_is_suffix=True)
data_long.head()

2. Revert to original

In [None]:
# Revert the long table back to the original layout.
# Note this will automatically make the population a suffix (due to merge)
data_original = sim.freq_one_row_per_position(data_long)
data_original.head()

## Simulated genomes

### Simulate perfect genomes
Perfect genomes are defined as those containing only the most frequent allele for a given position in the given ancestry.

In [4]:
# Build (and save) perfect genomes for the labels "0" and "1"; the recorded
# output shows the returned dict keyed by every ordered pair of those labels.
perfect_G = sim.simulate_perfect_genomes(["0","1"], data, save=True)
perfect_G.keys()

dict_keys([('0', '0'), ('0', '1'), ('1', '0'), ('1', '1')])

### Simulate genomes from distributions

In [None]:
# Draw one random genome for the label pair ("0", "0") with save=False and
# show its first five entries.
sim_genome = sim.simulate_random_genome(["0", "0"], data, save=False)
sim_genome[0:5]

In [6]:
# Simulate and save one random genome per ordered label pair; sim_genome ends
# up holding the result for the last pair, ("1", "1").
for label_pair in (["0", "0"], ["0", "1"], ["1", "0"], ["1", "1"]):
    sim_genome = sim.simulate_random_genome(label_pair, data, save=True)

### Convert between formats
Simulation functions return a genome as pairs of nucleotides but save it as a table with ancestry information. These tables can be converted back to the genome format.

In [None]:
# Read a previously saved, tab-separated genome table from Data/ and preview it.
saved_genome = pd.read_csv("Data/simGenome_100_0_0.tsv", sep="\t", index_col=False)
saved_genome.head()

In [None]:
# Convert the saved table to the genome format and show the first five entries.
genome = sim.table_to_genome(saved_genome)
genome[0:5]

In [None]:
# Convert the genome back to a table for the label pair ("0", "0").
# NOTE(review): data1 comes from the load cell earlier in this notebook, which
# failed in the recorded run (data1 was None) — confirm it holds a valid
# frequency table before relying on this cell.
back_to_table = sim.genome_to_table(["0", "0"], genome, data1)
back_to_table.head()

## Admixed genomes

In [6]:
# Reload the simulated frequency table from disk (this rebinds `data`) and
# collapse it to long form with a POP column, as the recorded output shows.
data = pd.read_csv("../../simulated_files/simData_N2_P5000_seed531.tsv", sep = "\t", index_col=False)
data_long = sim.freq_collapse_pop_to_column(data, pop_is_suffix=True)
data_long.head()

Unnamed: 0,POS,A,C,G,T,POP
0,0,0.0,0.5567,0.4433,0.0,0
0,0,0.0,0.6487,0.3513,0.0,1
1,1,0.4619,0.0,0.0,0.5381,0
1,1,0.288,0.0,0.0,0.712,1
2,2,0.0,0.8425,0.1575,0.0,0


In [None]:
# Admixed genome from labels "0" and "1" with third argument 1; not saved.
# NOTE(review): the meaning of the third argument is defined in simulated_data — confirm.
x1 = sim.random_admixed_genome(["0", "1"], data_long, 1, save=False)

In [None]:
# Same call with third argument 2; not saved.
x2 = sim.random_admixed_genome(["0", "1"], data_long, 2, save=False)

In [None]:
# Same call with third argument 10; not saved.
x10 = sim.random_admixed_genome(["0", "1"], data_long, 10, save=False)

In [7]:
# Collapse once, then save an admixed genome for every combination of the
# third argument (1, 2, 4, 5, 10) and sample id ("0", "1"), in that order.
# x ends up holding the result of the last call (10, sample "1").
data_long = sim.freq_collapse_pop_to_column(data, pop_is_suffix=True)
for admix_arg in (1, 2, 4, 5, 10):
    for sample_id in ("0", "1"):
        x = sim.random_admixed_genome(["0", "1"], data_long, admix_arg, sample_id=sample_id, save=True)

### Admix from perfect genomes

### 1000 Genomes

In [None]:
def load_1000g_emissions(filename):
    """Read a subset of columns from a comma-separated file into a DataFrame.

    Only the columns at 0-based positions 1 and 7 through 14 are kept, and no
    column is used as the index.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, in file order.
    """
    wanted_columns = [1, *range(7, 15)]
    return pd.read_csv(filename, usecols=wanted_columns, index_col=False)

In [None]:
# Load the chromosome 21 AFR/EUR allele-frequency file and preview it.
g = load_1000g_emissions("../../chromosome_21_files/chr21_genotypes_afr_eur_allelefreqs.bybp.csv")
g.head()

In [None]:
# Collapse the populations into a column. pop_is_suffix=False is used here,
# presumably because the population label is a column-name prefix
# (e.g. EUR_A) in this file — confirm against the loaded columns.
g_long = sim.freq_collapse_pop_to_column(g, pop_is_suffix=False)
g_long.head()

In [None]:
#ad1 = sim.random_admixed_genome(["AFR_", "EUR_"], g_long, 1, save=False)
# TODO: left disabled — this needs to be made more efficient for large data

### Admix from existing chromosomes (1000G)

In [9]:
# Load the saved perfect-genome tables tagged 0_0 (g1) and 1_1 (g2).
g1 = pd.read_csv("../../simulated_files/simGenome_perfect_5000_0_0.tsv", sep = "\t", index_col = False)
g2 = pd.read_csv("../../simulated_files/simGenome_perfect_5000_1_1.tsv", sep = "\t", index_col = False)


In [14]:
# Three repeats (a, b, c) of admixing g1 with g2; the third argument (1)
# matches the Rx1 tag in the output file name.
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/simAdmixPerfectGenomes00_11_5000_Rx1_" + repeat + ".tsv"
    _ = sim.admix_two_genomes_full(g1, g2, 1, out_path, save=True)

In [15]:
# Three repeats (a, b, c) of admixing g1 with g2; the third argument (3)
# matches the Rx3 tag in the output file name.
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/simAdmixPerfectGenomes00_11_5000_Rx3_" + repeat + ".tsv"
    _ = sim.admix_two_genomes_full(g1, g2, 3, out_path, save=True)

In [16]:
# Three repeats (a, b, c) of admixing g1 with g2; the third argument (5)
# matches the Rx5 tag in the output file name.
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/simAdmixPerfectGenomes00_11_5000_Rx5_" + repeat + ".tsv"
    _ = sim.admix_two_genomes_full(g1, g2, 5, out_path, save=True)

In [17]:
# Three repeats (a, b, c) of admixing g1 with g2; the third argument (10)
# matches the Rx10 tag in the output file name.
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/simAdmixPerfectGenomes00_11_5000_Rx10_" + repeat + ".tsv"
    _ = sim.admix_two_genomes_full(g1, g2, 10, out_path, save=True)

#### Chromosome 21

In [18]:
# Load the formatted chr21 genotype tables: g1 = NA12878 (EUR), g2 = HG01879 (AFR).
g1 = pd.read_csv("../../chromosome_21_files/NA12878.EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.formatted_geno.tsv", sep="\t", index_col=False)
g2 = pd.read_csv("../../chromosome_21_files/HG01879.AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.formatted_geno.tsv", sep="\t", index_col=False)

In [19]:
# Preview g2; the recorded output shows POS, A1, A2, POP1 and POP2 columns.
g2.head()

Unnamed: 0,POS,A1,A2,POP1,POP2
0,9411239,A,A,0,0
1,9411245,A,A,0,0
2,9411264,C,C,0,0
3,9411267,T,T,0,0
4,9411302,T,T,0,0


In [20]:
# keep=False removes every row whose POS occurs more than once (no copy is
# kept); ignore_index=True renumbers the remaining rows from 0.
# NOTE(review): each table is de-duplicated on its own, so g1 and g2 may end
# up with different sets of positions — confirm the admix step tolerates that.
g2.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)
g1.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)

In [21]:
# Three repeats (a, b, c) of admixing g1 with g2; the third argument (4)
# matches the Rx4 tag in the output file name. mix1 keeps the last result.
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/admixEUR_AFR_chr21_Rx4_" + repeat + ".tsv"
    mix1 = sim.admix_two_genomes_full(g1, g2, 4, out_path, save=True)

In [None]:
# Three repeats (a, b, c) of admixing g1 with g2; the third argument (3)
# matches the Rx3 tag in the output file name. mix1 keeps the last result.
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/admixEUR_AFR_chr21_Rx3_" + repeat + ".tsv"
    mix1 = sim.admix_two_genomes_full(g1, g2, 3, out_path, save=True)

#### Chromosome 14

In [None]:
# Load the formatted chr14 genotype tables: g1 = NA12878 (EUR), g2 = HG01879 (AFR).
g1 = pd.read_csv("../../chromosome_14_files/NA12878.EUR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.formatted_geno.tsv", sep="\t", index_col=False)
g2 = pd.read_csv("../../chromosome_14_files/HG01879.AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.formatted_geno.tsv", sep="\t", index_col=False)

In [None]:
# keep=False removes every row whose POS occurs more than once (no copy is
# kept); ignore_index=True renumbers the remaining rows from 0.
# NOTE(review): each table is de-duplicated on its own, so g1 and g2 may end
# up with different sets of positions — confirm the admix step tolerates that.
g2.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)
g1.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)

In [None]:
# Three repeats (a, b, c) of admixing g1 with g2; the third argument (1)
# matches the Rx1 tag in the output file name. mix1 keeps the last result.
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/admixEUR_AFR_chr14_Rx1_" + repeat + ".tsv"
    mix1 = sim.admix_two_genomes_full(g1, g2, 1, out_path, save=True)

In [None]:
# Three repeats (a, b, c) of admixing g1 with g2; the third argument (3)
# matches the Rx3 tag in the output file name. mix1 keeps the last result.
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/admixEUR_AFR_chr14_Rx3_" + repeat + ".tsv"
    mix1 = sim.admix_two_genomes_full(g1, g2, 3, out_path, save=True)

#### Two-step admix chr21

In [None]:
# Second admixture round: the NA12878 (EUR) chr21 table against a previously
# saved admixed genome.
# NOTE(review): no cell in the visible part of this notebook writes
# admixEUR_AFR_chr21_Rx1_*.tsv (the chr21 cells write Rx4 and Rx3) — confirm
# the input file exists.
g1 = pd.read_csv("../../chromosome_21_files/NA12878.EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.formatted_geno.tsv", sep="\t", index_col=False)
g2 = pd.read_csv("../../simulated_files/admixEUR_AFR_chr21_Rx1_a.tsv", sep="\t", index_col=False)
# keep=False drops every row whose POS occurs more than once.
g2.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)
g1.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/admixEUR_Rx1a_chr21_Rx1_" + repeat + ".tsv"
    mix1 = sim.admix_two_genomes_full(g1, g2, 1, out_path, save=True)

In [None]:
# Second admixture round between two previously saved admixed chr21 genomes
# (Rx1 repeat c and Rx3 repeat b).
g1 = pd.read_csv("../../simulated_files/admixEUR_AFR_chr21_Rx1_c.tsv", sep="\t", index_col=False)
g2 = pd.read_csv("../../simulated_files/admixEUR_AFR_chr21_Rx3_b.tsv", sep="\t", index_col=False)
# keep=False drops every row whose POS occurs more than once.
g2.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)
g1.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)
for repeat in ("a", "b", "c"):
    out_path = "../../simulated_files/admixRx1c_Rx3b_chr21_Rx1_" + repeat + ".tsv"
    mix1 = sim.admix_two_genomes_full(g1, g2, 1, out_path, save=True)

## Accuracy

In [None]:
import pandas as pd
import numpy as np
def accuracy(predict_geno, truth_geno):
    """Return the fraction of ancestry calls in ``predict_geno`` that match ``truth_geno``.

    The ``POP1`` and ``POP2`` columns are compared row by row, in row order
    (rows are not aligned on any position column), and both columns count
    equally towards the result.

    Parameters
    ----------
    predict_geno, truth_geno : pandas.DataFrame
        Tables with ``POP1`` and ``POP2`` columns and the same number of rows.

    Returns
    -------
    float
        ``matches / (2 * number_of_rows)``; ``nan`` when the tables are empty.

    Raises
    ------
    ValueError
        If the two tables have a different number of rows.
    """
    n_pos = len(predict_geno)
    if len(truth_geno) != n_pos:
        # Without this check np.equal would broadcast a one-row table against
        # every row of the other one and report a meaningless score.
        raise ValueError(
            f"predict_geno has {n_pos} rows but truth_geno has {len(truth_geno)}"
        )
    if n_pos == 0:
        # Nothing to score; skip the 0/0 division and its RuntimeWarning.
        return np.float64("nan")

    matches = 0
    for column in ("POP1", "POP2"):
        predicted = predict_geno[column].to_numpy()
        expected = truth_geno[column].to_numpy()
        matches += np.equal(predicted, expected).sum()
    return matches / (2 * n_pos)

In [None]:
# Score the POP1/POP2 labels of one saved admixed chr21 genome against another.
# NOTE(review): accuracy() compares rows by position in the table, and each
# table is de-duplicated on its own here — confirm both end up with the same
# positions in the same order.
g1 = pd.read_csv("../../simulated_files/admixEUR_AFR_chr21_Rx1_c.tsv", sep="\t", index_col=False)
g2 = pd.read_csv("../../simulated_files/admixEUR_AFR_chr21_Rx3_b.tsv", sep="\t", index_col=False)
g2.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)
g1.drop_duplicates(subset="POS", keep=False, inplace=True, ignore_index=True)
accuracy(g1, g2)