In [None]:
import simulated_data as sim
import pandas as pd

### TODO
* Make a genome load function
* Make it possible to save to a different directory

## Frequency Datasets
### Create a simulated dataset

In [None]:
# Build a simulated frequency table; later cells reuse it as `data`.
# The positional arguments (2, 500) are presumably the number of populations and
# the number of positions — TODO confirm against simulated_data.make_fake_frequencies.
# Set seed > 0 to enforce seeding; with seed=0 as used here the run is presumably
# unseeded, so the values may differ between runs.
# Set save=True to save the result to file.
data = sim.make_fake_frequencies(2, 500, seed=0, save=False)
data.head()

### Load an existing dataset

In [None]:
# Load a previously generated frequency dataset instead of simulating a new one.
# The arguments (2, 100, 518) presumably identify the populations, positions and
# seed of the saved dataset — TODO confirm against simulated_data.load_fake_frequencies.
data1 = sim.load_fake_frequencies(2,100,518)
data1.head()

### Convert between formats
1. Collapse population

In [None]:
# Collapse the population information of `data` into a column (step 1 of the conversion).
# If the column names put the population first, e.g. EUR_A, use pop_is_suffix=False.
data_long = sim.freq_collapse_pop_to_column(data, pop_is_suffix=True)
data_long.head()

2. Revert to original

In [None]:
# Convert the collapsed table back to the original layout.
# Note: this will automatically make the population a suffix (due to the merge),
# regardless of how the columns were named before collapsing.
data_original = sim.freq_one_row_per_position(data_long)
data_original.head()

## Simulated genomes

### Simulate perfect genomes
Perfect genomes are defined as those containing only the most frequent allele for a given position in the given ancestry.

In [None]:
# Build perfect genomes from `data` for the ancestry labels "0" and "1".
# save=False presumably skips writing the result to file — TODO confirm.
perfect_G = sim.simulate_perfect_genomes(["0","1"], data, save=False)
# keys() is called on the result, so it is expected to be dict-like
# (presumably keyed by ancestry — TODO confirm).
perfect_G.keys()

### Simulate genomes from distributions

In [None]:
# Simulate a genome from the frequencies in `data`.
# ["0", "0"] presumably gives the ancestry of each of the two chromosome copies — TODO confirm.
sim_genome = sim.simulate_random_genomes(["0", "0"], data, save=False)
sim_genome[0:5]  # preview the first five entries

### Convert between formats
Simulation functions return a genome as pairs of nucleotides but save it as a table with ancestry information. These tables can be converted to genome format and back.

In [None]:
# Read a saved genome table (tab-separated) from the Data/ folder.
# NOTE(review): the file must already exist; no earlier cell in this notebook writes it.
saved_genome_path = "Data/simGenome_100_0_0.tsv"
tsv_read_options = {"sep": "\t", "index_col": False}
saved_genome = pd.read_csv(saved_genome_path, **tsv_read_options)
saved_genome.head()

In [None]:
# Convert the saved table into genome format.
genome = sim.table_to_genome(saved_genome)
genome[0:5]  # preview the first five entries

In [None]:
# Convert the genome back into table form.
# ["0", "0"] and data1 presumably have to match the ancestries and frequency set the
# genome was simulated from (cf. the simGenome_100_0_0 file name) — TODO confirm.
back_to_table = sim.genome_to_table(["0", "0"], genome, data1)
back_to_table.head()

## Admixed genomes

In [None]:
# Admixed genome from ancestries "0" and "1" using the collapsed frequency table.
# The meaning of the third argument (here 1) is not visible in this notebook —
# presumably a number of generations or crossovers; TODO confirm.
x1 = sim.random_admixed_genome(["0", "1"], data_long, 1, save=False)

In [None]:
# Same call with the third argument set to 2 (its meaning is not visible in this
# notebook — presumably a number of generations or crossovers; TODO confirm).
x2 = sim.random_admixed_genome(["0", "1"], data_long, 2, save=False)

In [None]:
# Same call with the third argument set to 10 (its meaning is not visible in this
# notebook — presumably a number of generations or crossovers; TODO confirm).
x10 = sim.random_admixed_genome(["0", "1"], data_long, 10, save=False)

### 1000 Genomes

In [None]:
def load_1000g_emissions(filename, usecols=(1, 7, 8, 9, 10, 11, 12, 13, 14)):
    """Read selected columns of a 1000 Genomes allele-frequency CSV file.

    Parameters
    ----------
    filename : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.
    usecols : sequence of int, optional
        Zero-based positions of the columns to keep. The default is the
        column set this loader has always used (column 1 and columns 7-14).

    Returns
    -------
    pandas.DataFrame
        The selected columns. ``index_col=False`` keeps pandas from using the
        first column as the index.
    """
    # Named `emissions` rather than `data` so it is not confused with the
    # notebook-level `data` frame.
    emissions = pd.read_csv(filename, usecols=list(usecols), index_col=False)
    return emissions

In [None]:
# Load the chromosome 21 AFR/EUR allele-frequency file with the loader defined above.
# The relative path assumes the chromosome_21_files folder sits two directory levels
# above the working directory.
g = load_1000g_emissions("../../chromosome_21_files/chr21_genotypes_afr_eur_allelefreqs.bybp.csv")
g.head()

In [None]:
# Collapse the population information into a column, as done for the simulated data.
# pop_is_suffix=False because the column names here presumably start with the
# population (e.g. AFR_..., EUR_...) — TODO confirm against the file header.
g_long = sim.freq_collapse_pop_to_column(g, pop_is_suffix=False)
g_long.head()

In [None]:
#ad1 = sim.random_admixed_genome(["AFR_", "EUR_"], g_long, 1, save=False)
# Disabled for now: random_admixed_genome needs to be made more efficient before it is run on data this large.

### Admix from existing chromosomes (1000G)

In [None]:
# Load two formatted 1000G chromosome 21 genomes (NA12878 from the EUR file,
# HG01879 from the AFR file) and tag both POP columns with that population label.
g1 = pd.read_csv(
    "../../chromosome_21_files/NA12878.EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.formatted.tsv",
    sep="\t",
    index_col=False,
).assign(POP1="EUR", POP2="EUR")
g2 = pd.read_csv(
    "../../chromosome_21_files/HG01879.AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.formatted.tsv",
    sep="\t",
    index_col=False,
).assign(POP1="AFR", POP2="AFR")

In [None]:
# Show rows of g2 whose POS value occurs more than once (every occurrence is kept),
# ordered by position, limited to the first 20 rows.
repeated_pos_mask = g2.duplicated(subset="POS", keep=False)
repeated_pos_rows = g2.loc[repeated_pos_mask]
repeated_pos_rows.sort_values("POS").head(20)

In [None]:
# Admix the two loaded genomes (g1 labelled EUR, g2 labelled AFR).
# The meanings of the third argument (1) and of the empty string are not visible in
# this notebook — presumably a generation count and an output name/path; TODO confirm.
mix1 = sim.admix_two_genomes_full(g1, g2, 1, "")

In [None]:
# Preview the first rows of the admixed result.
mix1.head()