In [82]:
import numpy as np
import pandas as pd
import seaborn as sns

In [31]:
# Tab-separated sample metadata table; its `population` column is what the
# parsing cell below uses to enumerate the populations.
samples = pd.read_csv("../../data/samples.meta.txt", delimiter='\t')

### Parsing the incredibly poorly written LDNe output

The output separates its fields with runs of whitespace of varying length, so it requires some parsing.

In [93]:
pops = samples.population.unique()
chroms = ['3L', '3R']


def _parse_ldne_output(path):
    """Parse one LDNe output file into a 9-row table of estimates.

    The parser is purely positional. It assumes (inferred from the row offsets
    used here -- confirm against an actual output file) that, counting
    non-blank lines from the first line containing 'Harmonic Mean':

    * lines 0-4 are ``label = v1 v2 v3 v4`` rows,
    * lines 6-7 hold the parametric CI (first line prefixed by one label token),
    * lines 8-9 hold the jackknife CI (first line prefixed by three label tokens).

    Parameters
    ----------
    path : str
        Path of the LDNe output file.

    Returns
    -------
    pandas.DataFrame
        Columns 'Parameter', 'minAF_0.05', 'minAF_0.02', 'minAF_0.01',
        'minAF_0+'; one row per parameter, values kept as strings.

    Raises
    ------
    ValueError
        If the file has no non-blank lines, no 'Harmonic Mean' line, too few
        lines after it, or does not yield exactly the expected columns.
    """
    # One entry per non-blank line. Plain file reading replaces
    # `pd.read_csv(..., sep='\n')`, which recent pandas versions refuse.
    with open(path, encoding='utf-8') as handle:
        lines = [line.rstrip('\r\n') for line in handle if line.strip()]
    if not lines:
        raise ValueError(f"{path} contains no non-blank lines")

    #split each line on the equals sign: label on the left, values on the right
    df = pd.Series(lines).str.split(r'\s=\s', expand=True)
    df.columns = ['one', 'two']   # fails loudly unless there is at most one ' = ' per line

    #find start of results
    hits = np.where(df.one.str.contains('Harmonic Mean'))[0]
    if len(hits) == 0:
        raise ValueError(f"{path} has no 'Harmonic Mean' line")
    start = hits[0]
    end = len(df) - 3             # the last three lines are dropped from the results block
    block = df.iloc[start:end]
    if len(block) < 10:
        raise ValueError(f"{path} has too few lines after 'Harmonic Mean' to hold all estimates")

    #the five 'label = values' rows: collapse runs of whitespace by splitting into tokens
    estimate_rows = [block.iloc[i, 1].split() for i in range(5)]

    #confidence-interval rows have no equals sign, so the whole line sits in column 'one'
    ci = (block['one'].iloc[6:9]
          .str.replace("*", "", regex=False)   # literal asterisk, not a regex
          .str.split(expand=True)
          .reset_index(drop=True))

    #extract specific estimates, works only because all files are laid out identically, not ideal
    para_1 = ci.loc[0, 1:4].tolist()       # skips the single leading label token
    para_2 = ci.loc[1, 0:3].tolist()       # continuation line: values only
    jack_1 = ci.loc[2, 3:7].tolist()       # skips the three leading label tokens
    jack_2 = block.iloc[9, 0].split()      # continuation line: values only

    #build the value table in one go instead of appending row by row
    values = pd.DataFrame(estimate_rows + [para_1, para_2, jack_1, jack_2])
    labels = pd.Series(block['one'].iloc[:5].tolist() +
                       ['Parametric CI - lower', 
                        'Parametric CI - upper', 
                        'Jackknife CI - lower', 
                        'Jackknife CI - upper'])

    #join results to the parameter column and change column names
    final_results = pd.concat([labels, values], axis=1)
    final_results.columns = ['Parameter', 
                             'minAF_0.05', 
                             'minAF_0.02', 
                             'minAF_0.01', 
                             'minAF_0+']
    return final_results


Ne_Ag = dict()
Ne_chrom = dict()

for pop in pops:
    for chrom in chroms:
        #store results in dict and then save within nested dict
        Ne_chrom[chrom] = _parse_ldne_output(f"LDNe/Ag_LDNe_{pop}_{chrom}.out")

    # copy, so every population keeps its own chromosome -> table mapping
    Ne_Ag[pop] = dict(Ne_chrom)

In [94]:
# Transpose each table so there is one row per minimum-allele-frequency cutoff,
# put 'chrom' and 'pop' in front of the parameter columns, and stack the tables
# of every population/chromosome into one big table.
Ne_frames = []

for pop in pops:
    for chrom in chroms:
        table = Ne_Ag[pop][chrom]

        # Only tables still in their parsed layout carry a 'Parameter' column.
        # Tables already reshaped by an earlier run of this cell are reused
        # unchanged, so re-running the cell no longer fails.
        if 'Parameter' in table.columns:
            wide = table.set_index('Parameter').T
            wide.insert(0, 'pop', pop)
            wide.insert(0, 'chrom', chrom)      # final order: chrom, pop, parameters...
            Ne_Ag[pop][chrom] = wide

        Ne_frames.append(Ne_Ag[pop][chrom])

# Concatenate once rather than growing `Ne` inside the loop.
Ne = pd.concat(Ne_frames) if Ne_frames else pd.DataFrame()

In [95]:
# Replace the column labels with whitespace-free names. The assignment is
# positional, so the order must match the existing columns:
# two identifier columns, five estimate columns, four confidence-interval bounds.
Ne.columns = (
    ['chrom', 'pop']
    + ['sample_size', 'independent_comparisons', 'overall_r^2', 'expected_r^2', 'Ne_estimate']
    + ['Parametric_CI_lower', 'Parametric_CI_upper', 'Jackknife_CI_lower', 'Jackknife_CI_upper']
)

In [103]:
# Let up to 500 rows render, then show the rows indexed 'minAF_0.05' whose
# sample size (stored as text, hence the cast) is above 10.
pd.set_option("display.max_rows", 500)

at_maf_005 = Ne.index == 'minAF_0.05'
above_ten_samples = Ne.sample_size.astype(float) > 10
Ne[at_maf_005 & above_ten_samples]

Unnamed: 0,chrom,pop,sample_size,independent_comparisons,overall_r^2,expected_r^2,Ne_estimate,Parametric_CI_lower,Parametric_CI_upper,Jackknife_CI_lower,Jackknife_CI_upper
minAF_0.05,3L,GHcol,55.0,6465956,0.019633,0.019242,851.2,806.9,900.7,338.5,Infinite
minAF_0.05,3R,GHcol,55.0,6705702,0.019729,0.01924,680.4,652.1,711.2,224.3,Infinite
minAF_0.05,3L,GHgam,12.0,13844967,0.109337,0.10824,279.1,259.7,301.6,98.9,Infinite
minAF_0.05,3R,GHgam,12.0,13760829,0.110819,0.10824,117.7,114.0,121.7,34.1,Infinite
minAF_0.05,3L,BFgam,92.0,2357246,0.011382,0.011249,2503.3,2167.7,2960.1,1317.0,21833.9
minAF_0.05,3R,BFgam,92.0,2520667,0.011443,0.011247,1699.6,1541.9,1892.6,1040.1,4508.3
minAF_0.05,3L,BFcol,75.0,3364929,0.0139,0.013903,Infinite,17897.2,Infinite,10892.7,Infinite
minAF_0.05,3R,BFcol,75.0,3173472,0.013897,0.013902,Infinite,19659.2,Infinite,9299.5,Infinite
minAF_0.05,3L,UGgam,112.0,2733796,0.009316,0.009184,2515.7,2249.7,2851.9,1960.6,3500.0
minAF_0.05,3R,UGgam,112.0,2759529,0.009329,0.009184,2304.8,2080.2,2583.0,1798.6,3198.8


### Write to txt file

In [104]:
# Save the combined table as tab-separated text, with the header row and the
# index (the minAF cutoff labels) written out.
Ne.to_csv("Ne_analyses.LDNe", sep="\t", header=True, index=True)