In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Parsing the incredibly hard to parse LDNe output

The LDNe output is a mixture of variable-width whitespace and symbols, so it needs some custom parsing.

In [15]:
# Read the list of collections; its `pop` column holds the names used to
# locate the per-collection LDNe output files further down.
phase3ldne_list = pd.read_csv("../data/Phase3.LDNe.list")
collections = phase3ldne_list.loc[:, 'pop']
phase3ldne_list.shape

(63, 1)

In [16]:
# Preview the first five collection names
phase3ldne_list.head(n=5)

Unnamed: 0,pop
0,AG1000G-AO.Luanda.2009.coluzzii
1,AG1000G-BF-A.Bana.2012.coluzzii
2,AG1000G-BF-A.Bana.2012.gambiae
3,AG1000G-BF-A.Pala.2012.coluzzii
4,AG1000G-BF-A.Pala.2012.gambiae


In [17]:
chroms = ['2L', '2R', '3L', '3R', 'X']


def _parse_ldne_output(path):
    """Parse one LDNe ``.out`` file into a table of estimates.

    The parsing is purely positional: it locates the first line containing
    'Harmonic Mean', discards the last three lines of the file, and then reads
    fixed row/token offsets relative to that line. It therefore only works
    when every output file has the same layout (not ideal).

    Parameters
    ----------
    path : str
        Path of the LDNe output file.

    Returns
    -------
    pandas.DataFrame
        Nine rows (five ' = ' separated result rows followed by the lower and
        upper bounds of the parametric and jackknife CIs) with the columns
        'Parameter', 'minAF_0.05', 'minAF_0.02', 'minAF_0.01' and 'minAF_0+'.
        All values are kept as strings.

    Raises
    ------
    IndexError
        If no line of the file contains 'Harmonic Mean'.
    """
    # One entry per line. `read_csv(sep='\n')` is rejected by pandas >= 1.3,
    # so read the file directly; blank lines are dropped to mirror read_csv's
    # default `skip_blank_lines=True`, which the row offsets below rely on.
    with open(path, encoding="utf-8") as handle:
        lines = [line for line in handle.read().splitlines() if line.strip()]

    # split each line on ' = ' into a label ('one') and its values ('two')
    raw = pd.Series(lines).str.split(r'\s\=\s', expand=True)
    raw.columns = ['one', 'two']

    start = np.where(raw.one.str.contains('Harmonic Mean'))[0][0]  # find start of results
    end = len(raw) - 3
    block = raw.iloc[start:end]                                    # subset

    # first five rows: whitespace-separated values, one per allele-frequency cutoff
    point_estimates = pd.DataFrame(
        [pd.Series(block.iloc[i, 1].split()) for i in range(5)]
    )

    # rows 6-8 carry CI estimates that the ' = ' split left entirely in 'one'
    # (poorly parsed); row 9 is a further line of jackknife values
    ci_tokens = (
        block.iloc[6:9]['one']
        .str.replace("*", "", regex=False)
        .str.split(expand=True)
        .reset_index(drop=True)
    )
    jack_line = block.iloc[9, 0]

    # extract specific estimates, works only because all files are parsed identically, not ideal
    para_lower = ci_tokens.loc[0, 1:4]
    para_upper = ci_tokens.loc[1, 0:3]
    jack_lower = ci_tokens.loc[2, 3:7]
    jack_upper = pd.Series(jack_line.split())

    labels = pd.concat(
        [
            block.iloc[:5]['one'],
            pd.Series(['Parametric CI - lower',
                       'Parametric CI - upper',
                       'Jackknife CI - lower',
                       'Jackknife CI - upper']),
        ],
        ignore_index=True,
    )

    # stack the CI rows under the point estimates (DataFrame.append was removed in pandas 2.0)
    ci_rows = [
        row.reset_index(drop=True).to_frame().T
        for row in (para_lower, para_upper, jack_lower, jack_upper)
    ]
    estimates = pd.concat([point_estimates] + ci_rows, ignore_index=True)

    # join results to the parameter column and change column names
    final_results = pd.concat([labels, estimates], axis=1)
    final_results.columns = ['Parameter',
                             'minAF_0.05',
                             'minAF_0.02',
                             'minAF_0.01',
                             'minAF_0+']
    return final_results


Ne_Ag = dict()
Ne_chrom = dict()

for name in collections:
    for chrom in chroms:
        final_results = _parse_ldne_output(f"LDNe/Ag_LDNe_{name}.{chrom}.out")

        # store results in dict and then save within nested dict
        Ne_chrom[chrom] = final_results.iloc[:, :2]  ### .iloc[:,:2] for 1 allele frequency only!

    # copy, because Ne_chrom is reused (and overwritten) for the next collection
    Ne_Ag[name] = dict(Ne_chrom)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [18]:
# Reshape every per-chromosome LDNe table into a single wide row (one column
# per parameter), tag it with its population and chromosome, and stack all of
# the rows into one big table.
ne_rows = []

for name in collections:
    for chrom in chroms:
        # parameters become columns; the single remaining row is indexed by
        # the allele-frequency column label
        wide = Ne_Ag[name][chrom].set_index('Parameter').T

        # identifier columns go first: 'chrom', then 'name'
        wide.insert(0, 'name', name)
        wide.insert(0, 'chrom', chrom)

        # Keep the reshaped table in Ne_Ag, as this cell always did.
        # NOTE: this overwrites the parsed table, so the cell cannot be re-run
        # without first re-running the parsing cell.
        Ne_Ag[name][chrom] = wide
        ne_rows.append(wide)

# concatenate once instead of growing the frame inside the loop
Ne = pd.concat(ne_rows)

# change names removing whitespace (positional: relies on the column order built above)
Ne.columns = ['chrom', 'pop', 'sample_size', 'independent_comparisons',
              'overall_r^2', 'expected_r^2', 'Ne_estimate', 'Parametric_CI_lower',
              'Parametric_CI_upper', 'Jackknife_CI_lower', 'Jackknife_CI_upper']

### Write to txt file

In [19]:
# Tab-separated export; the row index is written as a leading 'AF' column
Ne.to_csv(
    "Ne_analyses.LDNe.tsv",
    sep="\t",
    index_label='AF',
)

### Loading metadata

Now, let's add in all the metadata for each population, so that we can plot this on a map.

In [20]:
# Load the full sample manifest
manifest = pd.read_csv("../../data/phase3/Ag1000g.phase3.manifest.full.tsv", sep="\t")

# Remove spaces from the location names
manifest['location'] = manifest['location'].map(lambda loc: loc.replace(" ", ""))

# Build the collection label: sample_set.location.year.species
pop_parts = manifest[['sample_set', 'location', 'year', 'species_gambiae_coluzzii']].astype(str)
manifest['pop'] = pop_parts.agg('.'.join, axis=1)

We need to summarise the full manifest: for each distinct collection we are running LDNe on, we want its metadata plus the AIM fractions and latitude/longitude for plotting. We first aggregate `manifest` to get the group means of the AIM fractions and the latitude/longitude, then separately run `size()` on the grouped `manifest` and assign the result to a `counts` column of our new df, `manifest_summary`.

With this df, we can join the LDNe results and that will contain all we need to plot.

In [21]:
# Summarise the manifest per collection: the mean of every numeric column
# (minus 'month') plus the number of samples in each group.
group_keys = ['pop', 'sample_set', 'country', 'year', 'location', 'species_gambiae_coluzzii']
manifest_groups = manifest.groupby(group_keys)

manifest_summary = (
    manifest_groups
    .mean(numeric_only=True)  # pandas >= 2.0 raises on non-numeric columns unless this is set
    .drop(columns='month')
    .reset_index()
)

# Group sizes come from the same grouping as the means, so they are in the
# same order and can be assigned row-for-row.
manifest_summary['counts'] = manifest_groups.size().to_numpy()

Let's keep the populations with more than 8 samples (`counts > 8`), merge in the LDNe results, and write the combined table to `Ne_manifest.tsv`.

In [23]:
# Keep collections with more than 8 samples, then attach the LDNe estimates.
new_manifest = manifest_summary.loc[manifest_summary['counts'] > 8]

# Join explicitly on the collection label rather than on whichever columns the
# two frames happen to share.
Ne_manifest = new_manifest.merge(Ne, on='pop')
Ne_manifest.to_csv("Ne_manifest.tsv", sep="\t", index=False)

In [26]:
# Reload the merged table from disk and show only the arabiensis collections
Ne_manifest = pd.read_csv("Ne_manifest.tsv", sep="\t")
is_arabiensis = Ne_manifest['species_gambiae_coluzzii'] == 'arabiensis'
Ne_manifest.loc[is_arabiensis]

Unnamed: 0,pop,sample_set,country,year,location,species_gambiae_coluzzii,latitude,longitude,aim_fraction_colu,aim_fraction_arab,...,chrom,sample_size,independent_comparisons,overall_r^2,expected_r^2,Ne_estimate,Parametric_CI_lower,Parametric_CI_upper,Jackknife_CI_lower,Jackknife_CI_upper
220,AG1000G-KE.Kilifi.2012.arabiensis,AG1000G-KE,Kenya,2012,Kilifi,arabiensis,-3.511,39.909,0.4552,0.7423,...,2L,10.0,50045005,0.122995,0.1369,Infinite,Infinite,Infinite,Infinite,Infinite
221,AG1000G-KE.Kilifi.2012.arabiensis,AG1000G-KE,Kenya,2012,Kilifi,arabiensis,-3.511,39.909,0.4552,0.7423,...,2R,10.0,50025000,0.125041,0.1369,Infinite,Infinite,Infinite,Infinite,Infinite
222,AG1000G-KE.Kilifi.2012.arabiensis,AG1000G-KE,Kenya,2012,Kilifi,arabiensis,-3.511,39.909,0.4552,0.7423,...,3L,10.0,50025000,0.12015,0.1369,Infinite,Infinite,Infinite,Infinite,Infinite
223,AG1000G-KE.Kilifi.2012.arabiensis,AG1000G-KE,Kenya,2012,Kilifi,arabiensis,-3.511,39.909,0.4552,0.7423,...,3R,10.0,50014999,0.116373,0.1369,Infinite,Infinite,Infinite,Infinite,Infinite
224,AG1000G-KE.Kilifi.2012.arabiensis,AG1000G-KE,Kenya,2012,Kilifi,arabiensis,-3.511,39.909,0.4552,0.7423,...,X,10.0,7233305,0.069114,0.1369,Infinite,Infinite,Infinite,Infinite,Infinite
265,AG1000G-MW.Chikhwawa.2015.arabiensis,AG1000G-MW,Malawi,2015,Chikhwawa,arabiensis,-15.933,34.755,0.455293,0.745098,...,2L,41.0,39139113,0.025702,0.026288,Infinite,Infinite,Infinite,Infinite,Infinite
266,AG1000G-MW.Chikhwawa.2015.arabiensis,AG1000G-MW,Malawi,2015,Chikhwawa,arabiensis,-15.933,34.755,0.455293,0.745098,...,2R,41.0,41555274,0.026267,0.026288,Infinite,Infinite,Infinite,1285.0,Infinite
267,AG1000G-MW.Chikhwawa.2015.arabiensis,AG1000G-MW,Malawi,2015,Chikhwawa,arabiensis,-15.933,34.755,0.455293,0.745098,...,3L,41.0,42168325,0.025571,0.026288,Infinite,Infinite,Infinite,Infinite,Infinite
268,AG1000G-MW.Chikhwawa.2015.arabiensis,AG1000G-MW,Malawi,2015,Chikhwawa,arabiensis,-15.933,34.755,0.455293,0.745098,...,3R,41.0,41336760,0.025509,0.026288,Infinite,Infinite,Infinite,Infinite,Infinite
269,AG1000G-MW.Chikhwawa.2015.arabiensis,AG1000G-MW,Malawi,2015,Chikhwawa,arabiensis,-15.933,34.755,0.455293,0.745098,...,X,41.0,40603557,0.021818,0.026288,Infinite,Infinite,Infinite,Infinite,Infinite
