In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as op
import emcee
import scipy.stats as ss
import collections
import corner
import seaborn as sns
import random
pd.options.mode.chained_assignment = None
%matplotlib inline

In [5]:
import pandas as pd
import numpy as np
import collections


In [6]:
# Read the catalogue CSV, skipping its first 12 lines.
df = pd.read_csv("NED27.09.1-D-14.2.0-20170929.csv", skiprows=12)
# Only measurements without redshift data are useful here: keep the rows
# whose 'redshift (z)' entry is NaN.
lacks_redshift = np.isnan(df['redshift (z)'])
df = df[lacks_redshift]

In [7]:
def selectdata(mymethod,df,nmeas=1):
    """Split the measurements of one distance method by error availability.

    Parameters
    ----------
    mymethod : object
        Value compared (with ``==``) against the ``Method`` column.
    df : pandas.DataFrame
        Measurement table. It must provide the columns 'Galaxy ID', 'm-M',
        'err', 'D (Mpc)' and 'Method'. The frame is not modified.
    nmeas : int, optional
        A galaxy enters the bootstrap sample when it has strictly more than
        ``nmeas`` measurements with a reported error. Defaults to 1.

    Returns
    -------
    tuple
        ``(ulist, dfs, nulista, dista, dfmeth, n_empty, n_useful, n_total)``

        - ``ulist``: galaxies with more than ``nmeas`` measurements that
          report an error, in order of first appearance.
        - ``dfs``: independent copy of the rows of those galaxies, reduced to
          the columns 'Galaxy ID', 'm-M', 'err' and 'D (Mpc)' (kept in the
          column order of ``df``).
        - ``nulista``: array of galaxies (sorted) that only have measurements
          without a reported error for this method.
        - ``dista``: array with the mean 'D (Mpc)' of each galaxy in
          ``nulista`` (unweighted, since there are no errors to weight by).
        - ``dfmeth``: frame with columns 'method' and 'count' (number of
          measurements with a reported error per method, over all methods),
          sorted by descending count.
        - ``n_empty``, ``n_useful``, ``n_total``: ``len(nulista)``,
          ``len(ulist)`` and the number of galaxies with at least one
          measurement that reports an error.

    Raises
    ------
    ValueError
        If one of the four columns kept in ``dfs`` is missing from ``df``.
    """
    keep_cols=['Galaxy ID','m-M','err','D (Mpc)']
    missing=[c for c in keep_cols if c not in df.columns]
    if missing:
        raise ValueError('selectdata: missing column(s) %s' % missing)

    has_err=np.isfinite(df['err'])
    df1=df[has_err]  # measurements that report an error
    dfa=df[~has_err]  # measurements that do not report an error

    # Methods and their number of measurements with a reported error.
    # Building the frame from (method, count) pairs also works when no row
    # has a reported error (the result is then an empty frame).
    counterM=collections.Counter(list(df1['Method']))
    dfmeth=pd.DataFrame(list(counterM.items()),columns=['method','count'])
    dfmeth=dfmeth.sort_values(by='count',ascending=False)

    # Restrict to the requested method and count measurements per galaxy.
    df1=df1[df1['Method']==mymethod]
    counter=collections.Counter(list(df1['Galaxy ID']))
    ulist=[gal for gal,n in counter.items() if n>nmeas]
    ulist2=list(counter)  # every galaxy counted here has at least one measurement with an error
    print('No. of Galaxies with reported errors is %i' % len(ulist2) )
    print('No. of Galaxies with more than %i measurements is %i' % (nmeas,len(ulist)) )

    # Database for the bootstrap. Selecting rows and columns in one .loc call
    # and copying the result yields an independent frame, so no in-place
    # operation is performed on a slice of df.
    kept=[c for c in df.columns if c in keep_cols]
    dfs=df1.loc[df1['Galaxy ID'].isin(ulist),kept].copy()

    # Database for non-reported errors: mean distance per galaxy, computed in
    # one pass. groupby sorts the galaxies by name.
    df1a=dfa[dfa['Method']==mymethod]
    dmean=df1a.groupby('Galaxy ID')['D (Mpc)'].mean()
    # Drop galaxies that also have a measurement with a reported error.
    only_without_err=~dmean.index.isin(ulist2)
    nulista=np.asarray(list(dmean.index))[only_without_err]
    dista=dmean.to_numpy()[only_without_err]
    print('No. of Galaxies without reported errors is %i' % len(nulista) )
    return ulist,dfs,nulista,dista,dfmeth,len(nulista),len(ulist),len(ulist2)

In [8]:
# Choose the method to analyse and run the selection. The first five return
# values are unpacked by name; the three trailing counts are gathered in mma.
mymethod = 'Tully-Fisher'
selection = selectdata(mymethod, df)
ulist, dfs, nulista, dista, dfmeth = selection[:5]
mma = list(selection[5:])

No. of Galaxies with reported errors is 11376
No. of Galaxies with more than 1 measurements is 9100


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


No. of Galaxies without reported errors is 818


In [9]:
# Show the first rows of the per-method measurement counts
# (selectdata returns dfmeth sorted by descending count).
dfmeth.head()

Unnamed: 0,method,count
0,FP,130633
1,Tully-Fisher,56397
22,RR Lyrae,46152
7,Cepheids,20065
2,SNIa,8306


# Method Selection (can be skipped)

We look for the methods for which:

- The ratio of galaxies without reported errors to useful galaxies (those with n_meas>1) is greater than 5%
- The number of galaxies with n_meas>1 is greater than 20.

These methods are in need of error prediction models.

In [10]:
# Collect the [empty, useful, total] galaxy counts of every method in dfmeth.
# Only the three trailing return values of selectdata are kept, so dfmeth
# (the frame being iterated) and the per-method results of earlier cells
# (ulist, dfs, nulista, dista, mma) are left untouched.
meas=[list(selectdata(method_name,df)[5:]) for method_name in dfmeth.method]

No. of Galaxies with reported errors is 129054
No. of Galaxies with more than 1 measurements is 1218


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


No. of Galaxies without reported errors is 25
No. of Galaxies with reported errors is 11376
No. of Galaxies with more than 1 measurements is 9100
No. of Galaxies without reported errors is 818
No. of Galaxies with reported errors is 22930
No. of Galaxies with more than 1 measurements is 40
No. of Galaxies without reported errors is 54
No. of Galaxies with reported errors is 9239
No. of Galaxies with more than 1 measurements is 100
No. of Galaxies without reported errors is 5
No. of Galaxies with reported errors is 2714
No. of Galaxies with more than 1 measurements is 2296
No. of Galaxies without reported errors is 37
No. of Galaxies with reported errors is 2438
No. of Galaxies with more than 1 measurements is 404
No. of Galaxies without reported errors is 0
No. of Galaxies with reported errors is 562
No. of Galaxies with more than 1 measurements is 548
No. of Galaxies without reported errors is 2
No. of Galaxies with reported errors is 547
No. of Galaxies with more than 1 measurements 

In [11]:
# Attach the per-method galaxy counts gathered in meas to dfmeth; the columns
# of meas are stored as 'empty', 'useful' and 'total', in that order.
meas=np.array(meas)
for col_pos,col_name in enumerate(('empty','useful','total')):
    dfmeth[col_name]=meas[:,col_pos]
# 'empty' expressed as a percentage of 'useful' and of 'total'.
dfmeth['e/u']=dfmeth['empty'].div(dfmeth['useful']).mul(100)
dfmeth['e/t']=dfmeth['empty'].div(dfmeth['total']).mul(100)

In [12]:
# Methods whose 'e/u' percentage exceeds 5 and whose 'useful' count exceeds 20.
high_empty_share=dfmeth['e/u'].gt(5)
enough_useful=dfmeth['useful'].gt(20)
dfmeth[high_empty_share&enough_useful]

Unnamed: 0,method,count,empty,useful,total,e/u,e/t
1,Tully-Fisher,56397,818,9100,11376,8.989011,7.190577
22,RR Lyrae,46152,54,40,22930,135.0,0.235499
11,TRGB,1710,22,308,462,7.142857,4.761905
8,CMD,1209,105,113,405,92.920354,25.925926
38,Eclipsing Binary,256,46,47,58,97.87234,79.310345
33,Red Clump,230,12,23,38,52.173913,31.578947
10,PNLF,198,8,45,69,17.777778,11.594203
23,SZ effect,137,10,38,40,26.315789,25.0
6,Brightest Stars,128,92,21,82,438.095238,112.195122
29,Horizontal Branch,96,9,21,50,42.857143,18.0


In [13]:
# Re-run the selection for the method of interest so that ulist, dfs,
# nulista, dista, dfmeth and mma (the three trailing counts, including the
# number of galaxies with no reported errors) refer to this method.
mymethod = 'Tully-Fisher'
selection = selectdata(mymethod, df)
ulist, dfs, nulista, dista, dfmeth = selection[:5]
mma = list(selection[5:])

No. of Galaxies with reported errors is 11376
No. of Galaxies with more than 1 measurements is 9100


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


No. of Galaxies without reported errors is 818


In [18]:
# Counts from the last selectdata call, in its return order:
# [galaxies without reported errors,
#  galaxies with more than nmeas measurements,
#  galaxies with reported errors]
mma

[818, 9100, 11376]

In [None]:
# nulista holds the galaxies that have no reported error

In [24]:
# Inspect the catalogue rows of the first galaxy in nulista. Using isin() on
# a one-element slice also copes with an empty nulista (no rows are shown).
df[df['Galaxy ID'].isin(nulista[:1])]

Unnamed: 0,Exclusion Code,D,G,Galaxy ID,m-M,err,D (Mpc),Method,REFCODE,SN ID,redshift (z),Hubble const.,Adopted LMC modulus,Date (Yr. - 1980),Notes
119742,,78254,16337,2MASX J07300813-2201060,30.23,,11.1,Tully-Fisher,2000AJ....120.1876H,,,,,20,


In [28]:
# All catalogue rows that belong to the galaxies listed in nulista.
# NOTE(review): df is not restricted to mymethod here, so rows from other
# methods of these galaxies are included as well; confirm this is intended.
dfempty=df[df['Galaxy ID'].isin(nulista)]

In [35]:
# Tally how many rows of dfempty each galaxy has.
counter=collections.Counter(dfempty['Galaxy ID'].tolist())

In [63]:
# Tabulate the tally: one row per galaxy ('gal') with its number of rows ('meas').
rows=[(galaxy,n_rows) for galaxy,n_rows in counter.items()]
dfempme=pd.DataFrame(rows,columns=['gal','meas'])

In [66]:
# Number of galaxies with exactly one counted measurement, i.e. the
# single-measurement distances reported without an error for the selected
# method (Tully-Fisher in the cells above).
dfempme['meas'].eq(1).sum()

357