In [1]:
import os

import pandas as pd
import numpy as np

# Directory the DAPC CSV exports are read from; the derived location.csv and
# webapp_data.csv files are written here as well.
data_directory = "../data/"

In [2]:
# Read in the CSVs exported from the DAPC analysis in R.
dapc_tables = ["assign", "eig", "ind.coord", "posterior", "grp"]
files_present = set(os.listdir(data_directory))
fns = {name: name + ".csv" for name in dapc_tables if name + ".csv" in files_present}

# Fail with a clear message (instead of a bare KeyError further down) when a
# table that the rest of this cell depends on has not been exported.
missing = [name for name in ("assign", "grp", "ind.coord", "posterior") if name not in fns]
if missing:
    raise FileNotFoundError(
        "Missing DAPC export(s) in {}: {}".format(
            data_directory, ", ".join(name + ".csv" for name in missing)))

# Load each table, rename its 0th column to "key" and store the keys as strings.
# A dedicated loop variable is used so the name `df` is not clobbered here.
dfs = {}
for name, fn in fns.items():
    table = pd.read_csv(data_directory + fn)
    table = table.rename(columns={table.columns[0]: "key"})
    table["key"] = table["key"].apply(str)
    dfs[name] = table

# Extract the posterior probability of each individual's a priori group (grp)
# and of its model-predicted group (assign).
# Only the "posterior." prefix is stripped, so a group label that itself
# contains a "." is kept whole and still matches the grp/assign values.
posterior = dfs["posterior"].rename(
    columns=lambda col: col.split("posterior.", 1)[-1] if "posterior." in col else col)
posterior = posterior.join(dfs["assign"]["assign"].apply(str))
# Row-wise lookup: pick the probability column named after the row's own group.
posterior["posterior_assign"] = posterior.apply(lambda row: row.loc[row["assign"]], axis=1)
posterior = posterior.join(dfs["grp"]["grp"].apply(str))
posterior["posterior_grp"] = posterior.apply(lambda row: row.loc[row["grp"]], axis=1)

# Append the group labels and posterior probabilities to the individual
# coordinates (the LD columns); all joins align on the default row index.
df = dfs["ind.coord"].rename(
    columns=lambda col: col.split("ind.coord.", 1)[-1] if "ind.coord." in col else col)
df = df.join([dfs["assign"]["assign"], dfs["grp"]["grp"],
              posterior["posterior_assign"], posterior["posterior_grp"]])

In [3]:
# Summary statistics of the numeric columns, as a quick sanity check of the joined table.
df.describe()

Unnamed: 0,LD1,LD2,LD3,LD4,LD5,LD6,LD7,LD8,LD9,LD10,...,LD15,LD16,LD17,LD18,LD19,LD20,assign,grp,posterior_assign,posterior_grp
count,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,...,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0
mean,-3.923918e-16,3.228969e-16,3.244065e-16,3.1781190000000003e-17,1.601772e-16,1.449752e-16,-6.174556e-16,1.877209e-16,2.0339960000000002e-17,-3.034045e-16,...,-2.1717150000000003e-17,-2.5477920000000002e-17,-6.203954e-18,-2.976838e-17,1.2533710000000001e-17,1.039543e-16,235.920802,235.604962,0.793362,0.7104243
std,8.247994,6.140401,5.498636,3.675952,3.569485,2.846577,2.558324,2.264867,2.047382,1.943399,...,1.42459,1.353318,1.274916,1.168159,1.051619,0.9955884,258.699492,258.644594,0.236383,0.3401841
min,-26.82652,-11.55498,-17.57824,-14.13616,-15.77059,-15.15996,-5.954851,-11.82961,-9.26727,-10.7066,...,-6.560919,-6.504336,-5.943433,-3.538966,-6.65756,-5.257754,20.0,20.0,0.204231,2.6295369999999997e-64
25%,-6.458708,-4.750833,-1.918091,-0.7915317,-0.5840434,-0.6319228,-1.519369,-1.038594,-0.9396628,-1.030729,...,-0.8162595,-0.7048084,-0.6531723,-0.7010926,-0.6057507,-0.5159162,37.0,37.0,0.58598,0.4351107
50%,3.574865,1.121731,0.8074659,0.4990305,0.4046429,0.02213873,-0.3586744,8.662608e-05,0.03866832,0.0006942201,...,-0.1083066,-0.02327926,-0.006022041,-0.03552313,0.02910977,0.002137582,59.0,59.0,0.902454,0.8865626
75%,6.146246,3.08932,4.129344,1.538923,1.329386,0.6854036,1.073168,1.130234,1.109968,1.088814,...,0.6576244,0.7343999,0.7045135,0.6636916,0.6011784,0.562375,489.0,489.0,1.0,1.0
max,11.3417,19.88885,7.756025,15.5067,8.900155,15.20522,13.26288,7.188486,7.165217,6.560073,...,9.034167,7.902448,6.397609,6.808722,4.981931,3.867066,699.0,699.0,1.0,1.0


In [4]:
# Load the three tables of the Rosenberg et al. 2005 data set:
# s1 = microsatellite data, s2 = population lat/longs, s3 = population codes.
data_dir = "Rosenberg_etal_2005_data/"
s1, s2, s3 = (
    pd.read_csv(data_dir + table_file)
    for table_file in (
        "msatData-Table 1.csv",
        "latLongs-Table 1.csv",
        "popCodes-Table 1.csv",
    )
)

In [5]:
# Build a key -> (Population, Lat, Lng) lookup table:
#   1. keep s1's 0th column (as "key") and 2nd column (as "Population"),
#   2. left-join the per-population coordinates from s2,
#   3. shorten the coordinate column names,
#   4. drop fully repeated rows, keeping the first occurrence.
population_coords = s2[["Population",
                        "Latitude[DegreesNorth]",
                        "Longitude[DegreesEast]"]]

loc_df = (
    s1.rename(columns={s1.columns[0]: "key", s1.columns[2]: "Population"})
    [["key", "Population"]]
    .merge(population_coords, on="Population", how="left")
    .rename(columns={"Latitude[DegreesNorth]": "Lat",
                     "Longitude[DegreesEast]": "Lng"})
    .drop_duplicates(keep="first")
)

In [6]:
# Store the keys as strings (explicit column access rather than attribute access).
loc_df["key"] = loc_df["key"].apply(str)

In [7]:
# Persist the location lookup table next to the other data files (row index omitted).
loc_df.to_csv(data_directory + "location.csv", index=False)

In [8]:
# Attach the location columns to every row of df.
# The lookup table is re-read from disk, so its keys must be turned back into
# strings before they can be matched against df's "key" column.
loc_df = pd.read_csv(data_directory + "location.csv")
loc_df["key"] = loc_df["key"].apply(str)
# Left join: rows of df without a matching key are kept (location columns null).
df = pd.merge(df, loc_df, how="left", on="key")

In [9]:
# Clean up nulls: replace every missing value with the literal string "NaN"
# so the exported CSV carries an explicit token instead of an empty field.
# fillna is vectorised and avoids DataFrame.applymap, which is deprecated
# since pandas 2.1.
df = df.fillna("NaN")

# Output the final table (row index omitted).
df.to_csv(data_directory + "webapp_data.csv", index=False)

In [10]:
# Preview the first rows only rather than rendering the whole table.
df.head(10)

Unnamed: 0,key,LD1,LD2,LD3,LD4,LD5,LD6,LD7,LD8,LD9,...,LD18,LD19,LD20,assign,grp,posterior_assign,posterior_grp,Population,Lat,Lng
0,794,5.865279,3.498581,5.854815,1.043006,-1.123781,-0.080276,-2.087655,-1.307714,-0.400602,...,0.560493,-1.404661,2.848685,20,20,0.868316,0.868316,Orcadian,59.000000,-3.0
1,795,6.151227,2.876183,6.529029,0.837831,-0.473697,0.001962,-1.354424,-1.161427,-2.514067,...,-0.022541,-0.515146,-1.452931,20,20,0.666932,0.666932,Orcadian,59.000000,-3.0
2,796,6.109458,2.138025,5.322405,1.362387,-0.152780,0.264388,-1.449296,-0.558078,-1.701561,...,0.958539,0.493875,1.373702,20,20,0.562439,0.562439,Orcadian,59.000000,-3.0
3,797,6.102600,2.018505,4.402570,1.149202,1.476076,0.257387,-1.324815,-3.103126,-1.999333,...,1.159240,0.215118,-0.322193,20,20,0.836591,0.836591,Orcadian,59.000000,-3.0
4,798,5.937751,2.106528,4.865443,-0.764055,1.503806,0.564973,-1.418611,-1.349226,-0.895305,...,1.071326,-0.288673,0.127720,25,20,0.533325,0.393725,Orcadian,59.000000,-3.0
5,799,4.478729,2.107895,5.756928,0.735748,1.358837,-0.385104,-0.733292,-0.812934,-2.063902,...,1.114156,-0.835806,-0.429817,20,20,0.930991,0.930991,Orcadian,59.000000,-3.0
6,800,5.024226,1.459052,6.005807,0.395260,0.312436,0.631585,-3.381593,-1.577860,-0.374384,...,-0.263939,-0.751739,-0.318401,24,20,0.591706,0.065466,Orcadian,59.000000,-3.0
7,801,5.770242,3.087431,6.105993,0.781554,-0.410766,0.416213,-1.621044,-1.705281,-0.938055,...,0.594528,-1.629717,2.408342,20,20,0.962167,0.962167,Orcadian,59.000000,-3.0
8,802,5.132496,1.480641,5.802434,1.215560,1.199861,-0.077394,-3.839535,-2.932975,-0.692983,...,0.508034,0.265618,0.685558,24,20,0.433731,0.072834,Orcadian,59.000000,-3.0
9,803,4.551920,1.590601,6.285587,0.925822,0.106170,-1.095312,0.342518,-1.453044,-0.552297,...,0.425220,-0.035033,0.645850,25,20,0.557430,0.378158,Orcadian,59.000000,-3.0
