In [1]:
import os

import pandas as pd
import numpy as np

import geocoder

In [2]:
# Directory holding the CSV inputs; the generated location.csv is written here too.
data_directory = "../data/"
# Excel workbook with the collection / location records (Sheet1 and Sheet2).
fn = "Cucurbitae_Ethanol_collections_PBARC_SELECTIONS_Feb2016_SIM_selections_ms.xlsx"

In [3]:
# Load the CSV tables exported from the DAPC analysis in R.
dapc_csv_names = ("assign.csv", "eig.csv", "ind.coord.csv", "posterior.csv", "grp.csv")
fns = {
    csv_name.strip().split(".csv")[0]: csv_name
    for csv_name in os.listdir(data_directory)
    if csv_name in dapc_csv_names
}
dfs = {
    table_name: pd.read_csv(data_directory + csv_name)
    for table_name, csv_name in fns.items()
}

# Rename the first column of every table to "key" (used below as the sample name).
dfs = {
    table_name: table.rename(columns={table.columns[0]: 'key'})
    for table_name, table in dfs.items()
}


def _strip_prefix(prefix):
    """Return a renamer that keeps only the text after the last "." for columns containing `prefix`."""
    return lambda column: column.split(".")[-1] if prefix in column else column


# For each sample, pick the value stored in the column named by its "assign" entry and,
# separately, by its "grp" entry (presumably the posterior probability of that group -- confirm).
posterior = dfs["posterior"].rename(columns=_strip_prefix("posterior."))
posterior = posterior.join(dfs["assign"]["assign"])
posterior["posterior_assign"] = posterior.apply(lambda sample: sample[sample["assign"]], axis=1)
posterior = posterior.join(dfs["grp"]["grp"])
posterior["posterior_grp"] = posterior.apply(lambda sample: sample[sample["grp"]], axis=1)

# Sample coordinates plus the assignment / group columns and their looked-up posterior values.
coordinates = dfs["ind.coord"].rename(columns=_strip_prefix("ind.coord."))
extra_columns = [
    dfs["assign"]["assign"],
    dfs["grp"]["grp"],
    posterior["posterior_assign"],
    posterior["posterior_grp"],
]
df = coordinates.join(extra_columns)

In [4]:
# Read the two sheets of the workbook `fn` that carry the location info.
# `sheet_name` is the current keyword: pandas deprecated `sheetname` in 0.21 and removed it
# in 1.0, where the old spelling raises a TypeError.
# NOTE(review): the workbook path is not prefixed with `data_directory`, unlike the CSV
# inputs -- confirm the file is really expected next to the notebook.
s1 = pd.read_excel(fn, sheet_name="Sheet1")
s2 = pd.read_excel(fn, sheet_name="Sheet2")

In [5]:
# Pull the "MS" number from the end of each sample name so it can be cross-referenced with sheet2.
def _ms_number(sample_key):
    """Return the last "_"-separated token of `sample_key` as an int, or -1 if it is not all digits."""
    suffix = sample_key.split("_")[-1]
    if suffix.isdigit():
        return int(suffix)
    return -1


df["MS"] = df["key"].apply(_ms_number)

In [6]:
# function which uses the "MS" number and sheet2 to retrieve the code
def code(ms, table=None):
    """Return the integer "Code" whose MS range contains `ms`, or -1 if no range does.

    Parameters
    ----------
    ms : number
        The MS number to look up.
    table : pandas.DataFrame, optional
        Range table with "MS_start", "MS_end" and "Code" columns. When omitted, the
        notebook-level `s2` frame is used, as before.

    Returns
    -------
    int
        "Code" of the first row with MS_start <= ms <= MS_end (both ends inclusive),
        or -1 when `ms` falls in no row's range.
    """
    if table is None:
        # Resolved at call time so the function keeps working with the notebook's sheet2 frame.
        table = s2
    for start, end, value in zip(table["MS_start"], table["MS_end"], table["Code"]):
        if start <= ms <= end:
            return int(value)
    return -1

In [7]:
# Retrieve the code for each sample from its "MS" number.
df["Code"] = df["MS"].map(code)

In [8]:
# Order the samples by code, then by sample name within each code.
sort_columns = ["Code", "key"]
df = df.sort_values(by=sort_columns)

In [9]:
# Attach the sheet2 and sheet1 columns to every sample, matched on "Code".
# Left joins keep every sample, including those whose code has no match.
s2_columns = ["Code", "Country/Territory", "Island/Province/State"]
s1_columns = [
    "Code",
    "Locality",
    "Decimal Latitude",
    "Decimal Longitude",
    "Elevation",
    "Date collected",
    "Collector",
    "Attractant",
    "Sex",
    "Comments",
    "Comments 2",
]
df = pd.merge(df, s2[s2_columns], on="Code", how="left")
df = pd.merge(df, s1[s1_columns], on="Code", how="left")

In [10]:
# For the samples missing a latitude that have Country/Territory information,
# look up the lat/lng of the Country/Territory (one geocoder request per distinct name).
g_df = df.loc[(pd.isnull(df["Decimal Latitude"])) & (pd.notnull(df["Country/Territory"]))]
# A lookup that fails may leave `latlng` empty or None; store (NaN, NaN) in that case so
# that g[name][0] / g[name][1] stay valid and the sample simply remains without coordinates.
g = {
    name: (geocoder.google(name).latlng or (np.nan, np.nan))
    for name in g_df["Country/Territory"].unique()
}

In [11]:
# Write the looked-up lat/lng into the sample data set: a missing coordinate is replaced by
# the coordinate geocoded for the sample's Country/Territory when `g` has one.
def _country_coordinate(country, position):
    """Return element `position` (0 = latitude, 1 = longitude) of g[country], or NaN.

    NaN is returned when the country is null, was never geocoded (`g` only covers rows
    that were missing a latitude), or its lookup produced no coordinates.
    """
    if pd.isnull(country):
        return np.nan
    latlng = g.get(country)
    return latlng[position] if latlng else np.nan


for _column, _position in (("Decimal Latitude", 0), ("Decimal Longitude", 1)):
    _missing = pd.isnull(df[_column])
    _fallback = df["Country/Territory"].apply(_country_coordinate, args=(_position,))
    # Only rows whose coordinate is missing are touched; existing values are kept as-is.
    df.loc[_missing, _column] = _fallback[_missing]

In [12]:
# Make sure both coordinate columns are stored as 64-bit floats.
for coordinate_column in ("Decimal Latitude", "Decimal Longitude"):
    df[coordinate_column] = df[coordinate_column].astype("float64")

In [13]:
# Clean up nulls: map every null-like cell to None, leaving all other values untouched.
def _none_if_null(value):
    """Return None when pandas considers `value` null, otherwise return `value` unchanged."""
    if pd.isnull(value):
        return None
    return value


df = df.applymap(_none_if_null)

In [14]:
# Keep the sample name and its coordinates, dropping any sample that lacks one of the three values.
location_columns = ["key", "Decimal Latitude", "Decimal Longitude"]
out_df = df[location_columns].dropna(axis=0, how="any")

In [15]:
# Save the per-sample locations next to the input data, without the DataFrame index column.
location_csv_path = data_directory + "location.csv"
out_df.to_csv(location_csv_path, index=False)