In [9]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import networkx as nx
import pandas as pd
from itertools import combinations
import math

CPU times: user 738 µs, sys: 475 µs, total: 1.21 ms
Wall time: 1.22 ms


In [39]:
#G = nx.DiGraph()
data = pd.read_csv("City_Trees.csv",usecols=["SPECIES","Latitude","Longitude"])
data.dropna(inplace = True)
trees = data["SPECIES"].str.split(" ", n =1, expand = True)
data["genus"] = trees[0]
data["species"] = trees[1]
#data['ID'] = range(len(data))
data.drop(columns = ["SPECIES"], inplace= True)
data = data[data.genus != 'Planting']
data = data[data.species != 'Site']
data = data[data.species != ' species']

data = data.reset_index()
#data = data.transpose()
n_attr = len(data.index.values)
attr = data.index.values
data.drop(columns = ["index"], inplace= True)
data['id'] = np.arange(len(data))
data.head()

#small, poorly sampled group to run code on
data200 = data.iloc[:200]
data200.to_csv('data200.csv')
data200.head()

Unnamed: 0,Latitude,Longitude,genus,species,id
0,37.884954,-122.278251,Prunus,cerasifera,0
1,37.884965,-122.278166,Liquidambar,styraciflua,1
2,37.884816,-122.277695,Prunus,cerasifera,2
3,37.884632,-122.277194,Aesculus,californica,3
4,37.884794,-122.275969,Platanus,hybrida,4


In [44]:
#take smaller area of data

lat_lon_bounds = {}

lat_lon_bounds['BL'] = (37.88005, -122.26978)
lat_lon_bounds['BR'] = (37.88134, -122.25939)
lat_lon_bounds['TL'] = (37.88547, -122.26999)
lat_lon_bounds['TR'] = (37.88672, -122.26042)



latitude = data['Latitude']
cleaned_lat = data[(latitude <= lat_lon_bounds['TR'][0]) & (latitude >= lat_lon_bounds["BL"][0])]

longitude = data['Longitude']

cleaned_data = cleaned_lat[(longitude <= lat_lon_bounds["BR"][1]) & (longitude >= lat_lon_bounds["TL"][1])]
cleaned_data = cleaned_data.reset_index()
cleaned_data.drop(columns = ["index"], inplace= True)
cleaned_data['id'] = np.arange(len(cleaned_data))

cleaned_data.to_csv('cleaned_data.csv')

cleaned_data.head()



Unnamed: 0,Latitude,Longitude,genus,species,id
0,37.881406,-122.269948,Ginkgo,biloba,0
1,37.881301,-122.269876,Ginkgo,biloba,1
2,37.881092,-122.269708,Ginkgo,biloba,2
3,37.881012,-122.269623,Ginkgo,biloba,3
4,37.88092,-122.269599,Ginkgo,biloba,4


In [37]:
def haversine(coord1, coord2):
    R = 6372800  # Earth radius in meters
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    
    phi1, phi2 = math.radians(lat1), math.radians(lat2) 
    dphi       = math.radians(lat2 - lat1)
    dlambda    = math.radians(lon2 - lon1)
    
    a = math.sin(dphi/2)**2 + \
        math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    
    return 2*R*math.atan2(math.sqrt(a), math.sqrt(1 - a))

def calc_dist(node_tuple):
    first_node = node_tuple[0]
    second_node = node_tuple[1]
    return haversine(cleaned_data.iloc[first_node][0:2], cleaned_data.iloc[second_node][0:2])

def calc_weight(node_tuple):
    #returns firstly, the weight, secondly, the calculated distance, 
    #and lastly, 2 if different genus+species, 1 if different species but same genus
    outer_ring_threshold = 100
    inner_ring_threshold = 25
    
    
    dist = calc_dist(node_tuple)
    first_node = node_tuple[0]
    second_node = node_tuple[1]
    genus1 = cleaned_data.iloc[first_node][2]
    species1 = cleaned_data.iloc[first_node][3]
    genus2 = cleaned_data.iloc[second_node][2]
    species2 = cleaned_data.iloc[second_node][3]
    
    if dist < inner_ring_threshold: 
        if genus1 != genus2:
            return 4, dist, 2
        elif species1 != species2:
            return 3, dist, 1
        else: 
            return 0, 0, 0
    elif dist < outer_ring_threshold: 
        if genus1 != genus2:
            return 2, dist, 2
        elif species1 != species2:
            return 1, dist, 1
        else: 
            return 0, 0, 0 
    else: 
        return 0, 0, 0

In [38]:
%%time
links = []

for comb_of_2_plant_ids in combinations(cleaned_data['id'], 2):
    weight, dist, dif_genus_species = calc_weight(comb_of_2_plant_ids)
    if weight != 0:
        links.append([comb_of_2_plant_ids[0], comb_of_2_plant_ids[1], weight, dist, dif_genus_species])
        
links = np.array(links)

print(links)
print(len(links))

[[  0.          12.           2.          65.4828245    2.        ]
 [  0.          47.           2.          95.64482236   2.        ]
 [  0.          48.           2.          85.71681616   2.        ]
 ...
 [838.         841.           2.          47.36687077   2.        ]
 [839.         841.           2.          48.71928612   2.        ]
 [840.         841.           2.          59.27323968   2.        ]]
16063
CPU times: user 11min, sys: 1.91 s, total: 11min 2s
Wall time: 11min 2s


In [41]:
edges = pd.DataFrame({"first_node": links[:,0], 
                      "second_node": links[:,1], 
                      "weight": links[:,2],
                      "dist": links[:,3],
                      "dif_genus_species": links[:,4]})
edges.to_csv('edges.csv')
print(edges.shape)
edges.head(20)
edges[edges.first_node==0]


(16063, 5)


Unnamed: 0,first_node,second_node,weight,dist,dif_genus_species
0,0.0,12.0,2.0,65.482824,2.0
1,0.0,47.0,2.0,95.644822,2.0
2,0.0,48.0,2.0,85.716816,2.0
3,0.0,137.0,2.0,43.735879,2.0
4,0.0,138.0,2.0,60.596898,2.0
5,0.0,139.0,2.0,67.682471,2.0
6,0.0,152.0,4.0,16.377739,2.0
7,0.0,153.0,4.0,11.451128,2.0
8,0.0,154.0,2.0,65.482824,2.0
9,0.0,186.0,2.0,64.74916,2.0


In [43]:
edges[edges.weight==1]

Unnamed: 0,first_node,second_node,weight,dist,dif_genus_species
320,18.0,104.0,1.0,89.716331,1.0
512,31.0,37.0,1.0,44.989702,1.0
513,31.0,38.0,1.0,49.418664,1.0
551,31.0,837.0,1.0,48.393914,1.0
748,37.0,42.0,1.0,68.729599,1.0
781,38.0,42.0,1.0,64.261334,1.0
984,45.0,501.0,1.0,42.078752,1.0
1010,46.0,501.0,1.0,31.115471,1.0
1076,49.0,497.0,1.0,83.305446,1.0
1077,49.0,498.0,1.0,79.989750,1.0
