<b>MSDS697 Spring I 2023 - Final Project</b><br>
<b>Group 15</b> - Project Bears<br>
<b>Members</b> - Sharon Dodda, Ensun Pak

This notebook is used only to encode the labels of the training dataset. It cannot be part of the ML pipeline because only the historical data is used as training data, and encoding that data with NLCD land codes takes too long to be practical.

In [None]:
# Import packages
# The star import stays first so that the explicit imports below win any name clash.
from pyspark.sql.functions import *

import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd



In [None]:
# Initialize parameters to connect to MongoDB Atlas.
# Every value is read from the environment so that no secret is stored in the notebook.
mongo_username = os.environ.get("MONGO_USERNAME")
mongo_password = os.environ.get("MONGO_PASSWORD")
mongo_ip_address = os.environ.get("MONGO_IP")
# NOTE(review): database_name is read here, but the cells that talk to MongoDB
# assign their own `database` value -- confirm which one is intended.
database_name = os.environ.get("MONGO_DB_NAME")

# Report unset variables by NAME only (never echo their values) so that a
# mis-configured environment is visible here rather than as an opaque
# connection failure in a later cell.
_missing_env = [
    env_name
    for env_name, env_value in (
        ("MONGO_USERNAME", mongo_username),
        ("MONGO_PASSWORD", mongo_password),
        ("MONGO_IP", mongo_ip_address),
    )
    if not env_value
]
if _missing_env:
    print(
        f"WARNING: unset environment variable(s): {', '.join(_missing_env)}; "
        "the MongoDB connection will fail"
    )

# Credentials must be percent-encoded before they are embedded in the URI;
# otherwise characters such as '@', ':' or '/' in a password are parsed as
# URI delimiters and the connection string becomes malformed.
connectionString = (
    "mongodb+srv://"
    f"{quote_plus(mongo_username or '')}:{quote_plus(mongo_password or '')}"
    f"@{mongo_ip_address}"
)

Process base training data

In [None]:
# Load the `inat_historical` collection from MongoDB into a Spark DataFrame.
# `database` and `collection` are kept as notebook-level names, as before.
database = "msds697_bears"
collection = "inat_historical"

inat_historical = (
    spark.read.format("mongo")
    .option("spark.mongodb.input.uri", connectionString)
    .option("database", database)
    .option("collection", collection)
    .load()
)

In [None]:
# Expose the first two elements of `coords` as `lat` / `lon` columns on the Spark frame.
# Note: these two columns are NOT part of the selection below, so they stay on
# `inat_historical` only; the pandas frame ends up with exactly five columns.
coords_column = col('coords')
inat_historical = (
    inat_historical
    .withColumn('lat', coords_column[0])
    .withColumn('lon', coords_column[1])
)

# Column order matters: later cells address `bears_df` columns by position.
training_fields = [
    'coords',
    'county',
    'observed_period.date_unit.day',
    'observed_period.date_unit.month',
    'observed_period.date_unit.year',
]

# Select the training fields and collect them to the driver as a pandas DataFrame
bears_df = inat_historical.select(*training_fields).toPandas()

In [None]:
def dist_btw_2_coords(coord1, coord2):
    """
    Calculates the great-circle (haversine) distance, in miles, between two
    coordinate points.

    Parameters
    ----------
    coord1, coord2 : sequence of two numbers
        ``(latitude, longitude)`` in decimal degrees. Only elements 0 and 1
        are read.

    Returns
    -------
    float
        Distance in miles measured along the surface of a sphere whose radius
        is 6373 km.
    """
    # NOTE(review): presumably imported locally because the notebook star-imports
    # pyspark.sql.functions, which defines Column-based functions with these same
    # names; a local import guarantees the plain math versions are used here.
    from math import sin, cos, sqrt, atan2, radians

    # Approximate radius of earth in km
    R = 6373.0
    KM_TO_MILES = 0.621371

    lat1 = radians(coord1[0])
    lon1 = radians(coord1[1])
    lat2 = radians(coord2[0])
    lon2 = radians(coord2[1])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Haversine term: a == sin^2(central_angle / 2), mathematically within [0, 1].
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. Clamp it.
    # (A plain comparison is used instead of the min() builtin on purpose.)
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance_in_miles = R * c * KM_TO_MILES

    return distance_in_miles

In [None]:
# Calculate the target label: 1 when at least one OTHER sighting lies within
# PROXIMITY_MILES of this sighting, 0 otherwise.
#
# The relation is symmetric ("within 2 miles of each other"), so BOTH members
# of a close pair are labelled 1. Comparing each row only against the rows
# after it would leave the later member of every pair -- and always the last
# row -- labelled 0.
PROXIMITY_MILES = 2

# Pull the coordinates out of the frame once; indexing the DataFrame inside
# the nested loop is far slower than indexing a plain list.
sighting_coords = bears_df['coords'].tolist()
n_sightings = bears_df.shape[0]
near_another = [0] * n_sightings

for i in range(n_sightings):
    for j in range(i + 1, n_sightings):
        # Both already known to have a neighbour: nothing new to learn from this pair.
        if near_another[i] and near_another[j]:
            continue
        if dist_btw_2_coords(sighting_coords[i], sighting_coords[j]) <= PROXIMITY_MILES:
            near_another[i] = near_another[j] = 1

# On a fresh run this appends `label` as the sixth column (position 5).
bears_df['label'] = near_another

In [None]:
# Split `coords` into separate `lat` / `lon` columns, keeping five decimal places.
def _truncate_5dp(value):
    """Drop everything after the fifth decimal place (int() truncates toward zero; no rounding)."""
    return int(value * 100000) / 100000


# Element 0 of `coords` becomes `lat`, element 1 becomes `lon`.
for coord_pos, coord_name in enumerate(('lat', 'lon')):
    bears_df[coord_name] = bears_df.apply(
        lambda row, pos=coord_pos: _truncate_5dp(row['coords'][pos]),
        axis=1,
    )

Add NLCD land code to the historical training data

In [None]:
# Load the `nlcd_ca_data` collection from MongoDB.
# `database` and `collection` are kept as notebook-level names, as before.
database = "msds697_bears"
collection = "nlcd_ca_data"

nlcd_reader = (
    spark.read.format("mongo")
    .option("spark.mongodb.input.uri", connectionString)
    .option("database", database)
    .option("collection", collection)
)

# Keep only the three fields used downstream (in this order) and collect the
# result to the driver as a pandas DataFrame.
nlcd_ca = nlcd_reader.load().select('lat', 'lon', 'code').toPandas()

In [None]:
# Add land code to the training set: every sighting receives the `code` of the
# NLCD point that is closest to it by great-circle (haversine) distance.
#
# The search is still exhaustive (every sighting against every NLCD point), but
# each sighting is now handled by a handful of NumPy array operations instead of
# copying the NLCD frame, running a row-wise Python apply and sorting it.
# The previous row-wise version was reported to take about 1 hour.
bears_df2 = bears_df.copy()

# NLCD points without coordinates can never be the nearest point; drop them so
# that NaN values cannot win the argmin below.
nlcd_points = nlcd_ca.dropna(subset=['lat', 'lon'])
if nlcd_points.shape[0] == 0:
    raise ValueError("nlcd_ca has no rows with coordinates; cannot look up land codes")

# Convert everything to radians once, outside the loop.
nlcd_lat_rad = np.radians(nlcd_points['lat'].to_numpy(dtype=float))
nlcd_lon_rad = np.radians(nlcd_points['lon'].to_numpy(dtype=float))
nlcd_cos_lat = np.cos(nlcd_lat_rad)
nlcd_codes = nlcd_points['code'].to_numpy()

bear_lat_rad = np.radians(bears_df2['lat'].to_numpy(dtype=float))
bear_lon_rad = np.radians(bears_df2['lon'].to_numpy(dtype=float))

nearest_idx = np.empty(bears_df2.shape[0], dtype=np.intp)
for i in range(bears_df2.shape[0]):
    # Haversine term a = sin^2(central_angle / 2). The distance grows
    # monotonically with `a`, so the smallest `a` marks the nearest NLCD point
    # and the distance itself never has to be computed.
    haversine_term = (
        np.sin((nlcd_lat_rad - bear_lat_rad[i]) / 2) ** 2
        + np.cos(bear_lat_rad[i]) * nlcd_cos_lat
        * np.sin((nlcd_lon_rad - bear_lon_rad[i]) / 2) ** 2
    )
    nearest_idx[i] = np.argmin(haversine_term)
    print(f" bears_df2 row {i}", end="\r")

# Addressed by name rather than by column position, so the result does not
# depend on the order in which earlier cells added their columns.
bears_df2['code'] = nlcd_codes[nearest_idx]

 bears_df2 row 1304


 bears_df2 row 5544

In [None]:
# Turn the labelled pandas frame back into a Spark DataFrame and store it in
# the `training_data` collection on MongoDB.
bears = spark.createDataFrame(bears_df2)

# NOTE(review): mode("append") adds to whatever the collection already holds,
# so running this cell more than once stores the same rows again -- confirm
# that this is intended.
(
    bears.write.format("mongo")
    .option("spark.mongodb.output.uri", connectionString)
    .option("database", database)
    .option("collection", "training_data")
    .mode("append")
    .save()
)