# ADA Project
## Data cleaning

### Cleaning - Wolf data set

In [33]:
import pandas as pd

# Load the dataset
file_path = '../Data/Hebblewhite Alberta-BC Wolves.csv'
wolves_data = pd.read_csv(file_path)

# Rename the column "comments" to "Pack name"
wolves_data.rename(columns={'comments': 'Pack name'}, inplace=True)

# Convert any timestamps to datetime format if they're not already
wolves_data['timestamp'] = pd.to_datetime(wolves_data['timestamp'])

# Filter the data to include only observations from 2001 to 2011
wolves_data = wolves_data[(wolves_data['timestamp'].dt.year >= 2001) & (wolves_data['timestamp'].dt.year <= 2011)]

# Add 'individual-id' and 'animal-type' columns
wolves_data['individual-id'] = wolves_data['individual-local-identifier']
wolves_data['animal-type'] = 'Wolf'

# Filter the data to include only the specified packs
packs = ["Wildhorse", "Ranch", "Red Deer", "Cascade", "Bow Valley"]
wolves_data = wolves_data[wolves_data['Pack name'].isin(packs)]

# Update the 'Pack name' column for observations with individual-id B065
wolves_data.loc[wolves_data['individual-id'] == 'B065', 'Pack name'] = 'Cascade'

# Rearrange the columns to match the elk DataFrame
wolves_data = wolves_data[['individual-id', 'event-id', 'timestamp', 'location-long', 'location-lat', 'animal-type', 'Pack name']]

# Save the cleaned data
output_path = '/Users/robin/Library/Mobile Documents/com~apple~CloudDocs/UNIL/HEC cours/Master/MA 2/Advanced Data Analysis/ADA_Project/Data/clean_data_wolf.csv'
wolves_data.to_csv(output_path, index=False)

# Display the head to verify
print(wolves_data.head())

  wolves_data = pd.read_csv(file_path)


  individual-id    event-id           timestamp  location-long  location-lat  \
0          B042  2048103217 2002-12-20 19:02:06    -115.901616     51.656829   
1          B042  2048103218 2002-12-20 23:01:18    -115.722444     51.686639   
2          B042  2048103219 2002-12-21 03:02:10    -115.787129     51.679586   
3          B042  2048103220 2002-12-21 08:02:11    -115.819219     51.681042   
4          B042  2048103221 2002-12-21 09:01:30    -115.818068     51.680339   

  animal-type Pack name  
0        Wolf  Red Deer  
1        Wolf  Red Deer  
2        Wolf  Red Deer  
3        Wolf  Red Deer  
4        Wolf  Red Deer  


### Cleaning - Elk data set

In [36]:
import pandas as pd

# Load the dataset
file_path = '/Users/robin/Library/Mobile Documents/com~apple~CloudDocs/UNIL/HEC cours/Master/MA 2/Advanced Data Analysis/ADA_Project/Data/Ya Ha Tinda elk project, Banff National Park, 2001-2023 (females).csv'
elk_data = pd.read_csv(file_path)

# Convert any timestamps to datetime format if they're not already
elk_data['timestamp'] = pd.to_datetime(elk_data['timestamp'])

# Add a new column 'animal-type' with the value 'Elk' for all rows
elk_data['animal-type'] = 'Elk'

# Rename the individual ID column
elk_data['individual-id'] = elk_data['individual-local-identifier']

# Filter the DataFrame to include only data from 2001 to 2011
elk_data = elk_data[elk_data['timestamp'].dt.year.between(2001, 2011)]

# Now create the cleaned DataFrame with only the necessary columns
elk_data = elk_data[['individual-id','event-id', 'timestamp', 'location-long', 'location-lat', 'animal-type']].copy()

# Save the cleaned data
output_path = '/Users/robin/Library/Mobile Documents/com~apple~CloudDocs/UNIL/HEC cours/Master/MA 2/Advanced Data Analysis/ADA_Project/Data/clean_data_elk.csv'
elk_data.to_csv(output_path, index=False)

print(elk_data.head())

  individual-id     event-id           timestamp  location-long  location-lat  \
0          4049  15155700828 2001-12-13 07:01:12    -115.804350     52.124099   
1          4049  15155702143 2001-12-13 09:01:07    -115.800270     52.117616   
2          4049  15155694994 2001-12-14 09:01:05    -115.828125     52.096107   
3          4049  15155693463 2001-12-14 11:00:49    -115.831771     52.098293   
4          4049  15155700885 2001-12-14 17:02:19    -115.804198     52.094824   

  animal-type  
0         Elk  
1         Elk  
2         Elk  
3         Elk  
4         Elk  


In [17]:
import os

absolute_path = os.path.dirname(__file__)
relative_path = "src/lib"
full_path = os.path.join(absolute_path, relative_path)

NameError: name '__file__' is not defined

### Merge clean_data_elk and clean_data_wolf

In [11]:
# Add a 'year_month' column to both datasets
wolves_data['year_month'] = wolves_data['timestamp'].dt.to_period('M')
elk_data['year_month'] = elk_data['timestamp'].dt.to_period('M')

# Get the unique year-month periods from wolf data
wolf_year_months = set(wolves_data['year_month'].unique())

# Filter the elk data to only include observations that match the wolf year-month periods
elk_data = elk_data[elk_data['year_month'].isin(wolf_year_months)]

# Concatenate the two DataFrames
combined_data = pd.concat([elk_data, wolves_data], ignore_index=True)

# If you no longer need the 'year_month' column, you can drop it before saving
combined_data = combined_data.drop(columns=['year_month'])

# Save the combined data
combined_data_path = '/Users/robin/Library/Mobile Documents/com~apple~CloudDocs/UNIL/HEC cours/Master/MA 2/Advandced Data Analysis/Project/Data/merged_data.csv'
combined_data.to_csv(combined_data_path, index=False)

print(combined_data.tail())

       individual-id    event-id           timestamp  location-long  \
201264          JW02  2048277689 2011-03-16 12:00:25    -115.723292   
201265          JW02  2048277690 2011-03-16 14:01:11    -115.724644   
201266          JW02  2048277691 2011-03-16 16:02:57    -115.740961   
201267          JW02  2048277692 2011-03-16 18:01:26    -115.743783   
201268          JW02  2048277693 2011-03-16 20:01:26    -115.743720   

        location-lat animal-type   Pack name  
201264     51.178754        Wolf  Bow Valley  
201265     51.178865        Wolf  Bow Valley  
201266     51.185789        Wolf  Bow Valley  
201267     51.192357        Wolf  Bow Valley  
201268     51.192093        Wolf  Bow Valley  


#### Distance travelled between each observation 

In [12]:
import pandas as pd
from geopy.distance import geodesic

# Load the dataset
file_path = '/Users/robin/Library/Mobile Documents/com~apple~CloudDocs/UNIL/HEC cours/Master/MA 2/Advandced Data Analysis/Project/Data/merged_data.csv'
data = pd.read_csv(file_path)

# Ensure the timestamp column is a datetime object and sort by 'individual-id' and 'timestamp'
data['timestamp'] = pd.to_datetime(data['timestamp'])
data.sort_values(by=['individual-id', 'timestamp'], inplace=True)

# Filter out the wolf data
wolf_data = data[data['animal-type'] == 'Wolf'].copy()

# Initialize an empty list to store distances
distances = []

# Loop through the DataFrame by each wolf
for wolf_id in wolf_data['individual-id'].unique():
    # Get the specific wolf's data
    wolf_observations = wolf_data[wolf_data['individual-id'] == wolf_id].sort_values('timestamp')
    
    # Start with a distance of 0.0 for the first observation of each wolf
    individual_distances = [0.0]
    
    # Compute the distance between each consecutive observation
    for i in range(1, len(wolf_observations)):
        # Get the current and previous observation's location
        current_location = (wolf_observations.iloc[i]['location-lat'], wolf_observations.iloc[i]['location-long'])
        previous_location = (wolf_observations.iloc[i - 1]['location-lat'], wolf_observations.iloc[i - 1]['location-long'])
        
        # Calculate the distance and append to the individual_distances list
        distance = geodesic(previous_location, current_location).kilometers
        individual_distances.append(distance)

    # Extend the main distances list with the individual distances
    distances.extend(individual_distances)

# Add the distances as a new column to the wolf_data DataFrame
wolf_data['distance_traveled'] = distances

# merge this information back into the main DataFrame
data = data.drop(columns=['distance_traveled'], errors='ignore')  # Drop the column if it already exists
data = data.merge(wolf_data[['individual-id', 'timestamp', 'distance_traveled']], on=['individual-id', 'timestamp'], how='left')

data.to_csv('/Users/robin/Library/Mobile Documents/com~apple~CloudDocs/UNIL/HEC cours/Master/MA 2/Advandced Data Analysis/Project/Data/merged_data.csv', index=False)


  data = pd.read_csv(file_path)


### Add proximity metrics

In [14]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from geopy.distance import geodesic

# Load the dataset
file_path = '/Users/robin/Library/Mobile Documents/com~apple~CloudDocs/UNIL/HEC cours/Master/MA 2/Advandced Data Analysis/Project/Data/merged_data.csv'
data = pd.read_csv(file_path)

# Separate elk and wolf data based on 'animal-type'
elk_data = data[data['animal-type'] == 'Elk']
wolf_data = data[data['animal-type'] == 'Wolf']

# Perform K-Means clustering on elk data
elk_coords = elk_data[['location-lat', 'location-long']]
kmeans = KMeans(n_clusters=11, random_state=0).fit(elk_coords)
centroids = kmeans.cluster_centers_

# Initialize list to store proximity data
proximity_values = []

# Calculate proximity for each wolf observation
for index, wolf in wolf_data.iterrows():
    nearest_distance = float('inf')
    for centroid in centroids:
        distance = geodesic((wolf['location-lat'], wolf['location-long']), centroid).kilometers
        if distance < nearest_distance:
            nearest_distance = distance

    # Calculate proximity metric and store it with index to ensure alignment
    proximity_value = 1 / (nearest_distance + 1)  # Inverse of distance to model closeness
    proximity_values.append({'index': index, 'Proximity': proximity_value})

# Create a DataFrame from the proximity values
proximity_df = pd.DataFrame(proximity_values).set_index('index')

# Join the proximity data back to the wolf data using the index
wolf_data = wolf_data.join(proximity_df, on=wolf_data.index, rsuffix='_proximity')

# Update the main data DataFrame with the new proximity values for wolves
data.loc[wolf_data.index, 'Proximity'] = wolf_data['Proximity']

# Save the updated DataFrame
data.to_csv('/Users/robin/Library/Mobile Documents/com~apple~CloudDocs/UNIL/HEC cours/Master/MA 2/Advandced Data Analysis/Project/Data/merged_data.csv', index=False)


  data = pd.read_csv(file_path)


#### Set index as int 
Indexing was altered in the merging process

In [15]:
import pandas as pd

# Load the dataset
file_path = '/Users/robin/Library/Mobile Documents/com~apple~CloudDocs/UNIL/HEC cours/Master/MA 2/Advandced Data Analysis/Project/Data/merged_data.csv'
data = pd.read_csv(file_path)

# Convert the 'event-id' column to integer type
data['event-id'] = data['event-id'].astype(int)

# Optionally, save the DataFrame if you need to preserve changes
data.to_csv(file_path, index=False)

  data = pd.read_csv(file_path)
