In [20]:
# !git clone https://github.com/kailas93/Tree_species.git

In [1]:
import pandas as pd
import glob
import os
import zipfile
import numpy as np

# Directory holding the extracted per-city CSV files. If only the archive is
# available, extract it first, e.g.:
#     with zipfile.ZipFile("/content/archive(1).zip", "r") as zip_ref:
#         zip_ref.extractall(extract_dir)
extract_dir = "5M_trees"

# Columns kept from every CSV file
selected_columns = [
    'common_name', 'scientific_name', 'city', 'state',
    'longitude_coordinate', 'latitude_coordinate', 'address', 'condition',
    'native', 'height_binned_M', 'diameter_breast_height_binned_CM',
    'location_type', 'zipcode', 'neighborhood', 'location_name', 'ward',
    'district', 'overhead_utility', 'diameter_breast_height_CM', 'height_M'
]

# Files in the dataset directory that are not tree records
exclude_files = {'Column_Headers_Dryad.csv', 'README_Dryad.txt'}

# Sorted so that the concatenation order (and therefore the tree_id numbering)
# does not depend on the filesystem's listing order.
csv_files = sorted(
    f for f in glob.glob(os.path.join(extract_dir, "*.csv"))
    if os.path.basename(f) not in exclude_files
)


def is_git_lfs_pointer(path):
    """Return True if the first line of ``path`` contains the text "git-lfs".

    Undecodable bytes are replaced rather than raising, so probing a real CSV
    in an unexpected encoding cannot fail here.
    """
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return "git-lfs" in f.readline()


def make_sample_trees(n_samples=10000, seed=42):
    """Build a random demonstration DataFrame with the ``selected_columns`` schema.

    Seeds NumPy's global random state with ``seed`` (a side effect visible to
    later ``np.random`` calls) and draws every column independently, so city
    and state values are not geographically consistent with each other.
    """
    np.random.seed(seed)

    tree_species = ['Oak', 'Maple', 'Pine', 'Birch', 'Cedar', 'Elm', 'Willow', 'Poplar', 'Ash', 'Cherry']
    cities = ['Austin', 'Dallas', 'Houston', 'Seattle', 'Portland', 'Denver', 'Phoenix', 'Chicago', 'Boston', 'New York']
    states = ['TX', 'WA', 'OR', 'CO', 'AZ', 'IL', 'MA', 'NY']

    # The order of the random draws below determines the generated values;
    # keep it stable to reproduce the same sample data for a given seed.
    sample_data = {
        'common_name': np.random.choice(tree_species, n_samples),
        'scientific_name': [f"Species_{i}" for i in range(n_samples)],
        'city': np.random.choice(cities, n_samples),
        'state': np.random.choice(states, n_samples),
        'longitude_coordinate': np.random.uniform(-125.0, -67.0, n_samples),
        'latitude_coordinate': np.random.uniform(25.0, 49.0, n_samples),
        'address': [f"Address_{i}" for i in range(n_samples)],
        'condition': np.random.choice(['Good', 'Fair', 'Poor'], n_samples),
        'native': np.random.choice(['Native', 'Non-native'], n_samples),
        'height_binned_M': np.random.uniform(1.0, 30.0, n_samples),
        'diameter_breast_height_binned_CM': np.random.uniform(5.0, 100.0, n_samples),
        'location_type': np.random.choice(['Street', 'Park', 'Yard'], n_samples),
        'zipcode': np.random.randint(10000, 99999, n_samples),
        'neighborhood': [f"Neighborhood_{i%50}" for i in range(n_samples)],
        'location_name': [f"Location_{i}" for i in range(n_samples)],
        'ward': np.random.randint(1, 20, n_samples),
        'district': np.random.randint(1, 10, n_samples),
        'overhead_utility': np.random.choice(['Yes', 'No'], n_samples),
        'diameter_breast_height_CM': np.random.uniform(5.0, 100.0, n_samples),
        'height_M': np.random.uniform(1.0, 30.0, n_samples)
    }
    return pd.DataFrame(sample_data)


def load_tree_csvs(files, columns):
    """Read ``files`` and concatenate them into one frame with exactly ``columns``.

    Only the requested columns are parsed, which avoids holding unused columns
    in memory. A column absent from a file is reported and filled with NaN
    instead of raising ``KeyError``.
    """
    wanted = set(columns)
    frames = []
    for path in files:
        frame = pd.read_csv(path, usecols=lambda name: name in wanted, low_memory=False)
        missing = [name for name in columns if name not in frame.columns]
        if missing:
            print(f"⚠️  {os.path.basename(path)} is missing columns: {missing}")
        # reindex restores the canonical column order and adds absent columns as NaN
        frames.append(frame.reindex(columns=columns))
    return pd.concat(frames, ignore_index=True)


# Pick a data source: nothing found, LFS pointer stubs (fall back to sample
# data), or real CSV files. Only the first file is probed for the LFS marker.
if not csv_files:
    print("❌ No CSV files found!")
    merged_df = pd.DataFrame()
elif is_git_lfs_pointer(csv_files[0]):
    print("⚠️  CSV files are Git LFS pointers, not actual data!")
    print("📥 Please download real data from: https://datadryad.org/stash/dataset/doi:10.5061/dryad.2rbnzs7hs")
    print("🔄 Creating sample data for demonstration...")
    merged_df = make_sample_trees()
    print(f"✅ Created sample dataset with {len(merged_df):,} records")
else:
    print("✅ Found real CSV data files!")
    merged_df = load_tree_csvs(csv_files, selected_columns)
    print(f"✅ Processed {len(csv_files)} CSV files with {len(merged_df):,} records")

# Add tree_id column (tree_1 .. tree_N) as the first column
if not merged_df.empty:
    merged_df.insert(0, 'tree_id', ['tree_' + str(i) for i in range(1, len(merged_df) + 1)])

⚠️  CSV files are Git LFS pointers, not actual data!
📥 Please download real data from: https://datadryad.org/stash/dataset/doi:10.5061/dryad.2rbnzs7hs
🔄 Creating sample data for demonstration...
✅ Created sample dataset with 10,000 records


In [22]:
merged_df.isnull().sum()

tree_id                             0
common_name                         0
scientific_name                     0
city                                0
state                               0
longitude_coordinate                0
latitude_coordinate                 0
address                             0
condition                           0
native                              0
height_binned_M                     0
diameter_breast_height_binned_CM    0
location_type                       0
zipcode                             0
neighborhood                        0
location_name                       0
ward                                0
district                            0
overhead_utility                    0
diameter_breast_height_CM           0
height_M                            0
dtype: int64

In [23]:
# Keep only the columns whose missing-value count is at most `threshold`,
# i.e. drop every column with more than 3,038,501 missing values.
threshold = 3038501
missing_per_column = merged_df.isna().sum()
merged_df = merged_df.loc[:, missing_per_column.le(threshold)]


In [24]:
# Drop the binned diameter column (the continuous diameter_breast_height_CM
# column is kept). errors='ignore' keeps this cell safe to re-run and tolerant
# of the column having already been removed by the missing-value filter.
merged_df = merged_df.drop(columns=['diameter_breast_height_binned_CM'], errors='ignore')


In [25]:
# Rows missing a value in any of these fields are discarded
required_fields = [
    'common_name', 'scientific_name',
    'longitude_coordinate', 'latitude_coordinate',
    'condition', 'diameter_breast_height_CM',
    'address', 'city',
]
merged_df = merged_df.dropna(subset=required_fields)


In [26]:
merged_df.isnull().sum()

tree_id                      0
common_name                  0
scientific_name              0
city                         0
state                        0
longitude_coordinate         0
latitude_coordinate          0
address                      0
condition                    0
native                       0
height_binned_M              0
location_type                0
zipcode                      0
neighborhood                 0
location_name                0
ward                         0
district                     0
overhead_utility             0
diameter_breast_height_CM    0
height_M                     0
dtype: int64

In [2]:
# Step 0: Remove tree species with fewer than MIN_SAMPLES_PER_SPECIES samples.
# The same constant drives the filter, the check and the failure message so
# they cannot drift apart.
MIN_SAMPLES_PER_SPECIES = 3

species_counts = merged_df['common_name'].value_counts()
valid_species = species_counts[species_counts >= MIN_SAMPLES_PER_SPECIES].index.tolist()

# Keep only valid species
filtered_df = merged_df[merged_df['common_name'].isin(valid_species)].copy()

# Verify the filtering worked (vacuously true when nothing is left)
remaining_counts = filtered_df['common_name'].value_counts()
assert (remaining_counts >= MIN_SAMPLES_PER_SPECIES).all(), (
    f"Still has species with < {MIN_SAMPLES_PER_SPECIES} samples!"
)

# Continue with filtered data
data = filtered_df.copy()


In [28]:
data['common_name'].value_counts()

common_name
Oak       1053
Cherry    1034
Elm       1021
Willow    1017
Pine       996
Ash        994
Maple      985
Birch      971
Poplar     967
Cedar      962
Name: count, dtype: int64

In [3]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Number of neighbours retrieved per query; recommendations are a vote over them.
N_NEIGHBORS = 50

# Work on the species-filtered frame so that species removed by the
# minimum-sample filter stay out of the neighbour index.
df = filtered_df.copy()

# Optional: simplify to genus (first whitespace-separated token of the name)
df['genus'] = df['scientific_name'].str.split().str[0]

# Encode categorical variables (native, city, state) as integer codes and keep
# each column's category index so queries can be encoded the same way without
# recomputing the categories on every call.
category_index = {}
for _col in ('native', 'city', 'state'):
    _as_category = df[_col].astype('category')
    df[f'{_col}_encoded'] = _as_category.cat.codes
    category_index[_col] = _as_category.cat.categories

# Features to use
feature_cols = ['latitude_coordinate', 'longitude_coordinate', 'diameter_breast_height_CM',
                'native_encoded', 'city_encoded', 'state_encoded']
X = df[feature_cols]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Nearest Neighbors model; never ask for more neighbours than there are rows
nn_model = NearestNeighbors(n_neighbors=min(N_NEIGHBORS, len(X_scaled)), algorithm='ball_tree')
nn_model.fit(X_scaled)


def recommend_species(lat, lon, diameter_cm, native, city, state, top_n=5):
    """Recommend species by voting over the nearest neighbours of a query point.

    Parameters
    ----------
    lat, lon : float
        Query coordinates, matched against latitude_coordinate / longitude_coordinate.
    diameter_cm : float
        Query value for diameter_breast_height_CM.
    native, city, state
        Category values; each must be one of the values present in ``df``.
    top_n : int, default 5
        Maximum number of species to return.

    Returns
    -------
    list of (common_name, count) tuples, most frequent first, counted over the
    neighbours returned by ``nn_model``.

    Raises
    ------
    KeyError
        If ``native``, ``city`` or ``state`` is not a known category value.
    """
    codes = {}
    for column, value in (('native', native), ('city', city), ('state', state)):
        categories = category_index[column]
        if value not in categories:
            raise KeyError(f"Unknown {column} value {value!r}: not present in the fitted data")
        codes[column] = categories.get_loc(value)

    # Build the query with the fitted column names so the scaler receives the
    # same feature layout it was fitted on.
    query = pd.DataFrame(
        [[lat, lon, diameter_cm, codes['native'], codes['city'], codes['state']]],
        columns=feature_cols,
    )
    query_scaled = scaler.transform(query)

    _, neighbor_idx = nn_model.kneighbors(query_scaled)

    # kneighbors returns positional row numbers into the fitted matrix, hence iloc
    neighbors = df.iloc[neighbor_idx[0]]
    votes = Counter(neighbors['common_name'])  # or use 'genus'

    return votes.most_common(top_n)

In [31]:
# Example usage: list the category values present in the data, then query the
# recommender with values taken from those lists.
native_values = df['native'].unique()
city_values = sorted(df['city'].unique())
state_values = sorted(df['state'].unique())
print("Available native values:", native_values)
print("Available cities:", city_values)
print("Available states:", state_values)

# Query near Austin, TX
example_query = {
    'lat': 30.2672,
    'lon': -97.7431,
    'diameter_cm': 25.0,
    'native': 'Native',
    'city': 'Austin',
    'state': 'TX',
    'top_n': 5,
}
recommendation = recommend_species(**example_query)

print("\nRecommended tree species:")
for name, votes in recommendation:
    print(f"{name} (seen {votes} times nearby)")

Available native values: ['Native' 'Non-native']
Available cities: ['Austin', 'Boston', 'Chicago', 'Dallas', 'Denver', 'Houston', 'New York', 'Phoenix', 'Portland', 'Seattle']
Available states: ['AZ', 'CO', 'IL', 'MA', 'NY', 'OR', 'TX', 'WA']

Recommended tree species:
Pine (seen 10 times nearby)
Poplar (seen 7 times nearby)
Birch (seen 6 times nearby)
Ash (seen 6 times nearby)
Cherry (seen 5 times nearby)




In [4]:
from tqdm import tqdm
from collections import defaultdict

def evaluate_recommender(X_scaled, df, model, top_k=5, sample_size=1000):
    """Leave-one-out style check of the neighbour model on the first rows of the data.

    For each of the first ``sample_size`` rows (capped at the number of rows
    available), the row's neighbours are retrieved from ``model``, the row
    itself is excluded, and the first ``top_k`` remaining neighbours are
    checked for the row's own ``common_name``.

    Parameters
    ----------
    X_scaled : array-like
        Feature matrix; row ``i`` must correspond to ``df.iloc[i]``.
    df : DataFrame
        Frame with a ``common_name`` column.
    model
        Fitted object exposing ``kneighbors``.
    top_k : int, default 5
        Number of neighbours considered per row.
    sample_size : int, default 1000
        Number of leading rows to evaluate.

    Returns
    -------
    (hit_rate, mean_reciprocal_rank) : tuple of float
        Both are averaged over the evaluated rows; rows without a hit
        contribute 0 to the reciprocal rank.

    Raises
    ------
    ValueError
        If there are no rows to evaluate.
    """
    # Cap at the available rows so a large sample_size cannot index past the data
    n_eval = min(sample_size, len(X_scaled))
    if n_eval <= 0:
        raise ValueError("evaluate_recommender needs at least one sample to evaluate")

    reciprocal_ranks = []  # one entry per row whose species was found

    for i in tqdm(range(n_eval)):
        x_query = X_scaled[i].reshape(1, -1)
        _, indices = model.kneighbors(x_query)

        # exclude itself
        neighbor_indices = [idx for idx in indices[0] if idx != i][:top_k]
        true_species = df.iloc[i]['common_name']
        neighbor_species = df.iloc[neighbor_indices]['common_name'].tolist()

        if true_species in neighbor_species:
            rank = neighbor_species.index(true_species) + 1
            reciprocal_ranks.append(1 / rank)

    hit_rate = len(reciprocal_ranks) / n_eval
    mean_reciprocal_rank = sum(reciprocal_ranks) / n_eval

    print(f"Top-{top_k} Hit Rate: {hit_rate:.4f}")
    print(f"Mean Reciprocal Rank: {mean_reciprocal_rank:.4f}")
    return hit_rate, mean_reciprocal_rank

# Evaluate on 1000 rows; the returned (hit rate, MRR) pair is the cell's output
evaluate_recommender(
    X_scaled=X_scaled,
    df=df,
    model=nn_model,
    top_k=5,
    sample_size=1000,
)


100%|██████████| 1000/1000 [00:00<00:00, 1745.38it/s]

Top-5 Hit Rate: 0.3870
Mean Reciprocal Rank: 0.1982





(0.387, 0.19821666666666668)

In [5]:
# Persist the fitted scaler and neighbour model to disk
import joblib

for artefact, target_path in ((scaler, 'scaler.joblib'), (nn_model, 'nn_model.joblib')):
    joblib.dump(artefact, target_path)

# The dataframe (including its encoded columns) is stored as well, since it is
# needed for the category values and for looking up neighbours
df.to_pickle('tree_data.pkl')

print("Saved scaler, model and data!")

Saved scaler, model and data!


In [6]:
def get_common_locations_for_species(tree_name, top_n=10, data=None):
    """Return the (city, state) pairs where a species is recorded most often.

    Parameters
    ----------
    tree_name : str
        Value to match exactly against the ``common_name`` column.
    top_n : int, default 10
        Maximum number of locations to return.
    data : DataFrame, optional
        Frame to search; must have ``common_name``, ``city`` and ``state``
        columns. Defaults to the notebook-level ``df`` so existing calls keep
        working, while allowing the function to be used on any frame.

    Returns
    -------
    DataFrame with columns ``city``, ``state`` and ``count``, sorted by
    ``count`` descending and limited to ``top_n`` rows; or a message string
    when no row matches ``tree_name``.
    """
    source = df if data is None else data
    species_df = source[source['common_name'] == tree_name]

    if species_df.empty:
        return f"No records found for species: {tree_name}"

    # You can group by city/state or full address
    location_counts = (
        species_df.groupby(['city', 'state'])
        .size()
        .reset_index(name='count')
        .sort_values(by='count', ascending=False)
        .head(top_n)
    )

    return location_counts


In [None]:
# Show which species are present before looking any of them up
print("Available tree species in our data:")
print(df['common_name'].value_counts().head(10))

# Look up the most common locations for two species that are in the data
for tree_name in ('Oak', 'Pine'):
    top_locations = get_common_locations_for_species(tree_name)
    print(f"\nTop locations where '{tree_name}' is commonly found:")
    # The helper returns a message string when nothing matches; an empty
    # frame is reported the same way, anything else is printed as-is.
    if not isinstance(top_locations, str) and top_locations.empty:
        print(f"No records found for species: {tree_name}")
    else:
        print(top_locations)

Available tree species in our data:
common_name
Oak       1053
Cherry    1034
Elm       1021
Willow    1017
Pine       996
Ash        994
Maple      985
Birch      971
Poplar     967
Cedar      962
Name: count, dtype: int64

Top locations where 'Oak' is commonly found:
        city state  count
0     Austin    AZ     22
34    Denver    IL     21
59   Phoenix    MA     21
4     Austin    NY     19
15    Boston    WA     19
21   Chicago    OR     19
23   Chicago    WA     18
3     Austin    MA     18
51  New York    MA     18
27    Dallas    MA     17

Top locations where 'Pine' is commonly found:
        city state  count
33    Denver    CO     22
67  Portland    MA     21
52  New York    NY     20
41   Houston    CO     19
37    Denver    OR     18
3     Austin    MA     17
25    Dallas    CO     17
36    Denver    NY     17
19   Chicago    MA     17
58   Phoenix    IL     17


: 