In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored in Drive can be
# accessed through the notebook's filesystem.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages for this notebook.
# NOTE(review): `requests` is not imported in the visible cells - confirm it is still needed.
!pip install pandas requests

In [None]:
import os
import pandas as pd
import json
from google.colab import drive

# Mount Google Drive (repeats the mount performed in the first cell)
drive.mount('/content/drive')

# Define the path to the dataset: the Drive folder that holds one CSV per city
dataset_path = '/content/drive/MyDrive/archive'

# Define the states and their respective cities.
# Keys are state names; values are lists of city names. The city names are
# also used (with spaces removed) to build each city's CSV file name.
states_cities = {
    "Florida": ["Miami", "Orlando"],
    "California": ["Los Angeles", "San Francisco"],
    "New York": ["New York", "Buffalo"],
    "Texas": ["Houston", "Austin"],
    "Washington": ["Seattle", "Spokane"],
    "Maryland": ["Baltimore", "Silver Spring"],
    "Arizona": ["Phoenix", "Tucson"]
}

# Function to read and process each CSV file
def read_csv(city):
    """Return the non-null ``scientific_name`` values from a city's CSV.

    The file name is the city name with spaces removed followed by
    ``_Final_2022-06-18.csv``; it is looked up inside ``dataset_path``.

    Returns the values as a list in file order, or ``None`` when the CSV
    file does not exist.
    """
    csv_name = city.replace(" ", "") + "_Final_2022-06-18.csv"
    csv_path = os.path.join(dataset_path, csv_name)
    try:
        frame = pd.read_csv(csv_path)
    except FileNotFoundError:
        # A city without a data file is reported as None, not as an error.
        return None
    else:
        name_column = frame['scientific_name']
        present = name_column.dropna()
        return present.tolist()

# Collect, for every scientific name, the set of cities it appears in.
# Keys are scientific names; values are sets of city names.
all_scientific_names = {}
# Number of non-null scientific_name entries per city. Rows without a
# scientific name are not counted because read_csv drops them.
city_tree_counts = {}

for cities in states_cities.values():
    for city in cities:
        names = read_csv(city)
        # read_csv returns None only when the city's CSV is missing. An empty
        # list means the file exists but holds no names; that city still needs
        # a count (0), because process_top_shared_csv looks it up for every
        # city whose file can be read.
        if names is None:
            continue
        city_tree_counts[city] = len(names)
        for name in names:
            all_scientific_names.setdefault(name, set()).add(city)

# Number of distinct cities each scientific name occurs in.
shared_count = {name: len(cities) for name, cities in all_scientific_names.items()}
# The 15 names found in the most cities. sorted() is stable, so names shared
# by the same number of cities keep their first-seen order.
top_shared_names = sorted(shared_count, key=shared_count.get, reverse=True)[:15]

# Function to process each city and extract the top shared scientific names with tree counts
def process_top_shared_csv(state, city):
    """Summarise one city's CSV, restricted to the top shared scientific names.

    Args:
        state: Name of the state the city belongs to. It is not used in the
            body; the parameter is kept so existing callers keep working.
        city: City name. Spaces are removed to build the CSV file name.

    Returns:
        A dict with the keys ``"name"`` (the city), ``"tree_count"`` and
        ``"scientific_names"`` (a list of ``{"name", "count"}`` dicts for the
        names listed in ``top_shared_names``, most frequent first), or
        ``None`` when the city's CSV file does not exist.
    """
    city_formatted = city.replace(" ", "")
    file_name = f"{city_formatted}_Final_2022-06-18.csv"
    file_path = os.path.join(dataset_path, file_name)
    # Only the read can raise FileNotFoundError, so keep the try body minimal.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        return None

    # value_counts() ignores missing values and orders names by descending count.
    city_sci_names = df['scientific_name'].value_counts().to_dict()

    # Build the membership set once instead of scanning the list per name.
    wanted = set(top_shared_names)
    scientific_names = [
        # int() keeps the counts JSON-serialisable even if pandas hands back
        # NumPy integers.
        {"name": name, "count": int(count)}
        for name, count in city_sci_names.items()
        if name in wanted
    ]

    # Use the precomputed per-city total when there is one. A city whose file
    # exists but was never counted used to raise KeyError here; fall back to
    # the number of non-null names in this file, which is the same quantity.
    fallback_total = sum(int(count) for count in city_sci_names.values())
    tree_count = city_tree_counts.get(city, fallback_total)

    return {"name": city, "tree_count": tree_count, "scientific_names": scientific_names}

# Build one entry per state: {"state": <name>, "cities": [<per-city summary>, ...]}.
# States where no city produced a summary are left out.
top_shared_results = []
for state, cities in states_cities.items():
    # Call process_top_shared_csv exactly once per city; each call reads and
    # parses the city's whole CSV, so calling it again just to test for None
    # would double the I/O.
    summaries = (process_top_shared_csv(state, city) for city in cities)
    city_results = [summary for summary in summaries if summary is not None]
    if city_results:
        top_shared_results.append({"state": state, "cities": city_results})

# Save the top shared results to a JSON file in Google Drive
json_file_path = '/content/drive/MyDrive/scientific_names_top_shared.json'
with open(json_file_path, 'w') as json_file:
    json.dump(top_shared_results, json_file)

print(f"JSON file saved to {json_file_path}")