In [1]:
import os
import json
import pandas as pd

# Base directory where your JSON folders are located
base_path = "NEW_DATA"
folders = ["nodes", "relations", "ways"]

dataframes = []

# Walk every source folder, flatten each JSON file into a DataFrame and tag
# its rows with their origin so records stay traceable after concatenation.
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    if not os.path.isdir(folder_path):
        print(f"Folder not found: {folder_path}")
        continue
    # os.listdir() returns entries in arbitrary order. Sorting fixes the
    # concatenation order, and therefore which row of a duplicate group
    # drop_duplicates(keep="first") retains, so reruns give the same output.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Normalised inside the try so that a file whose JSON cannot be
            # flattened is reported and skipped like any other bad file
            # instead of aborting the whole run.
            df = pd.json_normalize(data)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        # Track source folder and file if needed
        df["source_folder"] = folder
        df["source_file"] = file
        dataframes.append(df)

# Combine all data into one DataFrame (empty when nothing could be loaded)
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
else:
    combined_df = pd.DataFrame()

# Check if the required columns exist.
required_columns = {"name", "coordinates.latitude", "coordinates.longitude"}
if not required_columns.issubset(set(combined_df.columns)):
    print("Required columns are missing from the data. Please ensure your JSON files have 'name', 'coordinates.latitude', and 'coordinates.longitude'.")
else:
    # Remove duplicates by keeping only the first occurrence of each unique
    # (name, latitude, longitude) combination.
    cleaned_df = combined_df.drop_duplicates(
        subset=["name", "coordinates.latitude", "coordinates.longitude"],
        keep="first"
    )

    # Create output folder "datatest" if it doesn't exist.
    output_dir = "datatest"
    os.makedirs(output_dir, exist_ok=True)

    # Save the cleaned DataFrame to a CSV file.
    output_file = os.path.join(output_dir, "cleaned_data.csv")
    cleaned_df.to_csv(output_file, index=False)

    print(f"Cleaned dataframe saved to {output_file}")

Cleaned dataframe saved to datatest\cleaned_data.csv


In [2]:
import os
import pandas as pd

# Read back the de-duplicated table
data_file = os.path.join("datatest", "cleaned_data.csv")
data = pd.read_csv(data_file)

df = pd.DataFrame(data)

# A row is discarded only when BOTH conditions hold:
#   * its 'tags' cell is empty, the literal "[]", None or NaN, and
#   * its 'name' contains "Unknown Place" (case-insensitive; missing names
#     count as "does not contain").
tags_missing = df['tags'].apply(lambda x: x in ["", "[]", None] or pd.isna(x))
name_is_placeholder = df['name'].str.contains(
    "Unknown Place", case=False, na=False)
cleaned_df = df[~(tags_missing & name_is_placeholder)]

# Re-point data_file at the output location and write the result
data_file = os.path.join("datatest", "cleaned_data1.csv")
cleaned_df.to_csv(data_file, index=False)

print(f"Cleaned data saved to: {data_file}")

Cleaned data saved to: datatest\cleaned_data1.csv


In [3]:
import os
import pandas as pd
import ast  # to safely evaluate string representations of lists

# Source table: the de-duplicated CSV
data_file = os.path.join("datatest", "cleaned_data.csv")
data = pd.read_csv(data_file)
df = pd.DataFrame(data)


def _has_no_tags(value):
    """Return True when a raw 'tags' cell is "", the literal "[]", None or NaN."""
    return value in ["", "[]", None] or pd.isna(value)


# Keep every row except those that are both untagged and carry a name
# containing "Unknown Place" (matched case-insensitively, NaN names never match).
untagged = df['tags'].apply(_has_no_tags)
placeholder_name = df['name'].str.contains(
    "Unknown Place", case=False, na=False)
cleaned_df = df.loc[~(untagged & placeholder_name)]


def clean_tags(tags):
    """
    Normalise one raw ``tags`` cell and strip placeholder entries.

    Args:
        tags: A list of tags, or the string representation of one
            (e.g. ``"['cafe', 'yes']"``) as read back from CSV. Any other
            value (NaN, free text, a non-list literal) is passed through.

    Returns:
        ``str(list)`` with every ``"yes"`` and ``"*"`` entry removed when
        ``tags`` is, or parses to, a list; otherwise ``tags`` unchanged.
    """
    if isinstance(tags, list):
        # Already a list: filter it directly. literal_eval() would reject a
        # list object and the entry would be handed back unfiltered.
        tags_list = tags
    else:
        try:
            # Convert string representation of list to an actual list
            tags_list = ast.literal_eval(tags)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the exceptions literal_eval documents for malformed
            # input; in every case keep the original value untouched.
            return tags

    # Only proceed if the result is a list
    if isinstance(tags_list, list):
        # Remove 'yes' and '*' from the list
        cleaned = [tag for tag in tags_list if tag not in ["yes", "*"]]
        # Stored as a string so the value round-trips through CSV
        return str(cleaned)
    return tags


# cleaned_df is a boolean-filtered slice of df, so writing into it -- even
# through .loc -- can still trigger SettingWithCopyWarning. assign() returns a
# new frame with the rewritten 'tags' column instead of mutating the slice;
# the column keeps its position.
cleaned_df = cleaned_df.assign(tags=cleaned_df['tags'].apply(clean_tags))

# Define the output file path
output_file = os.path.join("datatest", "cleaned_data1.csv")

# Save the cleaned DataFrame to the file
cleaned_df.to_csv(output_file, index=False)
print(f"Cleaned data saved to: {output_file}")

Cleaned data saved to: datatest\cleaned_data1.csv


In [4]:
import os
import pandas as pd

# Input: the table written to cleaned_data1.csv
data_file = os.path.join("datatest", "cleaned_data1.csv")
data = pd.read_csv(data_file)
df = pd.DataFrame(data)

# Flag rows whose 'tags' cell is "", the literal "[]", None or NaN
is_weird = df['tags'].apply(lambda x: x in ["", "[]", None] or pd.isna(x))

# Partition the table: flagged rows vs. everything else
weird_values = df[is_weird]
clean_values = df.drop(index=weird_values.index)

# Both halves are written under datatest/split (created on demand)
split_folder = os.path.join("datatest", "split")
os.makedirs(split_folder, exist_ok=True)

weird_file = os.path.join(split_folder, "weird_values.csv")
clean_file = os.path.join(split_folder, "clean_values.csv")

for frame, path in ((weird_values, weird_file), (clean_values, clean_file)):
    frame.to_csv(path, index=False)

print(f"Weird values saved to: {weird_file}")
print(f"Clean values saved to: {clean_file}")

Weird values saved to: datatest\split\weird_values.csv
Clean values saved to: datatest\split\clean_values.csv


In [5]:
import os
import pandas as pd
import folium
import ast


def tag_contains(tag, target_tag):
    """
    Report whether a record's tags include ``target_tag`` (case-insensitive).

    Args:
        tag: A list of tags, or a string. A string is first parsed as a
            Python literal; if it cannot be parsed it is treated as free
            text and searched for ``target_tag`` as a substring.
        target_tag: The tag name to look for.

    Returns:
        True on a match, False otherwise. Inputs that cannot hold a tag
        (e.g. NaN from an empty CSV cell, or a non-string ``target_tag``)
        yield False instead of raising.
    """
    if not isinstance(target_tag, str):
        return False
    wanted = target_tag.lower()

    def _has_match(items):
        # Non-string entries can never equal a tag name; skipping them avoids
        # an AttributeError from calling .lower() on e.g. an int.
        return any(isinstance(t, str) and t.lower() == wanted for t in items)

    if isinstance(tag, list):
        return _has_match(tag)
    if not isinstance(tag, str):
        # NaN / None / numbers: nothing to parse and nothing to search.
        return False

    try:
        tag_list = ast.literal_eval(tag)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: fall back to a plain substring search.
        return wanted in tag.lower()

    if isinstance(tag_list, list):
        return _has_match(tag_list)
    return False


# Define the path to the CSV file
data_file = os.path.join("datatest", "split", "clean_values.csv")

# Load the CSV data
try:
    df = pd.read_csv(data_file)
except Exception as e:
    print(f"Error loading file {data_file}: {e}")
    # exit() is an interactive convenience and is not a reliable way to stop
    # at this line inside a notebook kernel; if execution continued, `df`
    # would be undefined or left over from an earlier cell. Raising
    # SystemExit halts here both in a script and in a notebook.
    raise SystemExit(1)

# Raw 'tags' cells that could not be parsed; reported once after parsing.
unparsed_cells = []


def _parse_tag_cell(raw):
    """Parse one raw 'tags' cell; non-strings and unparseable text become []."""
    if not isinstance(raw, str):
        return []
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        unparsed_cells.append(raw)
        return []


# Check if 'tags' column exists
if 'tags' in df.columns:
    # Convert the tags from string to list and flatten them
    all_tags = df['tags'].apply(_parse_tag_cell).explode()
    if unparsed_cells:
        print(f"Ignored {len(unparsed_cells)} unparseable 'tags' value(s).")

    # explode() turns an empty list into NaN, and a cell may hold non-text
    # items; only real strings are usable as tag names below.
    is_text = all_tags.map(lambda t: isinstance(t, str)).astype(bool)

    # Get unique tags
    unique_tags = all_tags[is_text.to_numpy()].unique()
    print(f"Unique tags found: {unique_tags}")

    # Create output directory for maps
    output_dir = os.path.join("datatest", "maps")
    os.makedirs(output_dir, exist_ok=True)

    coord_cols = ["coordinates.latitude", "coordinates.longitude"]

    # Loop through each unique tag and create a map
    for tag in unique_tags:
        # Filter records containing the current tag (non-string cells never match)
        has_tag = df["tags"].apply(
            lambda x: isinstance(x, str) and tag_contains(x, tag))
        # Rows without coordinates cannot be placed on a map.
        # NOTE(review): folium is expected to reject NaN locations -- confirm.
        tag_df = df[has_tag.astype(bool)].dropna(subset=coord_cols)

        if tag_df.empty:
            print(f"No records found for tag '{tag}'. Skipping...")
            continue

        # Calculate the center of the coordinates for map centering
        center_lat = tag_df["coordinates.latitude"].mean()
        center_lon = tag_df["coordinates.longitude"].mean()

        # Create an interactive folium map centered on the computed location
        m = folium.Map(location=[center_lat, center_lon], zoom_start=2)

        # Add markers for each record with the current tag
        for idx, row in tag_df.iterrows():
            lat = row["coordinates.latitude"]
            lon = row["coordinates.longitude"]
            name = row.get("name", "Unknown")
            description = row.get("description", "No description available.")
            location_text = row.get("location", "No location specified.")

            # Create a popup message with relevant details
            popup_text = (
                f"<b>Name:</b> {name}<br>"
                f"<b>Description:</b> {description}<br>"
                f"<b>Location:</b> {location_text}"
            )
            # NOTE(review): with parse_html=True folium appears to escape the
            # markup, so the <b>/<br> tags may render literally -- confirm
            # against the folium Popup documentation before changing.
            folium.Marker(
                location=[lat, lon],
                popup=folium.Popup(popup_text, parse_html=True)
            ).add_to(m)

        # Tags come from the data, so replace anything that is not a letter,
        # digit, '-', '_' or '.' before using the tag as a file name. Tags
        # made only of such characters keep exactly the same file name.
        safe_tag = "".join(
            ch if ch.isalnum() or ch in "-_." else "_" for ch in tag)

        # Save each map to the output directory with the tag as the filename
        output_map_file = os.path.join(output_dir, f"{safe_tag}_map.html")
        m.save(output_map_file)
        print(f"Map for tag '{tag}' saved to {output_map_file}.")

else:
    print("'tags' column not found in the CSV file.")

Unique tags found: ['bus_station' 'restaurant' 'shelter' 'fuel' 'taxi' 'post_office'
 'parking' 'pharmacy' 'fast_food' 'fountain' 'bank' 'police' 'cafe' 'atm'
 'cinema' 'hospital' 'toilets' 'townhall' 'theatre' 'library' 'bar'
 'clinic' 'drinking_water' 'marketplace' 'fire_station' 'ice_cream'
 'recycling' 'bicycle_parking' 'post_box' 'hotel' 'pub' 'bench' 'dentist'
 'waste_basket' 'vending_machine' 'delicious' 'charging_station' 'embassy'
 'drinking_fountain' 'building' 'memorial' 'attraction' 'column'
 'city_gate' 'monument' 'ruins' 'archaeological_site' 'viewpoint' 'castle'
 'tomb' 'fort' 'artwork' 'cemetery' 'grave_yard' 'church'
 'place_of_worship' 'tower' 'clock' 'museum' 'cistern' 'boundary_stone'
 'sports_centre' 'playground' 'park' 'swimming_pool' 'fitness_centre'
 'stadium' 'garden' 'miniature_golf' 'sauna' 'bbq' 'school' 'ياغمور'
 'information' 'hostel' 'gallery' 'turkish_bath' 'public_bath' 'castel'
 'guest_house' 'picnic_site' 'camp_site' 'aquarium' 'chalet' 'theme_park'
 

In [None]:
import os
import pandas as pd
import ast

# Tags to keep. Entry order is unchanged; the group comments are only a
# reading aid.
travel_tags = [
    # Sights, heritage and culture
    "viewpoint", "attraction", "castle", "fort", "tomb", "ruins",
    "archaeological_site", "monument", "memorial", "artwork",
    "museum", "gallery",
    # Parks, leisure and entertainment
    "park", "garden", "nature_reserve", "playground", "theme_park",
    "water_park", "miniature_golf", "swimming_pool", "sauna", "zoo",
    "aquarium", "cinema", "theatre",
    # Food and drink
    "cafe", "bar", "pub", "ice_cream", "restaurant", "fast_food",
    # Accommodation and outdoor stays
    "hotel", "hostel", "guest_house", "chalet", "camp_site", "picnic_site",
    # Landmarks and waterfront
    "fountain", "city_gate", "tower", "clock", "citywalls", "marina",
    "ferry_terminal", "ship", "aqueduct",
    # Sport
    "sports_centre", "stadium", "fitness_centre", "golf_course",
    "track", "pitch",
]

# Define the path to the input CSV file
input_file = os.path.join("datatest", "split", "clean_values.csv")

# Define the path to the output CSV file
output_file = os.path.join("datatest", "filtered", "filtered_tags.csv")
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Load the CSV data
try:
    df = pd.read_csv(input_file)
except Exception as e:
    print(f"Error loading file {input_file}: {e}")
    # exit() is an interactive convenience and is not a reliable way to stop
    # at this line inside a notebook kernel; if execution continued, `df`
    # would be undefined or left over from an earlier cell. Raising
    # SystemExit halts here both in a script and in a notebook.
    raise SystemExit(1)

# Check if 'tags' column exists
if 'tags' in df.columns:
    # Set membership is O(1) per lookup; the list is scanned once here
    # instead of once per tag of every row.
    allowed_tags = set(travel_tags)

    def filter_tags(tag_str):
        """
        Reduce one raw 'tags' cell to the tags listed in ``travel_tags``.

        Returns the filtered list, or None when the cell cannot be parsed,
        does not hold a list, or contains none of the allowed tags (so the
        row can be removed with dropna afterwards).
        """
        try:
            # Convert the tag string to a list
            tags = ast.literal_eval(tag_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
        if not isinstance(tags, list):
            return None
        # The isinstance guard keeps unhashable items (e.g. nested lists)
        # from raising on the set lookup; they could never match anyway.
        filtered = [
            tag for tag in tags
            if isinstance(tag, str) and tag in allowed_tags
        ]
        return filtered if filtered else None  # None marks the row for removal

    # Apply the filtering to the tags column
    df['tags'] = df['tags'].apply(filter_tags)

    # Drop rows where tags are now empty
    filtered_df = df.dropna(subset=['tags'])

    # Save the filtered DataFrame to a new CSV file
    filtered_df.to_csv(output_file, index=False)
    print(f"Filtered tags saved to {output_file}.")

else:
    print("'tags' column not found in the CSV file.")

Filtered tags saved to datatest\filtered\filtered_tags.csv.


In [7]:
import os
import pandas as pd
import ast

# Location of the filtered table to inspect
data_file = os.path.join("datatest", "filtered", "filtered_tags.csv")

if os.path.exists(data_file):
    data = pd.read_csv(data_file)
    df = pd.DataFrame(data)
    if 'tags' not in df.columns:
        print("'tags' column not found in the CSV file.")
    else:
        # Each cell holds the string form of a list: parse it, then flatten
        # all lists into one Series with a single tag per row.
        all_tags = df['tags'].apply(ast.literal_eval).explode()
        # Distinct tags, in order of first appearance
        unique_tags = all_tags.unique()
        print(unique_tags)
else:
    print(f"File not found: {data_file}")

['restaurant' 'fast_food' 'fountain' 'cafe' 'cinema' 'theatre' 'bar'
 'ice_cream' 'hotel' 'pub' 'memorial' 'attraction' 'city_gate' 'monument'
 'ruins' 'archaeological_site' 'viewpoint' 'castle' 'tomb' 'fort'
 'artwork' 'tower' 'clock' 'museum' 'sports_centre' 'playground' 'park'
 'swimming_pool' 'fitness_centre' 'stadium' 'garden' 'miniature_golf'
 'sauna' 'hostel' 'gallery' 'guest_house' 'picnic_site' 'camp_site'
 'aquarium' 'chalet' 'theme_park' 'zoo' 'ship' 'citywalls'
 'nature_reserve' 'pitch' 'water_park' 'ferry_terminal' 'track' 'marina'
 'golf_course' 'aqueduct']


In [8]:
# Reload the filtered table from datatest/filtered/filtered_tags.csv and print
# its column dtypes and non-null counts as a quick sanity check.
# NOTE(review): this cell has no imports of its own; it relies on `os` and
# `pd` having been imported by an earlier cell in the same kernel session.
data_file = os.path.join("datatest", "filtered", "filtered_tags.csv")
data = pd.read_csv(data_file)
df = pd.DataFrame(data)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26814 entries, 0 to 26813
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   26814 non-null  object 
 1   description            26814 non-null  object 
 2   location               26814 non-null  object 
 3   tags                   26814 non-null  object 
 4   coordinates.latitude   26814 non-null  float64
 5   coordinates.longitude  26814 non-null  float64
 6   source_folder          26814 non-null  object 
 7   source_file            26814 non-null  object 
dtypes: float64(2), object(6)
memory usage: 1.6+ MB


In [None]:
# import os
# import osmnx as ox
# import networkx as nx
# import math
# import pandas as pd

# # ---------------------------
# # Geospatial Utility Functions
# # ---------------------------


# def compute_bounding_box(lat, lon, radius_m):
#     """
#     Compute an approximate bounding box around a point (lat, lon) with a given radius (in meters).
#     Returns (min_lat, max_lat, min_lon, max_lon).
#     """
#     R = 6371000  # Earth's radius in meters
#     lat_rad = math.radians(lat)

#     # Calculate degree offsets
#     delta_lat = (radius_m / R) * (180 / math.pi)
#     delta_lon = (radius_m / (R * math.cos(lat_rad))) * (180 / math.pi)

#     min_lat = lat - delta_lat
#     max_lat = lat + delta_lat
#     min_lon = lon - delta_lon
#     max_lon = lon + delta_lon
#     return min_lat, max_lat, min_lon, max_lon


# def filter_by_bounding_box_and_tag(df, user_lat, user_lon, radius_m, search_tag):
#     """
#     Quickly filter POIs that fall within a bounding box around the user's location
#     and contain the specified tag.
#     """
#     min_lat, max_lat, min_lon, max_lon = compute_bounding_box(
#         user_lat, user_lon, radius_m)

#     # Filter by bounding box
#     filtered_df = df[
#         (df['coordinates.latitude'] >= min_lat) &
#         (df['coordinates.latitude'] <= max_lat) &
#         (df['coordinates.longitude'] >= min_lon) &
#         (df['coordinates.longitude'] <= max_lon)
#     ]

#     # Filter by search tag (case insensitive)
#     filtered_df = filtered_df[filtered_df['tags'].str.contains(
#         search_tag, case=False, na=False)]

#     # Convert filtered DataFrame to a list of dictionaries for easier processing later
#     candidates = filtered_df.to_dict(orient='records')
#     return candidates

# # ---------------------------
# # Routing Functions using OSMnx & NetworkX (Drive & Walk Modes)
# # ---------------------------


# def get_network_graph(user_lat, user_lon, radius_m, travel_mode='drive'):
#     """
#     Download a street network graph centered on the user's location.
#     Supports multiple travel modes like 'drive' and 'walk'.
#     """
#     graph_dist = radius_m * 2
#     try:
#         graph = ox.graph_from_point(
#             (user_lat, user_lon), dist=graph_dist, network_type=travel_mode)
#         return graph
#     except Exception as e:
#         print(f"Error retrieving network graph for {travel_mode}:", e)
#         return None


# def get_route_distance(graph, user_lat, user_lon, candidate_lat, candidate_lon):
#     """
#     Compute the route (network) distance between the user's location and the candidate's location.
#     Returns distance in meters.
#     """
#     try:
#         user_node = ox.distance.nearest_nodes(graph, user_lon, user_lat)
#         candidate_node = ox.distance.nearest_nodes(
#             graph, candidate_lon, candidate_lat)
#         route_length = nx.shortest_path_length(
#             graph, user_node, candidate_node, weight='length')
#         return route_length
#     except Exception as e:
#         print(
#             f"Error computing route for candidate at ({candidate_lat}, {candidate_lon}):", e)
#         return float('inf')


# def get_top_n_by_route_distance_for_all_modes(candidates, user_lat, user_lon, radius_m, n=5):
#     """
#     Compute route distances for all candidates using both driving and walking modes.
#     """
#     modes = ['drive', 'walk']
#     all_results = {}

#     for mode in modes:
#         graph = get_network_graph(
#             user_lat, user_lon, radius_m, travel_mode=mode)
#         if graph is None:
#             print(
#                 f"Failed to retrieve the network graph for {mode}. Skipping this mode.")
#             continue

#         # Calculate route distance for each candidate
#         for poi in candidates:
#             candidate_lat = poi["coordinates.latitude"]
#             candidate_lon = poi["coordinates.longitude"]
#             route_distance = get_route_distance(
#                 graph, user_lat, user_lon, candidate_lat, candidate_lon)
#             poi[f"{mode}_route_distance_m"] = route_distance

#         # Filter to only those POIs that are within the route distance threshold
#         candidates_within_radius = [
#             poi for poi in candidates if poi[f"{mode}_route_distance_m"] <= radius_m]

#         # Sort by route distance (shortest first)
#         candidates_within_radius.sort(
#             key=lambda x: x[f"{mode}_route_distance_m"])

#         all_results[mode] = candidates_within_radius[:n]

#     return all_results


# # ---------------------------
# # Main Execution: Updated for Both Driving and Walking
# # ---------------------------
# if __name__ == "__main__":
#     # Load the POI data from the CSV file.
#     data_file = os.path.join("datatest", "filtered", "filtered_tags.csv")
#     df = pd.read_csv(data_file)
#     # print("Data Information:")
#     # df.info()

#     # Simulated user query parameters:
#     user_lat = 40.985660   # Example: Istanbul city center latitude
#     user_lon = 29.027361   # Example: Istanbul city center longitude
#     radius_m = 5000        # 5 km search radius
#     search_tag = "viewpoint"    # Example tag to filter for

#     # --- Step 1: Candidate Filtering by Bounding Box and Tag ---
#     candidates = filter_by_bounding_box_and_tag(
#         df, user_lat, user_lon, radius_m, search_tag)
#     print("Candidates after bounding box and tag filtering:")
#     for poi in candidates:
#         print(
#             f"  {poi['name']} at ({poi['coordinates.latitude']}, {poi['coordinates.longitude']}) with tags: {poi['tags']}")

#     # --- Step 2: Geospatial Analysis via Route Distances (Drive & Walk) ---
#     top_candidates = get_top_n_by_route_distance_for_all_modes(
#         candidates, user_lat, user_lon, radius_m, n=5)

#     # Display results for both modes
#     print("\nTop candidates based on route distances:")
#     for mode, results in top_candidates.items():
#         print(f"\n--- {mode.capitalize()} Mode ---")
#         if results:
#             for poi in results:
#                 print(
#                     f"{poi['name']} - Route Distance: {poi[f'{mode}_route_distance_m']:.2f} meters")
#         else:
#             print(
#                 f"No locations found within the specified route distance for {mode} mode.")

Candidates after bounding box and tag filtering:
  Unknown Place at (41.0042147, 28.973334) with tags: ['viewpoint']
  View of the Bosphorus at (41.0298935, 28.9870463) with tags: ['viewpoint']
  Unknown Place at (41.0162809, 28.9712566) with tags: ['viewpoint']
  Sarayburnu Manzarası at (41.0175429, 28.9857457) with tags: ['viewpoint']
  Unknown Place at (41.0138905, 28.9860012) with tags: ['viewpoint']
  Cesme at (41.0116571, 28.9719397) with tags: ['viewpoint']
  Unknown Place at (41.0105435, 28.9761932) with tags: ['viewpoint']
  Unknown Place at (41.0274154, 28.9728089) with tags: ['viewpoint']
  Unknown Place at (41.0012023, 28.973685) with tags: ['viewpoint']
  Unknown Place at (41.0197989, 28.9725249) with tags: ['viewpoint']
  Unknown Place at (41.019496, 28.9732748) with tags: ['viewpoint']
  Unknown Place at (41.0290514, 28.9830699) with tags: ['viewpoint']
  Unknown Place at (41.0145068, 28.9685011) with tags: ['viewpoint']
  Unknown Place at (41.0026752, 28.9813968) with t