<a href="https://colab.research.google.com/github/psrathi24/Portfolio_Projects/blob/main/SF_Films_Project_Complete_P1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Original Dataset: https://data.sfgov.org/Culture-and-Recreation/Film-Locations-in-San-Francisco/yitu-d5am/about_data


Upload files from "FILES TO INITIALIZE 1"

In [None]:
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

!pip install cinemagoer
from imdb import Cinemagoer

from geopy.distance import geodesic
import folium
from folium.plugins import Search, MarkerCluster
import branca
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors

Collecting cinemagoer
  Downloading cinemagoer-2023.5.1-py3-none-any.whl.metadata (2.9 kB)
Downloading cinemagoer-2023.5.1-py3-none-any.whl (297 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/297.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.2/297.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cinemagoer
Successfully installed cinemagoer-2023.5.1


In [None]:
# Display option first: a width of None leaves line width to pandas instead of a fixed value.
pd.set_option("display.width", None)

# Load the raw film-locations export and print its column / null-count summary.
sf_movies = pd.read_csv(filepath_or_buffer="yitu-d5am_version_88.csv")
sf_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2084 entries, 0 to 2083
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   title               2084 non-null   object
 1   release_year        2084 non-null   int64 
 2   locations           2030 non-null   object
 3   fun_facts           464 non-null    object
 4   production_company  2082 non-null   object
 5   distributor         1945 non-null   object
 6   director            2079 non-null   object
 7   writer              2074 non-null   object
 8   actor_1             2080 non-null   object
 9   actor_2             1991 non-null   object
 10  actor_3             1612 non-null   object
 11  state               2084 non-null   object
 12  city                2084 non-null   object
 13  point               2083 non-null   object
dtypes: int64(1), object(13)
memory usage: 228.1+ KB


In [None]:
# Drop columns that are not used in the analysis, then drop rows that are
# missing any of the fields the later cells rely on.

useless_columns = ["fun_facts", "production_company", "distributor", "actor_1", "actor_2", "actor_3", "state", "city"]
required_columns = ["locations", "director", "writer", "point"]

# errors="ignore" keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
sf_movies = (
    sf_movies
    .drop(columns=useless_columns, errors="ignore")
    .dropna(subset=required_columns)  # drop rows with null values in the specified subset
)

In [None]:
# Re-inspect the frame after the column/row drops above.
sf_movies.info() # 2022 non-null rows remain

<class 'pandas.core.frame.DataFrame'>
Index: 2022 entries, 0 to 2083
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         2022 non-null   object
 1   release_year  2022 non-null   int64 
 2   locations     2022 non-null   object
 3   director      2022 non-null   object
 4   writer        2022 non-null   object
 5   point         2022 non-null   object
dtypes: int64(1), object(5)
memory usage: 110.6+ KB


In [None]:
# Newest releases first, then show how many location rows each title has:

sf_movies = sf_movies.sort_values("release_year", ascending=False)
sf_movies.value_counts(subset="title")

Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
Looking,41
The Phone/Jexi,39
The Last Black Man in San Francisco,33
DEVS,32
Nash Bridges,30
...,...
Seven Girlfriends,1
Serial,1
Fat Man and Little Boy,1
Rent,1


Counting the unique values (280) in "title" shows repetition: some films were shot at multiple locations, while others were shot at only one.


In [None]:
# Bucket titles by the number of location rows they have:

counts = sf_movies.value_counts("title")

# Every bucket is half-open: the lower bound is included, the upper bound is not.
filter_1 = counts[counts.lt(2)]
filter_2 = counts[counts.ge(2) & counts.lt(5)]
filter_3 = counts[counts.ge(5) & counts.lt(10)]
filter_4 = counts[counts.ge(10) & counts.lt(15)]
filter_5 = counts[counts.ge(15) & counts.lt(20)]
filter_6 = counts[counts.ge(20) & counts.lt(25)]
filter_7 = counts[counts.ge(25) & counts.lt(30)]
filter_8 = counts[counts.ge(30) & counts.lt(35)]
filter_9 = counts[counts.ge(35) & counts.lt(40)]
filter_10 = counts[counts.ge(40)]

# One line per bucket: how many titles fall into it.
bucket_report = (
    ("< 2 count:   ", filter_1),
    ("2-5 count:   ", filter_2),
    ("5-10 count:  ", filter_3),
    ("10-15 count: ", filter_4),
    ("15-20 count: ", filter_5),
    ("20-25 count: ", filter_6),
    ("25-30 count: ", filter_7),
    ("30-35 count: ", filter_8),
    ("35-40 count: ", filter_9),
    (">= 40 count: ", filter_10),
)
for label, bucket in bucket_report:
    print(f"{label}{len(bucket)}")

< 2 count:   72
2-5 count:   79
5-10 count:  53
10-15 count: 30
15-20 count: 21
20-25 count: 12
25-30 count: 7
30-35 count: 4
35-40 count: 1
>= 40 count: 1


Based on the buckets, some comparisons would make more sense for films shot in more than one location, so the dataset was split into sf_movies_1 (count < 2) and sf_movies_2 (count >= 2).

In [None]:
# Split dataset based on counts:
#
# .copy() makes each subset an independent DataFrame. Without it the subsets
# are slices of sf_movies, and each later column assignment or in-place drop
# on them makes pandas emit SettingWithCopyWarning.

sf_movies_1 = sf_movies[sf_movies["title"].isin(filter_1.index)].copy()
print(f"Films w/ count < 2: {len(filter_1)}")

temp_filter = counts[counts >= 2]  # titles with two or more location rows
sf_movies_2 = sf_movies[sf_movies["title"].isin(temp_filter.index)].copy()
print(f"Films w/ count >= 2: {len(temp_filter)}")

# sf_movies_1: 72 rows
# sf_movies_2: 1950 rows

Films w/ count < 2: 72
Films w/ count >= 2: 208


Feature Engineering:

* imdb_rating: average rating from IMDb
* genres: genre tags from IMDb
* type: kind of title as reported by IMDb (movie, TV series, etc.)
* release_decade: calculated from the "release_year" column
* longitude & latitude: the "point" column split into two numeric columns
* is_tourist_spot: binary indicator of whether the shooting location is at or near a known tourist spot; 1 (yes), 0 (no)

First done for sf_movies_1, then sf_movies_2.

In [None]:
# For imdb_rating, genre, type:

ia = Cinemagoer() # Initialize IMDb instance

# Completed lookups keyed by title, so a title that appears on many rows
# costs one search + one detail request instead of one per row.
_imdb_lookup_cache = {}

def get_imdb_rating(title):
    """Look up ``title`` on IMDb and return ``(rating, genres, kind)``.

    The first search result is taken as the match. ``rating``, ``genres`` and
    ``kind`` are whatever the movie record returns for those keys (``None``
    when a key is absent); ``kind`` is the type of title (e.g. movie, TV series).

    Returns ``(None, None, None)`` when the search has no results or when the
    lookup raises. Failures are reported and not cached, so a later call for
    the same title tries again.
    """
    if title in _imdb_lookup_cache:
        return _imdb_lookup_cache[title]
    try:
        matches = ia.search_movie(title)  # Returns list of movie results that match title
        if matches:
            movie_info = ia.get_movie(matches[0].movieID)  # Detailed info for the first result
            result = (movie_info.get('rating'), movie_info.get('genres'), movie_info.get('kind'))
        else:
            result = (None, None, None)
    except Exception as e:
        # Best-effort lookup: keep the notebook running, but say which title failed and why.
        print(f"IMDb lookup failed for {title!r}: {e}")
        return None, None, None
    _imdb_lookup_cache[title] = result
    return result

# Apply the function to get IMDb data
sf_movies_1[['imdb_rating', 'genres', 'type']] = sf_movies_1['title'].apply(lambda x: pd.Series(get_imdb_rating(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_movies_1[['imdb_rating', 'genres', 'type']] = sf_movies_1['title'].apply(lambda x: pd.Series(get_imdb_rating(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_movies_1[['imdb_rating', 'genres', 'type']] = sf_movies_1['title'].apply(lambda x: pd.Series(get_imdb_rating(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

In [None]:
# Check for null values, update values, drop rows:

# Rows still missing IMDb data. None of these is the last expression of the
# cell, so they are evaluated but not displayed.
sf_movies_1[sf_movies_1["imdb_rating"].isna()]
sf_movies_1[sf_movies_1["genres"].isna()]
sf_movies_1[sf_movies_1["type"].isna()]

# Updates: ratings filled in by hand, keyed by row label.
for row_label, rating in ((534, 7.4), (548, 5.9)):
    sf_movies_1.loc[row_label, "imdb_rating"] = rating

# Drop:
sf_movies_1.drop(index = [84, 601, 521], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_movies_1.drop(index = [84, 601, 521], inplace = True)


In [None]:
# For release_decade:

# Floor each year to the first year of its decade (e.g. 1998 becomes 1990).
release_years = sf_movies_1["release_year"]
sf_movies_1["release_decade"] = release_years - release_years % 10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_movies_1["release_decade"] = (sf_movies_1["release_year"] // 10) * 10 # Calculates release decade based on release year (e.g. 1998 becomes 1990)


In [None]:
# For latitude & longitude:

def extract_coordinates(point):
    """Parse a ``POINT (<lon> <lat>)`` string into a Series with 'longitude' and 'latitude'."""
    stripped = point.replace('POINT (', '').replace(')', '')  # leaves "<lon> <lat>"
    lon_text, lat_text = stripped.split()                      # whitespace-separated pair
    coordinates = {'longitude': float(lon_text), 'latitude': float(lat_text)}
    return pd.Series(coordinates)

# Expand "point" into two numeric columns, then discard the raw text column.
sf_movies_1[['longitude', 'latitude']] = sf_movies_1['point'].apply(extract_coordinates)
sf_movies_1.drop(columns = "point", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_movies_1[['longitude', 'latitude']] = sf_movies_1['point'].apply(extract_coordinates)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_movies_1[['longitude', 'latitude']] = sf_movies_1['point'].apply(extract_coordinates)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_movies_1.drop("point", axis = 1, inplace = True) # Drop "point" column


In [None]:
# For tourist_spot:

sf_tourist_spots = pd.read_excel("SF Tourist Spots.xlsx") # pre-defined list of tourist spots

# Hand-entered coordinates for spreadsheet rows 0 and 9.
for spot_label, lat_lon in ((0, [37.827221, -122.423268]), (9, [37.795322, -122.393929])):
    sf_tourist_spots.loc[spot_label, ['Latitude', 'Longitude']] = lat_lon

def calculate_distance(row, tourist_spots):
    """Return the geodesic distance in metres from a filming location to its nearest tourist spot.

    row: a record with 'latitude' and 'longitude' entries.
    tourist_spots: DataFrame with 'Latitude' and 'Longitude' columns.
    """
    location = (row['latitude'], row['longitude'])
    # Measure against the frame that was passed in rather than a notebook
    # global, so the function works for whichever spot list the caller supplies.
    distances = tourist_spots.apply(
        lambda spot: geodesic(location, (spot['Latitude'], spot['Longitude'])).meters,
        axis=1,
    )
    return distances.min()

sf_movies_1['min_distance_to_tourist_spot'] = sf_movies_1.apply(calculate_distance, tourist_spots = sf_tourist_spots, axis = 1)

def classify_by_threshold(df, threshold):
    """Flag each row 1 when its nearest-tourist-spot distance is at most ``threshold``, else 0."""
    within_threshold = df['min_distance_to_tourist_spot'].le(threshold)
    return within_threshold.astype('int64')

# Threshold search optimization:
# Candidate thresholds run from 0 to 1000 in steps of 50, in the same unit as
# min_distance_to_tourist_spot (metres).
# NOTE(review): the score recorded for each threshold is just the number of
# rows flagged 1 at that threshold; nothing here compares against known labels.
# That count can only stay equal or grow as the threshold grows, so the
# "optimal" threshold below is the smallest grid value that already reaches
# the largest count. Confirm this is the intended selection rule.
thresholds = np.arange(0, 1001, 50)
results = []
for threshold in thresholds:
    sf_movies_1['is_tourist_spot'] = classify_by_threshold(sf_movies_1, threshold)
    correct_classifications = sf_movies_1['is_tourist_spot'].sum()
    results.append((threshold, correct_classifications))

# Finds optimal threshold:
# max() keeps the first tuple with the highest count, i.e. the lowest such threshold.
optimal_threshold, max_correct_classifications = max(results, key=lambda x: x[1])

# Final classification using the optimal threshold:
sf_movies_1['is_tourist_spot'] = classify_by_threshold(sf_movies_1, optimal_threshold)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_movies_1['min_distance_to_tourist_spot'] = sf_movies_1.apply(calculate_distance, tourist_spots = sf_tourist_spots, axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_movies_1['is_tourist_spot'] = classify_by_threshold(sf_movies_1, threshold)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Update values:

# Rows flagged as tourist spots by hand, overriding the distance-based result.
for row_label in (1957, 565, 1770, 25, 534):
    sf_movies_1.loc[row_label, "is_tourist_spot"] = 1

Repeat same process for sf_movies_2.

In [None]:
# For imdb_rating, genre, type:

# Query IMDb once per distinct title and reuse the answer for every row that
# carries that title; sf_movies_2 holds only titles with two or more rows.
imdb_by_title = {title: get_imdb_rating(title) for title in sf_movies_2['title'].unique()}
sf_movies_2[['imdb_rating', 'genres', 'type']] = sf_movies_2['title'].apply(lambda x: pd.Series(imdb_by_title[x]))

# Run-time for this block when IMDb was queried once per row: 1h 26m 43s

In [None]:
# Check for null values, update values, drop rows:

# Rows still missing IMDb data. None of these is the last expression of the
# cell, so they are evaluated but not displayed.
sf_movies_2[sf_movies_2["imdb_rating"].isnull()]
sf_movies_2[sf_movies_2["genres"].isnull()]
sf_movies_2[sf_movies_2["type"].isnull()]

# Updates:
# Hand-entered IMDb values, matched on "title" exactly as it appears in
# sf_movies_2. An optional "new_title" renames the matching rows before the
# values are written. The loop that applies this table reads "imdb_rating" and
# "genres" with .get(), so an entry that omits a key writes None for that field.
updates = [
    {"title": "Goliath- Season 4", "imdb_rating": 7.9, "genres": "Drama"},
    {"title": "Sense8 - Season 2", "imdb_rating": 8.8, "genres": "[Drama, Mystery, Sci-Fi, Thriller]"},
    {"title": "Silicon Valley Season 4", "imdb_rating": 8.1, "genres": "Comedy"},
    {"title": "Ballers Season 3", "imdb_rating": 7.9, "genres": "[Comedy, Drama, Sport]"},
    {"title": "Blindspotting (Season 2)", "imdb_rating": 7.5, "genres": "[Comedy, Crime, Drama]"},
    {"title": "Chance - Season 1 Pilot", "new_title": "Chance- Season 1 ep101", "imdb_rating": 7.9, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance - Season 1 ep105", "new_title": "Chance- Season 1 ep105", "imdb_rating": 8.3, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance- Season 1 ep102", "imdb_rating": 8.0, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance- Season 1 ep103", "imdb_rating": 8.1, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance- Season 1 ep104", "imdb_rating": 8.0, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance- Season 1 ep105", "imdb_rating": 8.3, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance- Season 1 ep106", "imdb_rating": 8.1, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance- Season 1 ep107", "imdb_rating": 8.5, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance- Season 1 ep108", "imdb_rating": 8.1, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance- Season 1 ep109", "imdb_rating": 8.6, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance- Season 1 ep110", "imdb_rating": 8.1, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Chance Season 2", "imdb_rating": 8.6, "genres": "[Drama, Mystery, Thriller]"},
    {"title": "Looking Season 2 ep 202", "imdb_rating": 8.3, "genres": "[Comedy, Drama, Romance]"},
    {"title": "Looking Season 2 ep 203", "imdb_rating": 8.5, "genres": "[Comedy, Drama, Romance]"},
    {"title": "Looking Season 2 ep 204", "imdb_rating": 8.4, "genres": "[Comedy, Drama, Romance]"},
    {"title": "Looking Season 2 ep 205", "imdb_rating": 8.6, "genres": "[Comedy, Drama, Romance]"},
    {"title": "Looking Season 2 ep 206", "imdb_rating": 8.4, "genres": "[Comedy, Drama, Romance]"},
    {"title": "Looking Season 2 ep 207", "imdb_rating": 9.4, "genres": "[Comedy, Drama, Romance]"},
    {"title": "Looking Season 2 ep 208", "imdb_rating": 8.4, "genres": "[Comedy, Drama, Romance]"},
    {"title": "Looking Season 2 ep 209", "imdb_rating": 8.3, "genres": "[Comedy, Drama, Romance]"},
    {"title": "Looking Season 2 ep 210", "imdb_rating": 9.0, "genres": "[Comedy, Drama, Romance]"},
    {"title": "Sonic the Hedgehog", "imdb_rating": 6.5},
    {"title": "Venom", "imdb_rating": 6.6, "genres": "[Action, Adventure, Sci-Fi]"},
    {"title": 'Looking "Special"', "new_title": "Looking (Special)", "imdb_rating": 7.5, "genres": "[Comedy, Drama]"}
]

for update in updates:
    target_title = update["title"]
    if "new_title" in update:
        # Rename first, then address the rows by their new title.
        sf_movies_2.loc[sf_movies_2["title"] == target_title, "title"] = update["new_title"]
        target_title = update["new_title"]
    manual_values = update.get("imdb_rating"), update.get("genres")
    sf_movies_2.loc[sf_movies_2["title"] == target_title, ["imdb_rating", "genres"]] = manual_values

# One-off corrections, applied top to bottom: (title to match, column, new value).
for match_title, column, value in (
    ('The OA Part II', 'imdb_rating', 8.6),
    ('Mission (aka City of Bars)', 'imdb_rating', 10.0),
    ('Sonic the Hedgehog', 'genres', "Action Adventure Comedy Sci-Fi"),
    ('Mission (aka City of Bars)', 'genres', "Indie"),
):
    sf_movies_2.loc[sf_movies_2['title'] == match_title, column] = value

# Drop:
smile_again_rows = sf_movies_2.loc[sf_movies_2["title"] == "Smile Again, Jenny Lee"].index
sf_movies_2.drop(index=smile_again_rows, inplace=True) # drops 25 rows

In [None]:
# For release_decade:

# Floor each year to the first year of its decade (e.g. 1998 becomes 1990).
release_years = sf_movies_2["release_year"]
sf_movies_2["release_decade"] = release_years - release_years % 10

In [None]:
# For latitude & longitude:

# Expand "point" into two numeric columns, then discard the raw text column.
sf_movies_2[['longitude', 'latitude']] = sf_movies_2['point'].apply(extract_coordinates)
sf_movies_2.drop(columns = "point", inplace = True)

In [None]:
# For tourist_spot:

# Distance from every filming location to its nearest tourist spot.
sf_movies_2['min_distance_to_tourist_spot'] = sf_movies_2.apply(calculate_distance, tourist_spots = sf_tourist_spots, axis=1)

# Threshold search optimization:
# Candidate thresholds run from 0 to 1000 in steps of 50.
# NOTE(review): the score recorded for each threshold is just the number of
# rows flagged 1 at that threshold; nothing here compares against known labels.
# That count can only stay equal or grow as the threshold grows, so the
# "optimal" threshold below is the smallest grid value that already reaches
# the largest count. Confirm this is the intended selection rule.
thresholds = np.arange(0, 1001, 50)
results = []
for threshold in thresholds:
    sf_movies_2['is_tourist_spot'] = classify_by_threshold(sf_movies_2, threshold)
    correct_classifications = sf_movies_2['is_tourist_spot'].sum()
    results.append((threshold, correct_classifications))

# Finds optimal threshold:
# max() keeps the first tuple with the highest count, i.e. the lowest such threshold.
optimal_threshold, max_correct_classifications = max(results, key=lambda x: x[1])

# Final classification using the optimal threshold:
sf_movies_2['is_tourist_spot'] = classify_by_threshold(sf_movies_2, optimal_threshold)

Update values and format of both files.

In [None]:
# sf_movies_1:

# Hand corrections, applied top to bottom: (row label, column, new value).
# NOTE(review): release_decade was computed in an earlier cell and is not
# recomputed after the release_year change for row 113 — confirm it is still right.
sf_movies_1_cell_fixes = (
    (1244, "title", "Good Neighbor Same"),
    (1081, "imdb_rating", 6.8),
    (1081, "genres", "Action Drama Sci-Fi Thriller"),
    (1081, "type", "movie"),
    (330, "imdb_rating", 5.2),
    (330, "genres", "Documentary Comedy Drama Fantasy Mystery Sci-Fi"),
    (330, "type", "documentary"),
    (113, "release_year", 2012),
    (1787, "genres", "Adventure Drama Sci-Fi"),
    (1787, "type", "movie"),
)
for row_label, column, value in sf_movies_1_cell_fixes:
    sf_movies_1.loc[row_label, column] = value
# Normalise genres to one single-space-separated string per row,
# e.g. "['Drama', 'Mystery']" -> "Drama Mystery", matching the hand-entered values.
sf_movies_1['genres'] = sf_movies_1['genres'].astype(str)  # Convert all values to strings
sf_movies_1['genres'] = (
    sf_movies_1['genres']
    .str.replace(r"[\[\]']", '', regex=True)    # strip list brackets and single quotes
    .str.replace(r"\s*,\s*", ' ', regex=True)   # a comma plus surrounding blanks becomes ONE space
    .str.strip()                                # trim leading/trailing whitespace
)

# sf_movies_2:

# Hand corrections, applied top to bottom: (title to match, column, new value).
# Order matters: a row renamed by one entry is matched under its NEW title by
# later entries (e.g. the "type" fix for 'Goliath (Season 4)').
# NOTE(review): "Epiode" in the episode titles looks like a typo for "Episode";
# it is kept as-is because these strings end up in the exported data.
sf_movies_2_fixes = (
    ('Goliath- Season 4', "title", "Goliath (Season 4)"),
    ('Silicon Valley Season 4', "title", "Silicon Valley (Season 4)"),
    ('Chance Season 2', "title", "Chance (Season 2)"),
    ('Ballers Season 3', "title", "Ballers (Season 3)"),
    ('Sense8 - Season 2', "title", "Sense8 (Season 2)"),
    ('Murder in the First, Season 3', "title", "Murder in the First (Season 3)"),
    ('Chance- Season 1 ep110', "title", "Chance Season 1 Epiode 10"),
    ('Chance- Season 1 ep109', "title", "Chance Season 1 Epiode 9"),
    ('Chance- Season 1 ep108', "title", "Chance Season 1 Epiode 8"),
    ('Chance- Season 1 ep107', "title", "Chance Season 1 Epiode 7"),
    ('Chance- Season 1 ep106', "title", "Chance Season 1 Epiode 6"),
    ('Chance- Season 1 ep105', "title", "Chance Season 1 Epiode 5"),
    ('Chance- Season 1 ep104', "title", "Chance Season 1 Epiode 4"),
    ('Chance- Season 1 ep103', "title", "Chance Season 1 Epiode 3"),
    ('Chance- Season 1 ep102', "title", "Chance Season 1 Epiode 2"),
    ('Chance- Season 1 ep101', "title", "Chance Season 1 Epiode 1"),
    ('Terminator - Genisys', "title", "Terminator: Genisys"),
    ('Murder in the First, Season 2', "title", "Murder in the First (Season 2)"),
    ('Looking Season 2 ep 210', "title", "Looking Season 2 Epiode 10"),
    ('Looking Season 2 ep 209', "title", "Looking Season 2 Epiode 9"),
    ('Looking Season 2 ep 208', "title", "Looking Season 2 Epiode 8"),
    ('Looking Season 2 ep 207', "title", "Looking Season 2 Epiode 7"),
    ('Looking Season 2 ep 206', "title", "Looking Season 2 Epiode 6"),
    ('Looking Season 2 ep 205', "title", "Looking Season 2 Epiode 5"),
    ('Looking Season 2 ep 204', "title", "Looking Season 2 Epiode 4"),
    ('Looking Season 2 ep 203', "title", "Looking Season 2 Epiode 3"),
    ('Looking Season 2 ep 202', "title", "Looking Season 2 Epiode 2"),
    ('Murder in the First, Season 1', "title", "Murder in the First (Season 1)"),
    ('Broken-A Modern Love Story', "title", "Broken: A Modern Love Story"),
    ('The Last Thing He Told Me', "title", "The Last Thing He Told Me (Season 1)"),
    ("I'm A Virgo", "title", "I'm A Virgo (Season 1)"),
    ('This Is Us', "title", "This Is Us (Season 6)"),
    ('Surface', "title", "Surface (Season 1)"),
    ('Super Pumped: The Battle for Uber', "title", "Super Pumped: The Battle for Uber (Season 1)"),
    ("Nash Bridges", "type", "movie"),
    ('Goliath (Season 4)', "type", "tv series"),
    ('Tales of the City', "title", "Tales of the City (Season 1)"),
    ('When We Rise', "title", "When We Rise (Season 1)"),
    ('GirlBoss', "title", "GirlBoss (Season 1)"),
    ('Sense8', "title", "Sense8 (Season 1)"),
    ('Parks and Recreation', "title", "Parks and Recreation (Season 6)"),
    ('Looking', "title", "Looking (Season 1)"),
    ('Red Widow', "title", "Red Widow (Season 1)"),
    ('Alcatraz', "title", "Alcatraz (Season 1)"),
    ("Twisted", "type", "movie"),
    ("Dopamine", "type", "movie"),
    ("The Bachelor", "type", "movie"),
    ("Mother", "type", "movie"),
    ("Interview With The Vampire", "type", "movie"),
    ("Heart Beat", "type", "movie"),
    ("San Francisco", "type", "movie"),
)
for match_title, column, value in sf_movies_2_fixes:
    # The mask is rebuilt on every pass so it sees the titles as already renamed.
    sf_movies_2.loc[sf_movies_2["title"] == match_title, column] = value
# Normalise genres to one single-space-separated string per row,
# e.g. "[Drama, Mystery, Thriller]" -> "Drama Mystery Thriller", matching the
# values that were entered by hand with single spaces.
sf_movies_2['genres'] = sf_movies_2['genres'].astype(str)  # Convert all values to strings
sf_movies_2['genres'] = (
    sf_movies_2['genres']
    .str.replace(r"[\[\]']", '', regex=True)    # strip list brackets and single quotes
    .str.replace(r"\s*,\s*", ' ', regex=True)   # a comma plus surrounding blanks becomes ONE space
    .str.strip()                                # trim leading/trailing whitespace
)

In [None]:
# Order rows by latest release_year, then title (both descending):

sort_columns = ['release_year', 'title']
sf_movies_1_updated = sf_movies_1.sort_values(by=sort_columns, ascending=[False, False])
sf_movies_2_updated = sf_movies_2.sort_values(by=sort_columns, ascending=[False, False])

In [None]:
# Export updated DataFrames as Excel files:

# index = True writes each row's index label into the sheet alongside the data.
for frame, workbook_name in (
    (sf_movies_1_updated, "sf_movies_1.xlsx"),
    (sf_movies_2_updated, "sf_movies_2.xlsx"),
):
    frame.to_excel(workbook_name, index = True)

Created clusters of proximal points & visualized them for both files.

In [None]:
# Reload the workbooks written by the export cell, replacing the in-memory frames.
# NOTE(review): the files were written with index = True and are read here
# without index_col, so presumably the old index comes back as an extra leading
# column and the rows are renumbered from 0. The row labels used in the cells
# below appear to refer to that new numbering — confirm.
sf_movies_1 = pd.read_excel("sf_movies_1.xlsx")
sf_movies_2 = pd.read_excel("sf_movies_2.xlsx")

In [None]:
# For sf_movies_1:

coords = sf_movies_1[['latitude', 'longitude']].to_numpy() # array of [latitude, longitude] pairs

# The coordinates are converted to radians for the haversine metric, so eps is
# an angle as well: 0.0001 rad is roughly 0.64 km on the ground (0.0001 x ~6371 km).
db = DBSCAN(eps = 0.0001, min_samples = 2, algorithm = 'ball_tree', metric='haversine')
db.fit(np.radians(coords))
  # eps (radius): max distance between two samples to be considered in the same cluster
  # min_samples: number of samples in a cluster for a point to be considered a core point

sf_movies_1['location_cluster'] = db.labels_ # one cluster label per row, stored in a new column

After plotting clustered points on a map:
* Some latitude/longitude coordinates didn't match the location they were listed for
* Some points were improperly clustered or treated as stand-alone points

Updated values to correct.

In [None]:
# Updates:
# Hand corrections to the DBSCAN labels and to a few coordinates. The
# statements run top to bottom and the order matters: cluster 3 is first folded
# into cluster 0, and only then is label 3 reused for other titles.

sf_movies_1.loc[sf_movies_1['location_cluster'] == 3, 'location_cluster'] = 0
sf_movies_1.loc[sf_movies_1['title'] == 'Under the Tuscan Sun', 'location_cluster'] = 3
sf_movies_1.loc[sf_movies_1['title'] == 'Kamikaze Hearts', 'location_cluster'] = 3
sf_movies_1.loc[sf_movies_1['title'] == 'Big Sur', 'location_cluster'] = 12
sf_movies_1.loc[sf_movies_1['title'] == 'Live Nude Girls Unite', 'location_cluster'] = 12
sf_movies_1.loc[sf_movies_1['title'] == 'Until the End of the World', 'location_cluster'] = 12
sf_movies_1.loc[sf_movies_1['title'] == 'The Ten Commandments', 'location_cluster'] = -1
sf_movies_1.loc[sf_movies_1['title'] == 'Memoirs of an Invisible Man', 'location_cluster'] = 13
sf_movies_1.loc[sf_movies_1['title'] == 'Hard to Hold', 'location_cluster'] = 13
sf_movies_1.loc[sf_movies_1['title'] == 'Shoot the Moon', 'location_cluster'] = 13
sf_movies_1.loc[sf_movies_1['title'] == 'Midnight Lace', 'location_cluster'] = 13
# Row labels are paired with values by position: the first seven rows get 14, the last two get 15.
sf_movies_1.loc[[25, 50, 55, 58, 59, 63, 68, 20, 51], 'location_cluster'] = [14] * 7 + [15] * 2
sf_movies_1.loc[sf_movies_1['title'] == 'Tucker: The Man and His Dream', 'location_cluster'] = 9
# Same positional pairing: two rows each for 16, 17 and 18, then row 67 set to -1.
sf_movies_1.loc[
    [
        4, 43,               # Cluster 16
        5, 65,               # Cluster 17
        28, 64,              # Cluster 18
        67                   # Cluster -1
    ],
    'location_cluster'
] = (
    [16] * 2 +               # Cluster 16
    [17] * 2 +               # Cluster 17
    [18] * 2 +               # Cluster 18
    [-1]                     # Cluster -1
)

# Replace the coordinates of rows 42 and 19 with hand-entered values.
sf_movies_1.loc[42, ['latitude', 'longitude']] = [37.77927, -122.41923]
sf_movies_1.loc[19, ['latitude', 'longitude']] = [37.79513, -122.40299]

# Row 19 also gets a new location text.
sf_movies_1.loc[19, 'locations'] = "600 Montgomery St."

In [None]:
# Number of rows carrying each cluster label, largest first.
cluster_sizes = sf_movies_1["location_cluster"].value_counts()
print("\n Value Counts of Each Cluster:\n", cluster_sizes, sep="\n")

In [None]:
# Visualize updated sf_movies_1 points:

# Create base map:
sf_map = folium.Map(location=[37.7749, -122.4194], zoom_start=12, tiles = "CartoDB positron")
marker_cluster = MarkerCluster().add_to(sf_map)

# Add points from sf_tourist_spots to map as blue markers:
for index, row in sf_tourist_spots.iterrows():
    # quote_plus percent-encodes the whole address (spaces become '+'), so
    # characters such as '&', '#' or quotes cannot cut the query short or
    # break out of the href attribute.
    address_link = f"https://www.google.com/maps/search/?api=1&query={quote_plus(row['Address'])}"
    popup_content = f"""Name: {row['Name']}<br><br>
                        Address: <a href="{address_link}" target="_blank">{row['Address']}</a>"""
    folium.Marker(
        location=[row['Latitude'], row['Longitude']],
        popup=folium.Popup(popup_content, max_width=150),
        icon=folium.Icon(color='blue')
    ).add_to(marker_cluster)

# Add points from sf_movies_1 to the map as red markers:
for index, row in sf_movies_1.iterrows():
    # quote_plus percent-encodes the whole location text (spaces become '+'),
    # so characters such as '&', '#' or quotes cannot cut the query short or
    # break out of the href attribute.
    address_link = f"https://www.google.com/maps/search/?api=1&query={quote_plus(row['locations'])}"
    popup_content = f"""Title: {row['title']}<br><br>
                        Current Index: {row.name}<br><br>
                        Cluster #: {row['location_cluster']}<br><br>
                        Locations: <a href="{address_link}" target="_blank">{row['locations']}</a>"""
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=folium.Popup(popup_content, max_width=150),
        icon=folium.Icon(color='red')
    ).add_to(marker_cluster)

# Add search functionality for both datasets:
  # Search keys allow for case-insensitive and partial matching queries
# Each search_key is the lower-cased concatenation of the listed columns,
# separated by single spaces. A missing value in any of those columns makes
# that row's search_key missing as well.
# NOTE(review): these columns are built after the markers were added to
# marker_cluster and are never passed to any marker, so presumably the Search
# controls below cannot see them — confirm against the folium Search docs.
# NOTE(review): both controls search the same layer with the same label and
# differ only in their placeholder text — confirm two controls are intended.

sf_tourist_spots['search_key'] = sf_tourist_spots['Name'].str.lower() + ' ' + sf_tourist_spots['Address'].str.lower()
sf_movies_1['search_key'] = (
    sf_movies_1['title'].str.lower() + ' ' +
    sf_movies_1['release_year'].astype(str) + ' ' +
    sf_movies_1['locations'].str.lower() + ' ' +
    sf_movies_1['director'].str.lower() + ' ' +
    sf_movies_1['writer'].str.lower() + ' ' +
    sf_movies_1['genres'].str.lower() + ' ' +
    sf_movies_1['release_decade'].astype(str)
)

search_tourist_spots = Search(
    layer=marker_cluster,
    search_label='search_key',
    placeholder='Search for tourist spots...',
    collapsed=False
).add_to(sf_map)

search_movies = Search(
    layer=marker_cluster,
    search_label='search_key',
    placeholder='Search for movies...',
    collapsed=False
).add_to(sf_map)

# Add map legend:
# The template is a Jinja macro holding two fixed-position boxes anchored to
# the bottom-left corner: a bordered white box with one coloured swatch per
# marker type, and a blurred, semi-transparent white box with a lower z-index.
legend_html = """
{% macro html(this, kwargs) %}
<div style="
    position: fixed;
    bottom: 50px;
    left: 50px;
    width: 250px;
    height: 80px;
    z-index:9999;
    font-size:14px;
    border: 3px solid #333; /* Dark border color */
    border-radius: 5px; /* Optional: rounded corners for the border */
    padding: 10px; /* Optional: padding inside the legend */
    background-color: #ffffff;
    ">
    <p><a style="color:#d24831;font-size:150%;margin-left:20px;">◼</a>&emsp;Film Locations</p>
    <p><a style="color:#38aadd;font-size:150%;margin-left:20px;">◼</a>&emsp;Tourist Spots</p>
</div>
<div style="
    position: fixed;
    bottom: 50px;
    left: 50px;
    width: 150px;
    height: 80px;
    z-index:9998;
    font-size:14px;
    background-color: #ffffff;
    filter: blur(8px);
    -webkit-filter: blur(8px);
    opacity: 0.7;
    ">
</div>
{% endmacro %}
"""

# Wrap the template in a branca MacroElement by assigning its _template attribute.
legend = branca.element.MacroElement()
legend._template = branca.element.Template(legend_html)

# Add the legend to the map
# This is the last expression of the cell, so its return value is what the
# notebook displays — presumably the map itself; confirm.
sf_map.add_child(legend)

In [None]:
# Export sorted sf_movies_1 to CSV:

# Write the frame without its index column, then hand the file to Colab's
# files.download — presumably a browser download; confirm.
# The google.colab import only resolves inside a Colab runtime, so this cell
# is expected to fail elsewhere.
sf_movies_1.to_csv('sorted_sf_movies_1.csv', index=False)
from google.colab import files
files.download('sorted_sf_movies_1.csv')

In [None]:
# For sf_movies_2:
# Group filming locations by proximity with DBSCAN. The haversine metric works
# on [latitude, longitude] in radians, so the coordinates are converted before
# fitting and `eps` is an angle in radians rather than a distance in metres.

coords_1 = sf_movies_2[['latitude', 'longitude']].values
db_1 = DBSCAN(
    eps=0.0001,
    min_samples=2,
    algorithm='ball_tree',
    metric='haversine',
)
db_1.fit(np.radians(coords_1))
sf_movies_2['location_cluster'] = db_1.labels_  # DBSCAN labels noise points -1

After plotting the clustered points on a map, two problems stood out:
* Some latitude/longitude coordinates did not match the location they were listed for
* Some points were either assigned to the wrong cluster or left as stand-alone points

The next cell corrects these values manually.

In [None]:
# Updates:

# Remove four rows by index label. The drop is in-place, so re-running this
# cell fails once the labels are gone.
sf_movies_2.drop([47, 1227, 331, 626], inplace=True)

# Overwrite the coordinates of 39 rows. Labels and [latitude, longitude] pairs
# are matched by position: the i-th pair below is written to the i-th label.
sf_movies_2.loc[
    [
        374, 840, 850, 69, 797, 319, 478, 647, 293, 864, 1241, 17, 1015, 1292,
        633, 375, 661, 480, 294, 1589, 1698, 1751, 316, 697, 1785, 320, 1822,
        1244, 1768, 299, 585, 1389, 1783, 388, 676, 728, 1298, 1545, 115
    ],
    ['latitude', 'longitude']
] = [
    [37.770614, -122.467507], [37.77849, -122.38951], [37.77849, -122.38951],
    [37.7912859, -122.4109353], [37.7922699, -122.4024322], [37.74755815683427, -122.38623370685208],
    [37.7618367, -122.4352533], [37.7618367, -122.4352533], [37.756248, -122.473052],
    [37.782931, -122.465056], [37.782471, -122.495377], [37.799165, -122.404097],
    [37.791841, -122.410828], [37.800916, -122.410088], [37.777880, -122.424820],
    [37.79232, -122.410750], [37.79232, -122.410750], [37.7846145, -122.3993685],
    [37.7723689, -122.3891842], [37.7767263, -122.3901611], [37.7767263, -122.3901611],
    [37.7767263, -122.3901611], [37.7483432, -122.4593798], [37.7483432, -122.4593798],
    [37.7483432, -122.4593798], [37.7383021, -122.3960362], [37.7511799, -122.4059027],
    [37.7704520788448, -122.46765842193204], [37.7704520788448, -122.46765842193204],
    [37.79510, -122.40699], [37.79504, -122.40754], [37.800916, -122.410088],
    [37.800916, -122.410088], [37.800916, -122.410088], [37.800916, -122.410088],
    [37.800916, -122.410088], [37.800916, -122.410088], [37.800916, -122.410088],
    [37.789420, -122.406940]
]


# Give every row of DBSCAN cluster 1603 / 17 a single shared coordinate.
# Note the first statement lists the columns as (longitude, latitude), so its
# values are in that order too; the second uses (latitude, longitude).
sf_movies_2.loc[sf_movies_2['location_cluster'] == 1603, ['longitude', 'latitude']] = [-122.38758, 37.76056]
sf_movies_2.loc[sf_movies_2['location_cluster'] == 17, ['latitude', 'longitude']] = [37.77927, -122.41923]

# Hand-set the tourist-spot columns for the rows whose coordinates were changed.
# Both columns are recomputed for the whole frame in a later cell.
sf_movies_2.loc[sf_movies_2['location_cluster'] == 17, 'min_distance_to_tourist_spot'] = 0
sf_movies_2.loc[sf_movies_2['location_cluster'] == 17, 'is_tourist_spot'] = 1
sf_movies_2.loc[293, 'min_distance_to_tourist_spot'] = 0
sf_movies_2.loc[293, 'is_tourist_spot'] = 1
sf_movies_2.loc[299, 'min_distance_to_tourist_spot'] = 352.321496
sf_movies_2.loc[299, 'is_tourist_spot'] = 1
sf_movies_2.loc[585, 'min_distance_to_tourist_spot'] = 400.874045
sf_movies_2.loc[585, 'is_tourist_spot'] = 1
sf_movies_2.loc[[840, 850], 'min_distance_to_tourist_spot'] = 128.162086
sf_movies_2.loc[[840, 850], 'is_tourist_spot'] = 1
sf_movies_2.loc[115, 'min_distance_to_tourist_spot'] = 176.258433
sf_movies_2.loc[115, 'is_tourist_spot'] = 1
# NOTE(review): 136262.339396 is far larger than every other distance set here
# (all under 500) - confirm it is not a misplaced decimal point.
sf_movies_2.loc[[1389, 1783, 388, 676, 728, 1298, 1545], 'min_distance_to_tourist_spot'] = 136262.339396
sf_movies_2.loc[[1389, 1783, 388, 676, 728, 1298, 1545], 'is_tourist_spot'] = 0

# Replace the location text of individual rows (selected by index label).
sf_movies_2.loc[374, 'locations'] = "Golden Gate Music Concourse"
sf_movies_2.loc[682, 'locations'] = "San Francisco Fire Station 39"
sf_movies_2.loc[825, 'locations'] = "Stonestown Family YMCA"
sf_movies_2.loc[[840, 850, 1021], 'locations'] = "Oracle Park"
sf_movies_2.loc[[478, 647], 'locations'] = "Castro St. between 17th & 18th"
sf_movies_2.loc[[1389, 1783, 388, 676, 728, 1298, 1545], 'locations'] = "Washington Square Park"

# Fold clusters 15 and 19 into cluster 0, then assign clusters by substring
# match on the location text. `na=False` makes rows with a missing location
# count as non-matching. Statements run top to bottom, so when a row matches
# more than one pattern the last matching statement decides its cluster.
sf_movies_2.loc[sf_movies_2['location_cluster'].isin([15, 19]), 'location_cluster'] = 0
sf_movies_2.loc[sf_movies_2['locations'].str.contains('Fairmont Hotel', na=False), 'location_cluster'] = 36
sf_movies_2.loc[sf_movies_2['locations'].str.contains('1000 Mason St', na=False), 'location_cluster'] = 36
sf_movies_2.loc[sf_movies_2['locations'].str.contains('Mark Hopkins', na=False), 'location_cluster'] = 36
sf_movies_2.loc[sf_movies_2['locations'].str.contains('Grace Cathedral', na=False), 'location_cluster'] = 36
sf_movies_2.loc[sf_movies_2['locations'].str.contains('Chinatown', na=False), 'location_cluster'] = 11
sf_movies_2.loc[sf_movies_2['locations'].str.contains('Waverly', na=False), 'location_cluster'] = 11
sf_movies_2.loc[sf_movies_2['locations'].str.contains('666 Filbert', na=False), 'location_cluster'] = 33
sf_movies_2.loc[sf_movies_2['locations'].str.contains('Golden Gate Park', na=False), 'location_cluster'] = 9
sf_movies_2.loc[sf_movies_2['locations'].str.contains('Golden Gate National Recreation Area', na=False), 'location_cluster'] = 15
sf_movies_2.loc[sf_movies_2['locations'].str.contains('Coit Tower', na=False), 'location_cluster'] = 41
sf_movies_2.loc[sf_movies_2['locations'].str.contains('Fine Arts', na=False), 'location_cluster'] = 42


# Per-row cluster overrides, matched by position (i-th label gets i-th value).
sf_movies_2.loc[[293, 840, 850, 1, 1021, 69, 797, 319], 'location_cluster'] = [-1, 27, 27, 27, 27, 36, 0, -1]

# Bulk assignment: one flat list of row labels and a parallel list of cluster
# ids built as `[id] * count`. The two lists are matched purely by position, so
# each count must equal the number of labels in the corresponding commented group.
sf_movies_2.loc[
    [
        518, 739, 1194, 1241, 1608, 1862,                # Cluster 34
        602, 1133, 1157, 1762, 1766,                    # Cluster 35
        1132, 896, 902, 144, 1890, 46, 1895, 1109,      # Cluster 36
        763, 745, 150, 1015, 1180, 1336, 20, 1887,
        576, 1627, 541, 903, 436, 783, 578, 105,
        659, 558, 1896, 1240, 1805, 1190, 848,
        330, 336, 340, 686, 966, 970, 974,              # Cluster 37
        968, 1548, 1822,                                # Cluster 0
        825, 294, 480,                                  # Cluster -1
        700, 999,                                       # Cluster 39
        776,                                            # Cluster 16
        320,                                            # Cluster 37
        975, 1167,                                      # Cluster 28
        1589, 1698, 1751,                               # Cluster 38
        985                                             # Cluster 21
    ],
    'location_cluster'
] = (
    [34] * 6 +                                        # Cluster 34
    [35] * 5 +                                        # Cluster 35
    [36] * 31 +                                       # Cluster 36
    [37] * 7 +                                        # Cluster 37
    [0] * 3 +                                         # Cluster 0
    [-1] * 3 +                                        # Cluster -1
    [39] * 2 +                                        # Cluster 39
    [16] * 1 +                                        # Cluster 16
    [37] * 1 +                                        # Cluster 37
    [28] * 2 +                                        # Cluster 28
    [38] * 3 +                                        # Cluster 38
    [21] * 1                                          # Cluster 21
)

# Assign every listed row label to cluster 11 (a single scalar for all of them).
sf_movies_2.loc[
    [
        299, 585, 707, 711, 1431, 742, 980, 238, 1837, 27, 1740, 1232, 1101, 348, 593,
        1446, 1623, 583, 1209, 1234, 1447, 856, 1407, 1554, 86, 1907, 45, 386, 391, 395,
        49, 1524, 147, 139, 429,                             # Cluster 11 (First Set)
        118, 157, 1624, 155, 737, 1617, 646, 1581, 1616, 888, 1150, 648, 1533, 457, 590,
        5, 370, 1386, 986, 854, 1660, 1739, 1789, 1891, 1905,
        7, 810, 387, 589, 638, 1097, 1035, 1530, 25,           # Cluster 11 (Second Set)
        710, 729, 305, 1030, 453, 161, 367, 1029, 135, 380,
        774, 104, 740, 1433, 1649, 1639, 70, 71, 662, 1088,
        89, 1120, 1093, 1421, 1860, 838, 206, 81, 709, 785,
        811, 108, 1912, 1343, 778, 1344                        # Cluster 11 (Third Set)
    ],
    'location_cluster'
] = 11


# One statement per target cluster; each writes a single cluster id to all the
# listed row labels. Order matters: a label that was given a cluster earlier in
# this cell keeps whichever value is written last (e.g. 968 ends at 8 and 319
# ends at 0 because of the statements below).
sf_movies_2.loc[[1331, 490, 491, 442, 1558, 1886, 1223, 48], 'location_cluster'] = 41
sf_movies_2.loc[[1816, 852, 114, 1412, 44, 1025, 747, 1185, 1647, 82, 1615, 1405, 1763], 'location_cluster'] = 19
sf_movies_2.loc[
    [1806, 68, 152, 378, 553, 1840, 19, 158, 357, 434, 544, 859, 981, 1149, 1264, 1283, 1393, 1880, 430, 444, 1765,
     193, 841, 1348, 1881],
    'location_cluster'] = 29
sf_movies_2.loc[
    [124, 361, 741, 1037, 120, 543, 435, 1770, 1278, 546, 440, 289, 847, 1795, 1842, 839, 1864, 85, 242, 548, 577,
     773, 1016, 1488, 1514, 1515, 1587, 1689, 1731, 1747, 1788, 1867, 1878, 21, 505, 1124, 1213, 1354, 1621, 130, 1594],
    'location_cluster'] = 40
sf_movies_2.loc[[494, 249, 136, 1833, 1728, 1285, 1248, 640], 'location_cluster'] = 40
sf_movies_2.loc[[1451], 'location_cluster'] = 15
sf_movies_2.loc[[666, 1757, 406, 864, 116, 751, 874, 726, 693, 529, 373, 1110, 808, 802, 534], 'location_cluster'] = 4
sf_movies_2.loc[[1439, 522, 833, 233], 'location_cluster'] = 10
sf_movies_2.loc[[968, 190], 'location_cluster'] = 8
sf_movies_2.loc[[319], 'location_cluster'] = 0
sf_movies_2.loc[
    [1389, 1783, 388, 676, 728, 1298, 1545, 1111, 196, 1302, 843, 535, 220, 1300, 1316, 1779, 355, 160, 1436, 1443,
     263, 1564, 1458, 353, 281, 1457, 1116, 673, 397, 1305, 1456, 1392, 723, 1315, 1179, 1909, 1108, 1544],
    'location_cluster'] = 33
sf_movies_2.loc[
    [478, 647, 1052, 1549, 322, 317, 1199, 963, 835, 642, 463, 652, 644, 1087, 469, 470, 462, 474, 1314, 481, 596,
     925, 345, 219, 476, 928, 1068, 254, 350],
    'location_cluster'] = 18
sf_movies_2.loc[[861, 935, 374, 377], 'location_cluster'] = 9


# Manual assignment of rows to clusters 42-58.
# Each cluster id is stored next to its own list of row labels, so a row can
# only ever receive the id it is listed under. Keeping ids and labels together
# avoids maintaining a separate `[id] * count` list whose counts must be kept
# in step with the label groups by hand; any count that drifts shifts every
# following group by one position.
manual_cluster_rows = {
    42: [1168],
    43: [1154, 1573, 1638, 1780, 1804, 554, 1401, 265, 1566],
    44: [1646, 1629, 1645, 971, 1188, 1632, 416, 2, 1640, 1146, 1635, 777],
    45: [1444, 868, 288, 993, 1000, 873, 867, 531, 992],
    46: [1487, 63, 1125, 441, 121],
    47: [133, 550, 423, 134, 1551, 67, 122, 270, 772, 1449, 428, 530, 536],
    48: [1796, 1042, 1374, 1911, 1644, 1034, 37, 1595, 1127, 1239, 1653, 523, 1472, 504, 1883,
         1496, 248, 75, 194, 594, 1676, 524, 582, 156, 1318, 1210, 1504, 1902, 1119],
    49: [1040, 1415, 1756, 1916, 29, 1176, 1178, 696, 691, 1625, 131, 142, 1658, 579, 1175,
         1855, 1438, 532, 1811],
    50: [510, 520, 295, 259, 1252, 1177, 400, 487, 258, 40, 1137, 291, 796, 1665, 1746, 28,
         1815, 1191, 303, 509],
    51: [1569, 396, 1460, 580, 1814, 1445, 500, 1465, 354, 1152, 43, 92, 192, 743, 300, 551,
         744, 60, 738, 1291, 107, 483, 1113, 1310, 33, 1171, 1118, 232],
    52: [9, 827, 845, 1350, 346, 16, 846, 1013, 1303, 1790, 1403, 1138, 342, 1688, 1572, 1769,
         1004, 403, 1032, 1385, 1565, 1237, 222, 1217, 151, 1112, 261, 1214, 1807, 1043, 231,
         1853, 988],
    53: [358, 1159, 1247, 1500, 1708, 1831, 1470, 141, 1145, 628, 488, 1812, 1845, 4, 1879, 1376],
    54: [1664, 556, 1399, 559, 1538, 514, 1117],
    55: [1440, 758, 1266, 679, 692],
    56: [1754, 306, 1873, 566, 568, 329, 1710, 1714, 1707, 1742, 1686, 296, 913, 899, 1065,
         921, 1866],
    57: [1821, 705, 1329, 1332, 1141, 1502, 432, 1268],
    58: [1102, 1477, 732, 1490, 1507, 1861, 438, 439, 801, 392, 1570, 1575, 1031, 1107, 752,
         1220, 1506, 73, 853, 654, 670, 132, 11, 885, 128, 597, 672, 12, 1221, 989, 1637, 1224,
         1102, 343, 269, 1522, 1384, 394, 720],
}

# Dicts iterate in insertion order, so clusters are written 42 -> 58; if a
# label were ever listed under two ids, the later cluster would win.
for cluster_id, row_labels in manual_cluster_rows.items():
    sf_movies_2.loc[row_labels, 'location_cluster'] = cluster_id

DBSCAN was still not clustering as desired. So I separated the two largest clusters (0, -1) into a new DataFrame (temp_sort) and iteratively applied NearestNeighbors until all values were appropriately clustered. After each iteration, I manually mapped the new clusters to the old clusters and repeated the process with remaining points.

Note: The code below is the general code used for the above process. After each iteration, I deleted the accurately clustered rows from the file, before using it again.

In [None]:
# Assign every still-unclustered point in the working file to its nearest
# existing cluster centroid, recording the suggestion in `newly_clustered`.
temp_sort = pd.read_csv("semi_clustered_sf_movies_2.csv")

# Combine rows with 0 and -1 into -1:
temp_sort.loc[temp_sort['location_cluster'] == 0, 'location_cluster'] = -1

# Calculate centroids of existing clusters:
clustered_points = temp_sort[temp_sort['location_cluster'] != -1]
centroids = clustered_points.groupby('location_cluster')[['latitude', 'longitude']].mean()

# Separate unclustered points. Take an explicit copy: a column is written to
# this frame below, and writing into a bare slice of temp_sort triggers
# pandas' SettingWithCopyWarning.
unclustered_mask = temp_sort['location_cluster'] == -1
unclustered_points = temp_sort[unclustered_mask].copy()
unclustered_coords = unclustered_points[['latitude', 'longitude']].values

# Use NearestNeighbors to find the closest centroid for each unclustered point.
# The haversine metric expects [latitude, longitude] in radians. Fit on a plain
# array so fit() and kneighbors() both receive the same kind of input (fitting
# on a DataFrame and querying with an array raises a feature-name warning).
nearest_neighbors = NearestNeighbors(n_neighbors=1, metric='haversine')
nearest_neighbors.fit(np.radians(centroids.to_numpy()))
distances, indices = nearest_neighbors.kneighbors(np.radians(unclustered_coords))

# Assign unclustered points to the nearest cluster:
# `indices` holds row positions in `centroids`; its index carries the cluster ids.
unclustered_points['location_cluster'] = centroids.index[indices.flatten()].values
temp_sort.loc[unclustered_mask, 'newly_clustered'] = unclustered_points['location_cluster'].values

In [None]:
# Updates:
# Unlike the earlier update cells, rows in this cell are selected by the value
# in the 'Unnamed: 0' column rather than by index label. Each statement writes
# one cluster id to every matching row; statements run in order, so an id
# listed in more than one statement ends up with the last cluster written.
# NOTE(review): presumably 'Unnamed: 0' holds row ids carried over from a CSV
# round-trip - confirm where the column comes from.

sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([1768, 1198, 1487, 1659, 396, 1516, 1721, 1818, 1089, 176, 771, 2051, 1348, 215, 1698, 2080, 1587, 404, 1260, 1894, 2031, 1671, 1156, 2074, 1750, 526, 1521, 2013, 2014, 104, 1253, 1798, 916, 390, 376, 1686, 2081, 226, 1187, 317, 1575, 596, 543, 1163, 1447, 1489, 2002, 109, 505, 1936, 1267, 1476, 1032, 1695]), 'location_cluster'] = 17
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([1538, 631, 1052, 1684, 180, 1646, 1431, 529, 592, 1618, 1972]), 'location_cluster'] = 70
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([533, 1400, 1042, 1300, 791, 1432, 1510, 1766, 579, 46, 1469, 1834, 1470]), 'location_cluster'] = 67
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([1566, 1685, 878, 2069, 1194, 528, 1071]), 'location_cluster'] = 78
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([1961, 1026, 221, 545, 184, 788, 1812]), 'location_cluster'] = 79
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([66, 95, 185, 259, 668, 1002, 1058, 1319, 1444, 1498, 1557, 1578, 1678, 1976, 1512]), 'location_cluster'] = 77
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([265, 419, 560, 591, 701, 871, 942, 1016, 1037, 1102, 1261, 1401, 1558, 1781, 1807, 1739, 2016, 2034, 1828]), 'location_cluster'] = 68
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([101, 111, 144, 1597, 1644, 1744, 191, 1953, 287, 383, 373, 425, 429, 450, 672, 738, 909, 946, 1040, 1160, 1205, 1266, 1491, 1548, 1909]), 'location_cluster'] = 59
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([927, 650, 689, 1879, 1952, 492, 1228, 536, 1564, 252, 2032, 1179, 225]), 'location_cluster'] = 66
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([15, 16, 17, 72, 155, 159, 208, 237, 501, 645, 654, 667, 953, 1157, 1165, 1271, 1380, 1420, 1590, 1732, 1977, 2043]), 'location_cluster'] = 69
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([567, 1390, 1649, 1999, 1577, 1585, 729, 1170, 1693, 2036, 608, 272, 1427, 1003, 326, 119, 890]), 'location_cluster'] = 62
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([2070, 1601, 1297, 1762, 1589, 858, 1738, 1243, 329]), 'location_cluster'] = 73
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([56, 223, 832, 964, 1000, 1111, 1272, 1281, 1301, 1428, 1429, 1650, 1895, 1989]), 'location_cluster'] = 64
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([63, 272, 427, 486, 496, 540, 572, 675, 773, 808, 960, 994, 1183, 1333, 1365, 1479, 1616, 1696, 1904]), 'location_cluster'] = 60
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([173, 249, 343, 449, 1199, 1310, 1445, 2018]), 'location_cluster'] = 76
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([1370, 1775, 1069, 1594, 530, 1887, 1801, 1131, 873, 2040, 638, 1144, 1435, 1805, 349, 402, 887, 933, 324]), 'location_cluster'] = 61
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([99, 162, 1625, 418, 422, 466, 570, 643, 989, 1080, 1151, 1159, 1209, 2050]), 'location_cluster'] = 65
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([30, 91, 241, 765, 797, 881, 910, 1059, 1175, 1263, 1305, 1468, 1583, 1720, 1799, 1876]), 'location_cluster'] = 63
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([706, 152, 36, 281, 1791, 149, 502, 563, 864, 1251]), 'location_cluster'] = 71
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([43, 140, 600, 732, 802, 945, 1031, 1737, 2017]), 'location_cluster'] = 75
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([86, 120, 263, 322, 553, 726, 1996, 1247, 1249, 1382]), 'location_cluster'] = 72
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([963, 74, 1407, 1246, 578, 1229, 589, 783, 574]), 'location_cluster'] = 74
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([368, 607, 814, 849, 870, 1009, 1177, 1565, 1655, 1749, 1772, 1824, 1898, 2066]), 'location_cluster'] = 80
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([34, 303, 770, 789, 885, 959, 1074, 1182, 1398, 1436, 1763, 1777]), 'location_cluster'] = 81
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([93, 143, 728, 861, 905, 995, 1129, 1191, 1434, 1517, 1940]), 'location_cluster'] = 82
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([518, 542, 883, 971, 1054, 1375, 1466, 1503, 1550, 1854, 2052]), 'location_cluster'] = 83
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([5, 179, 679, 682, 686, 904, 1142, 1282, 1352, 1627]), 'location_cluster'] = 84
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([623, 649, 674, 848, 974, 1460, 1675, 1719, 1724]), 'location_cluster'] = 85
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([94, 612, 1164, 1201, 1248, 1289, 1987, 2001]), 'location_cluster'] = 86
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([80, 746, 767, 1572, 1752, 2065, 298]), 'location_cluster'] = 87
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([490, 603, 1117, 1166, 1411, 1658]), 'location_cluster'] = 88
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([412, 820, 986, 1204, 1581, 1795]), 'location_cluster'] = 89
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([205, 424, 710, 1099, 1780, 1910]), 'location_cluster'] = 90
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([28, 110, 132, 485, 762, 1983]), 'location_cluster'] = 91
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([103, 381, 1109, 1374, 1965, 2064]), 'location_cluster'] = 92
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([1561, 1525, 1513, 1931, 1978, 823]), 'location_cluster'] = 93
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([116, 166, 454, 620, 1394, 1982]), 'location_cluster'] = 94
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([642, 816, 825, 1203, 1286, 1315, 1942, 2053]), 'location_cluster'] = 69
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([97, 147, 677, 684, 847, 1135, 1472, 1582]), 'location_cluster'] = 91
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([68, 121, 593, 707, 886, 949, 1279, 1388, 1846]), 'location_cluster'] = 94
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([1, 4, 6, 33, 198, 270, 308, 354, 382, 405, 461, 509, 559, 691, 719, 733, 818, 830, 939, 968, 1153, 1212, 1235, 1255, 1360, 1471, 1549, 1559, 1614, 1621, 1626, 1636, 1664, 1713, 1755, 1849, 1932, 1949, 1997]), 'location_cluster'] = 95
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([255, 271, 280, 489, 605, 736, 800, 1325, 1462, 1478, 1506, 1600, 1826]), 'location_cluster'] = 96
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([58, 399, 497, 558, 713, 837, 1038, 1050, 1295, 1296, 1349, 1413, 1465, 1562, 1622, 1852, 1908, 2026]), 'location_cluster'] = 97
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([115, 188, 464, 488, 549, 759, 850, 1121, 1337, 1573, 1823, 1890, 1916, 1921]), 'location_cluster'] = 98
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([639, 780, 811, 1141, 1275, 1488, 1701, 2060]), 'location_cluster'] = 99
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([432, 793, 992, 1120, 1529, 1892, 2029]), 'location_cluster'] = 70
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([124, 286, 361, 610, 742, 845, 1154, 1188, 1224, 1283, 1367, 1386, 1416, 1483, 1708, 1747, 1816, 1847]), 'location_cluster'] = 100
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([18, 24, 42, 79, 81, 112, 139, 174, 227, 234, 250, 257, 258, 268, 293, 296, 316, 331, 334, 353, 374, 407, 414, 431, 438, 448, 453, 465, 473, 477, 498, 523, 538, 552, 555, 577, 580, 625, 640, 655, 699, 740, 778, 819, 838, 844, 851, 879, 888, 895, 901, 937, 999, 1020, 1024, 1039, 1041, 1043, 1053, 1073, 1084, 1086, 1110, 1116, 1132, 1145, 1152, 1185, 1207, 1219, 1227, 1236, 1252, 1322, 1344, 1366, 1373, 1395, 1415, 1452, 1455, 1490, 1495, 1542, 1551, 1567, 1586, 1603, 1628, 1661, 1666, 1670, 1673, 1691, 1694, 1726, 1742, 1748, 1757, 1760, 1784, 1792, 1809, 1836, 1837, 1845, 1864, 1867, 1925, 1939, 1958, 1985, 2022, 2023, 2024, 2033, 2039, 2057]), 'location_cluster'] = 101
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([3, 312, 318, 462, 615, 894, 1113, 1317, 1343, 1387, 1534, 1975, 2006]), 'location_cluster'] = 102
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([494, 993, 1098, 1134, 1497, 1533, 1817]), 'location_cluster'] = 103
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([76, 416, 544, 899, 1014, 1245, 1796]), 'location_cluster'] = 104
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([85, 148, 183, 506, 1425, 1707]), 'location_cluster'] = 105
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([70, 98, 493, 875, 907, 1075]), 'location_cluster'] = 106
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([338, 573, 785, 1104, 1331, 1877]), 'location_cluster'] = 107
sf_movies_2.loc[sf_movies_2['Unnamed: 0'].isin([731, 801, 813, 1259, 1652, 1881]), 'location_cluster'] = 108

Did a final check to ensure accuracy of all points and clusters. Updates noted below.

In [None]:
# Updates:

# Remove five rows by index label (in-place; re-running fails once they are gone).
sf_movies_2.drop([106, 181, 1641, 1267, 109], inplace=True)

# Per-row coordinate corrections, one row label per statement.
sf_movies_2.loc[479, ['latitude', 'longitude']] = [37.784140, -122.401620]
sf_movies_2.loc[482, ['latitude', 'longitude']] = [37.793837, -122.406231]
sf_movies_2.loc[506, ['latitude', 'longitude']] = [37.790440, -122.410338]
sf_movies_2.loc[668, ['latitude', 'longitude']] = [37.745569, -122.420195]
sf_movies_2.loc[755, ['latitude', 'longitude']] = [37.797051, -122.422065]
sf_movies_2.loc[799, ['latitude', 'longitude']] = [37.808495, -122.409605]
sf_movies_2.loc[855, ['latitude', 'longitude']] = [37.785359, -122.403090]
sf_movies_2.loc[996, ['latitude', 'longitude']] = [37.723073, -122.423116]
sf_movies_2.loc[1019, ['latitude', 'longitude']] = [37.776632, -122.392274]
# NOTE(review): label 1227 is also in the drop list of the first "Updates" cell.
# If that row has not been restored since, this assignment would add a new,
# otherwise-empty row labelled 1227 at the end of the frame - possibly the row
# that the `[:-1]` at the bottom of this cell removes. Confirm.
sf_movies_2.loc[1227, ['latitude', 'longitude']] = [37.781671, -122.393892]
sf_movies_2.loc[874, ['latitude', 'longitude']] = [37.782944, -122.464742]
sf_movies_2.loc[643, ['latitude', 'longitude']] = [37.775870, -122.416642]
sf_movies_2.loc[1584, ['latitude', 'longitude']] = [37.773708, -122.387381]
sf_movies_2.loc[224, ['latitude', 'longitude']] = [37.794180, -122.406314]
sf_movies_2.loc[187, ['latitude', 'longitude']] = [37.795071, -122.403203]
sf_movies_2.loc[800, ['latitude', 'longitude']] = [37.770052, -122.447178]
sf_movies_2.loc[866, ['latitude', 'longitude']] = [37.792286, -122.410853]
sf_movies_2.loc[1078, ['latitude', 'longitude']] = [37.776231, -122.408298]
sf_movies_2.loc[1126, ['latitude', 'longitude']] = [37.790740, -122.389669]
sf_movies_2.loc[143, ['latitude', 'longitude']] = [37.781927, -122.407495]
sf_movies_2.loc[383, ['latitude', 'longitude']] = [37.792220, -122.412121]
sf_movies_2.loc[756, ['latitude', 'longitude']] = [37.780927, -122.514368]
sf_movies_2.loc[817, ['latitude', 'longitude']] = [37.761536, -122.423732]
sf_movies_2.loc[892, ['latitude', 'longitude']] = [37.795495, -122.403315]
sf_movies_2.loc[1062, ['latitude', 'longitude']] = [37.826498, -122.507031]
sf_movies_2.loc[1603, ['latitude', 'longitude']] = [37.749017, -122.387184]
sf_movies_2.loc[241, ['latitude', 'longitude']] = [37.752627, -122.391733]
sf_movies_2.loc[277, ['latitude', 'longitude']] = [37.779246, -122.395346]
sf_movies_2.loc[213, ['latitude', 'longitude']] = [37.780492, -122.411331]
sf_movies_2.loc[344, ['latitude', 'longitude']] = [37.764125, -122.421900]
sf_movies_2.loc[379, ['latitude', 'longitude']] = [37.766760, -122.402784]
sf_movies_2.loc[420, ['latitude', 'longitude']] = [37.760558, -122.399245]
sf_movies_2.loc[443, ['latitude', 'longitude']] = [37.773868, -122.384360]
sf_movies_2.loc[1664, ['latitude', 'longitude']] = [37.791459, -122.427482]
sf_movies_2.loc[914, ['latitude', 'longitude']] = [37.752818, -122.409123]
sf_movies_2.loc[934, ['latitude', 'longitude']] = [37.726796, -122.444600]
sf_movies_2.loc[652, ['latitude', 'longitude']] = [37.774493, -122.410830]

# Cluster overrides, matched by position: the i-th label gets the i-th value.
# The label list contains repeats: 996 appears twice (paired with 99, then 14)
# and 652 three times (paired with 80 each time).
# NOTE(review): confirm which cluster is meant for 996.
sf_movies_2.loc[
    [
        17, 1130, 1839, 1557, 479, 482, 506, 668, 755, 799, 855, 996, 1019, 1227, 611, 643, 1584,
        187, 800, 866, 1078, 1126, 652,
        143, 383, 756, 817, 892, 1062, 1603, 241, 277,
        213, 344, 379, 420, 1488, 1853, 978, 724, 996,
        443, 1664, 914, 934, 652
    ],
    'location_cluster'
] = [
    58, 35, 35, 35, 70, 11, 95, 98, 57, 40, 70, 99, 38, 93, 9, 96, 38,
    11, 61, 36, 80, 91, 80,
    66, 36, 23, 83, 11, 30, 13, 97, 93,
    64, 71, 94, 81, 91, 42, 62, 103, 14,
    38, 56, 60, 6, 80
]


# Title-based fixes to the rating and genre text.
sf_movies_2.loc[sf_movies_2['title'] == 'The OA Part II', 'imdb_rating'] = 8.6

sf_movies_2.loc[sf_movies_2['title'] == 'Sonic the Hedgehog', 'genres'] = "Action Adventure Comedy Sci-Fi"

# Drops whatever row is currently last, by position. This is not idempotent:
# each re-run of the cell removes one more row.
sf_movies_2 = sf_movies_2[:-1]

In [None]:
# Manual follow-up overrides, written as row label -> cluster id.
follow_up_clusters = {
    837: 30.0,
    858: 30.0,
    979: 30.0,
    1292: 33.0,
    294: 0.0,
    1817: 31.0,
    1071: 109.0,
    1802: 109.0,
}

# Labels and values are passed as parallel lists in the same (insertion) order.
sf_movies_2.loc[list(follow_up_clusters.keys()), 'location_cluster'] = list(follow_up_clusters.values())

Re-applied the NearestNeighbors model idea from earlier for the remaining points in cluster 0.

In [None]:
# Separate sf_movies_2 into labeled vs. outliers (cluster 0 = rows still to be placed)
# NOTE(review): rows labelled -1 land in `labeled_df` and therefore get a
# centroid of their own, whereas the earlier temp_sort step treated 0 and -1
# alike as unclustered - confirm that is intended.
labeled_df = sf_movies_2[sf_movies_2['location_cluster'] != 0].copy()
outliers_df = sf_movies_2[sf_movies_2['location_cluster'] == 0].copy()

# Centroid (mean latitude and longitude) of each existing cluster, one row per
# cluster in order of first appearance.
centroids_df = (
    labeled_df
    .groupby('location_cluster', sort=False)[['latitude', 'longitude']]
    .mean()
    .rename_axis('cluster_id')
    .reset_index()
)  # columns: ['cluster_id', 'latitude', 'longitude']
centroids = centroids_df.to_dict('records')  # list-of-dicts form, kept for any later cell that reads it

if outliers_df.empty:
    # Nothing is left in cluster 0 (e.g. the cell is being re-run), and
    # kneighbors() rejects an empty query, so there is nothing to do.
    pass
else:
    # Fit a NearestNeighbors model on centroids.
    # NOTE(review): the default metric is Euclidean on raw degrees, unlike the
    # haversine metric used for DBSCAN and the temp_sort step. Left unchanged so
    # the assignments the later manual overrides were based on stay the same.
    knn = NearestNeighbors(n_neighbors=1)
    knn.fit(centroids_df[['latitude', 'longitude']])

    # Find the nearest centroid for each outlier (row positions in centroids_df)
    distances, indices = knn.kneighbors(outliers_df[['latitude', 'longitude']])

    # Assign each outlier to the nearest cluster
    outliers_df['location_cluster'] = centroids_df.iloc[indices[:, 0]]['cluster_id'].values

    # Update sf_movies_2 with the new values (aligned on the shared index labels)
    sf_movies_2.loc[outliers_df.index, 'location_cluster'] = outliers_df['location_cluster']

In [None]:
# Updates
# Final manual overrides, written as row label -> cluster id. They run after
# the nearest-centroid step, so they take precedence over it.

post_knn_clusters = {
    1119: 33.0,
    1811: 49.0,
    232: 51.0,
    1171: 51.0,
    1118: 51.0,
    988: 58.0,
    1376: 53.0,
    692: 55.0,
    1268: 57.0,
    1532: -1.0,
    963: 18.0,
    1709: 101.0,
    1668: 98.0,
}
sf_movies_2.loc[list(post_knn_clusters.keys()), 'location_cluster'] = list(post_knn_clusters.values())

# Remove row 1849 by index label (in-place).
sf_movies_2.drop(1849, inplace=True)

Updated "min_distance_to_tourist_spot" & "is_tourist_spot" columns after updating location cluster values.

In [None]:
# For tourist_spot:

sf_tourist_spots = pd.read_excel("SF Tourist Spots.xlsx") # pre-defined list of tourist spots

def calculate_distance(row, tourist_spots):
    """Return the geodesic distance in metres from one filming location to its nearest tourist spot.

    Parameters
    ----------
    row : mapping with 'latitude' and 'longitude' entries (one sf_movies_2 row).
    tourist_spots : DataFrame with 'Latitude' and 'Longitude' columns.

    Returns
    -------
    The smallest distance, in metres, between `row` and any row of `tourist_spots`.
    """
    location = (row['latitude'], row['longitude'])
    # Measure against the frame that was passed in, not the module-level
    # `sf_tourist_spots`, so the argument is actually honoured.
    distances = tourist_spots.apply(
        lambda spot: geodesic(location, (spot['Latitude'], spot['Longitude'])).meters,
        axis=1,
    )
    return distances.min()

sf_movies_2['min_distance_to_tourist_spot'] = sf_movies_2.apply(calculate_distance, tourist_spots = sf_tourist_spots, axis = 1)

def classify_by_threshold(df, threshold):
    """Label each row 1 if it lies within `threshold` of a tourist spot, else 0.

    Reads df['min_distance_to_tourist_spot']; the comparison is inclusive
    (distance <= threshold), and a missing distance is labelled 0.
    Returns an integer Series aligned with df's index.
    """
    within_threshold = df['min_distance_to_tourist_spot'].le(threshold)
    return within_threshold.astype('int64')

# Threshold search optimization:
# Try every threshold from 0 to 1000 in steps of 50 and record, for each one,
# how many rows end up labelled 1.
# NOTE(review): that count can only grow (or stay equal) as the threshold
# grows, so max() below returns the smallest threshold that reaches the highest
# count - confirm this is the selection rule intended.
thresholds = np.arange(0, 1001, 50)
results = []
for threshold in thresholds:
    labels = classify_by_threshold(sf_movies_2, threshold)
    sf_movies_2['is_tourist_spot'] = labels
    correct_classifications = labels.sum()
    results.append((threshold, correct_classifications))

# Finds optimal threshold (first entry with the largest count):
optimal_threshold, max_correct_classifications = max(results, key=lambda pair: pair[1])

# Final classification using the optimal threshold:
sf_movies_2['is_tourist_spot'] = classify_by_threshold(sf_movies_2, optimal_threshold)

In [None]:
# Show how many rows fall in each cluster (header line, blank line, then the counts).
cluster_sizes = sf_movies_2["location_cluster"].value_counts()
print("\n Value Counts of Each Cluster:\n", cluster_sizes, sep="\n")


 Value Counts of Each Cluster:

location_cluster
11.0     139
101.0    119
36.0      79
17.0      69
40.0      49
        ... 
7.0        2
39.0       2
109.0      2
13.0       2
22.0       1
Name: count, Length: 109, dtype: int64


In [None]:
# Visualize updated sf_movies_2 points:
from urllib.parse import quote_plus  # imported here because the shared import cell is outside this cell


def _maps_search_link(query):
    """Return a Google Maps search URL for `query`, with the query URL-encoded.

    quote_plus turns spaces into '+' and also escapes characters such as '&'
    and '#', which would otherwise cut the query short
    (e.g. "Castro St. between 17th & 18th").
    """
    return f"https://www.google.com/maps/search/?api=1&query={quote_plus(str(query))}"


def _marker_search_text(value):
    """Return `value` if it is a string, else '' so a missing key never reaches the map as NaN."""
    return value if isinstance(value, str) else ''


# Add search keys for both datasets first, so each marker can carry its own key:
  # Search keys allow for case-insensitive and partial matching queries

sf_tourist_spots['search_key'] = sf_tourist_spots['Name'].str.lower() + ' ' + sf_tourist_spots['Address'].str.lower()
sf_movies_2['search_key'] = (
    sf_movies_2['title'].str.lower() + ' ' +
    sf_movies_2['release_year'].astype(str) + ' ' +
    sf_movies_2['locations'].str.lower() + ' ' +
    sf_movies_2['director'].str.lower() + ' ' +
    sf_movies_2['writer'].str.lower() + ' ' +
    sf_movies_2['genres'].str.lower() + ' ' +
    sf_movies_2['release_decade'].astype(str)
)

# Create base map:
sf_map = folium.Map(location=[37.7749, -122.4194], zoom_start=12, tiles = "CartoDB positron")
marker_cluster = MarkerCluster().add_to(sf_map)

# Add points from sf_tourist_spots to map as blue markers.
# The search plugin looks its label up in each marker's options, so the search
# key is passed to the marker as the extra option `name`.
for index, row in sf_tourist_spots.iterrows():
    address_link = _maps_search_link(row['Address'])
    popup_content = f"""Name: {row['Name']}<br><br>
                        Address: <a href="{address_link}" target="_blank">{row['Address']}</a>"""
    folium.Marker(
        location=[row['Latitude'], row['Longitude']],
        popup=folium.Popup(popup_content, max_width=150),
        icon=folium.Icon(color='blue'),
        name=_marker_search_text(row['search_key'])
    ).add_to(marker_cluster)

# Add points from sf_movies_2 to the map as red markers:
for index, row in sf_movies_2.iterrows():
    address_link = _maps_search_link(row['locations'])
    popup_content = f"""Title: {row['title']}<br><br>
                          Current Index: {row.name}<br><br>
                          Cluster #: {row['location_cluster']}<br><br>
                          Locations: <a href="{address_link}" target="_blank">{row['locations']}</a>"""
    folium.Marker(
                    location=[row['latitude'], row['longitude']],
                    popup=folium.Popup(popup_content, max_width=150),
                    icon=folium.Icon(color='red'),
                    name=_marker_search_text(row['search_key'])
                ).add_to(marker_cluster)

# Add search functionality for both datasets.
# Both controls index the same marker_cluster layer, so each one searches
# tourist-spot and movie markers alike; only the placeholder text differs.
search_tourist_spots = Search(
    layer=marker_cluster,
    search_label='name',
    placeholder='Search for tourist spots...',
    collapsed=False
).add_to(sf_map)

search_movies = Search(
    layer=marker_cluster,
    search_label='name',
    placeholder='Search for movies...',
    collapsed=False
).add_to(sf_map)

# Add map legend:
# The legend is a Jinja macro holding two fixed-position <div>s: the visible
# legend box (z-index 9999) and a blurred white box placed behind it (z-index 9998).
# NOTE(review): the blurred box is 150px wide while the legend box is 250px -
# confirm the mismatch is intended.
legend_html = """
{% macro html(this, kwargs) %}
<div style="
    position: fixed;
    bottom: 50px;
    left: 50px;
    width: 250px;
    height: 80px;
    z-index:9999;
    font-size:14px;
    border: 3px solid #333; /* Dark border color */
    border-radius: 5px; /* Optional: rounded corners for the border */
    padding: 10px; /* Optional: padding inside the legend */
    background-color: #ffffff;
    ">
    <p><a style="color:#d24831;font-size:150%;margin-left:20px;">◼</a>&emsp;Film Locations</p>
    <p><a style="color:#38aadd;font-size:150%;margin-left:20px;">◼</a>&emsp;Tourist Spots</p>
</div>
<div style="
    position: fixed;
    bottom: 50px;
    left: 50px;
    width: 150px;
    height: 80px;
    z-index:9998;
    font-size:14px;
    background-color: #ffffff;
    filter: blur(8px);
    -webkit-filter: blur(8px);
    opacity: 0.7;
    ">
</div>
{% endmacro %}
"""

# Wrap the macro in a branca MacroElement; `_template` is a private attribute,
# set directly here to swap in the custom HTML.
legend = branca.element.MacroElement()
legend._template = branca.element.Template(legend_html)

# Add the legend to the map
# (as the last expression of the cell, add_child's return value is what gets displayed)
sf_map.add_child(legend)

In [None]:
# Write the sorted sf_movies_2 table to CSV (without the index column), then
# hand the file to the browser through the Colab download helper.

sorted_movies_2_csv = 'sorted_sf_movies_2.csv'
sf_movies_2.to_csv(sorted_movies_2_csv, index=False)
from google.colab import files
files.download(sorted_movies_2_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>