# Get Cinemark locations

#### Load Python tools and Jupyter config

In [1]:
import us
import json
import requests
import pandas as pd
import jupyter_black
import altair as alt
from time import sleep
import geopandas as gpd
from random import randint
from bs4 import BeautifulSoup
from vega_datasets import data as vega_data
from tqdm.notebook import tqdm, trange

In [2]:
# Enable the jupyter_black extension for this notebook session.
jupyter_black.load()

# Raise pandas' display limits so wide and long frames are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Turn off Altair's row limit on chart datasets.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
# Brand slug (used in export file names) and display name (used in chart
# titles and the "brand" column).
place, place_formal = "cinemark", "Cinemark"

# Marker color for the maps.
color = "#dd0000"

# Run date as YYYY-MM-DD; written to the "updated" column.
today = f"{pd.Timestamp.today():%Y-%m-%d}"

---

## Fetch

#### Headers for request

In [4]:
# Browser-like user agent sent with requests to the site. The value is one
# string, split across adjacent literals only to keep the lines short.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

#### Read and parse the theater directory page

In [5]:
# Site root; the hrefs collected from the directory page are appended to it
# to form each theater's full URL.
base_url = "https://www.cinemark.com"

In [6]:
# Download the full theater directory page.
# - The URL is built from base_url so the host is defined in one place.
# - The timeout bounds how long a stalled connection can block the notebook.
# - raise_for_status() stops here on an HTTP error instead of parsing an
#   error page as if it were the directory.
response = requests.get(
    f"{base_url}/full-theatre-list", headers=headers, timeout=30
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

#### Get a list of links to theaters in each state

In [7]:
# Every <div> carrying the "theatres-by-state" class (presumably one per
# state, going by the class name); the theater links are nested inside them.
theater_list = soup.find_all("div", attrs={"class": "theatres-by-state"})

In [8]:
# Absolute URL for every theater link found inside the state blocks.
# href=True restricts the search to anchors that actually have an href
# attribute, so a bare <a> cannot raise KeyError on the lookup.
links = [
    base_url + anchor["href"]
    for state_block in theater_list
    for anchor in state_block.find_all("a", href=True)
]

#### Remove any links for "closed" locations

In [9]:
# Keep only URLs that do not contain the word "closed". Example of a link
# that is dropped:
# https://www.cinemark.com/theatres/az-tucson/century-gateway-12-now-closed
filtered_links = [
    url
    for url in links
    if "closed" not in url
]

#### Loop through the list of links, requesting pages and parsing out features

In [10]:
theater_details = []


def _parse_theater_page(link, link_soup):
    """Extract one theater's details from its parsed detail page.

    Name, address and phone are read from the page's first
    ``application/ld+json`` script tag. Latitude and longitude are read from
    the ``data-src`` URL of the ``img-responsive lazyload`` image.

    Args:
        link: URL of the page; used only in the diagnostic messages.
        link_soup: BeautifulSoup document for the page.

    Returns:
        A dict of theater fields, or None (after printing the reason) when
        the page cannot be parsed.
    """
    script_tag = link_soup.find("script", type="application/ld+json")
    if not (script_tag and script_tag.string):
        print(
            f"No valid script tag found in {link}. It might be a rate limiting issue or the page structure has changed."
        )
        return None

    script = script_tag.string.strip()
    try:
        json_data = json.loads(script)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {link}: {e}")
        print("Problematic JSON:", script[:200])
        return None

    img_tag = link_soup.find("img", class_="img-responsive lazyload")
    if not (img_tag and "data-src" in img_tag.attrs):
        print(f"No map image with a data-src attribute found in {link}.")
        return None

    try:
        # The coordinate pair is the third "="-separated piece of the image
        # URL, written as "lat,lon"; the longitude piece still carries a
        # trailing "&key" that has to be removed.
        coordinates = img_tag["data-src"].split("=")[2].split(",")
        latitude = float(coordinates[0])
        longitude = float(coordinates[1].replace("&key", ""))

        address = json_data["address"][0]
        return {
            "location_name": json_data["name"],
            "street": address["streetAddress"],
            "city": address["addressLocality"],
            "state": address["addressRegion"],
            "zip_code": address["postalCode"],
            "phone": json_data["telephone"],
            "longitude": longitude,
            "latitude": latitude,
        }
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # One page with an unexpected layout should not abort the whole crawl.
        print(f"Unexpected page structure in {link}: {e!r}")
        return None


for link in tqdm(filtered_links):
    try:
        # Send the same browser-like headers as the directory request, and
        # bound the wait so a stalled connection cannot hang the loop.
        link_response = requests.get(link, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"Request failed for {link}: {e}")
    else:
        link_soup = BeautifulSoup(link_response.text, "html.parser")
        theater_dict = _parse_theater_page(link, link_soup)
        if theater_dict is not None:
            theater_details.append(theater_dict)

    # Pause after every request, successful or not, so a throttled response
    # is never followed immediately by the next request.
    sleep(randint(5, 10))

  0%|          | 0/314 [00:00<?, ?it/s]

Error decoding JSON from https://www.cinemark.com/theatres/oh-macedonia/cinemark-macedonia: Extra data: line 1 column 15009 (char 15008)
Problematic JSON: [{"openingHours":null,"screenCount":0,"branchOf":null,"currenciesAccepted":null,"paymentAccepted":null,"priceRange":"NA","additionalProperty":null,"geo":null,"hasMap":null,"openingHoursSpecification":


#### Convert the list of theater dictionaries into a dataframe

In [11]:
# One row per scraped theater; the column names come from the dict keys.
df = pd.DataFrame(theater_details)

#### How many locations? 

In [12]:
# Number of rows, i.e. theaters that were parsed successfully.
df.shape[0]

313

#### The result?

In [13]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,location_name,street,city,state,zip_code,phone,longitude,latitude
0,Cinemark Century Anchorage 16 and XD,301 East 36th Ave,Anchorage,AK,99503,907-770-2602,-149.879106,61.188202
1,Cinemark Bridge Street and XD,370 The Bridge St,Huntsville,AL,35806,256-327-8340,-86.673594,34.719585
2,Cinemark Tinseltown USA Benton,17314 I-30,Benton,AR,72019,501-776-2282,-92.636339,34.54269
3,Cinemark Towne Centre and XD,201 Skyline Dr,Conway,AR,72032,501-450-7558,-92.429867,35.112336
4,Cinemark Colonel Glenn and XD,18 Colonel Glenn Plaza Dr,Little Rock,AR,72210,501-687-0499,-92.405577,34.708475


#### Create a mapping of state abbreviations to full state names using the us library

In [14]:
# Abbreviation -> full name lookup built from the entries in us.states.STATES.
state_mapping = {
    entry.abbr: entry.name
    for entry in us.states.STATES
}

#### Add a column of full state names based on the abbreviations

In [15]:
# Look each abbreviation up in state_mapping; a value that is not a key in
# the mapping becomes NaN in the new column.
df["state_name"] = df["state"].map(state_mapping)

#### Make sure our brand name gets in the dataframe

In [16]:
# Scalar assignment: every row gets the same brand label.
df["brand"] = place_formal

#### Add fetch date

In [17]:
# Stamp every row with the date string computed when the notebook started.
df["updated"] = today

---

## Geography

#### Make it a geodataframe

In [18]:
# Work on an independent copy so the geometry column is not added to df.
df_geo = df.copy(deep=True)

In [19]:
# Build point geometries from the longitude/latitude columns and label them
# as WGS 84. The CRS is written as the "EPSG:4326" authority string, a
# documented input form, instead of the bare numeric string "4326".
gdf = gpd.GeoDataFrame(
    df_geo,
    geometry=gpd.points_from_xy(df_geo["longitude"], df_geo["latitude"]),
).set_crs("EPSG:4326")

---

## Maps

#### US states background

In [20]:
# State shapes from the vega_datasets US TopoJSON file, drawn as a light-gray
# base layer with white borders in the "albersUsa" projection.
states_topo = alt.topo_feature(vega_data.us_10m.url, feature="states")
background = alt.Chart(states_topo).mark_geoshape(fill="#e9e9e9", stroke="white")
background = background.properties(
    width=800, height=500, title=f"{place_formal} locations"
).project("albersUsa")

#### Location points map

In [21]:
# One small circle per theater, placed at its own longitude/latitude.
point_encoding = {"longitude": "longitude:Q", "latitude": "latitude:Q"}
points = alt.Chart(gdf).mark_circle(size=5, color=color).encode(**point_encoding)

# Layer the circles over the state shapes and display the result with the
# view stroke disabled.
point_map = background + points
point_map.configure_view(stroke=None)

#### Location proportional symbols map

In [22]:
# Collapse the data to one row per state: the mean latitude and longitude of
# that state's theaters, plus how many theaters it has.
state_totals = alt.Chart(gdf).transform_aggregate(
    latitude="mean(latitude)",
    longitude="mean(longitude)",
    count="count()",
    groupby=["state"],
)

# One circle per state at the averaged position, sized by the theater count.
symbols = (
    state_totals.mark_circle()
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        size=alt.Size("count:Q", title="Count by state"),
        color=alt.value(color),
        tooltip=["state:N", "count:Q"],
    )
    .properties(
        title=f"Number of {place_formal} in US, by average lon/lat of locations"
    )
)

# Layer the circles over the state shapes and display the result with the
# view stroke disabled.
symbol_map = background + symbols
symbol_map.configure_view(stroke=None)

---

## Exports

#### JSON

In [23]:
# Write the table as a JSON array with one object per theater, indented by
# four spaces.
json_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.json"
df.to_json(json_path, orient="records", indent=4)

#### CSV

In [24]:
# Write the table as CSV without the dataframe index column.
csv_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.csv"
df.to_csv(csv_path, index=False)

#### GeoJSON

In [25]:
# Write the geodataframe (attributes plus point geometry) as GeoJSON.
geojson_path = f"data/processed/{place.lower().replace(' ', '_')}_locations.geojson"
gdf.to_file(geojson_path, driver="GeoJSON")