# Get your check in data from Swarm

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import requests
import json
import sqlite3
from datetime import datetime
import keys



In [3]:
import altair as alt
import altair_latimes as lat

alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

In [4]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

---

In [5]:
today = datetime.today().strftime("%Y-%m-%d")

### Get the data using this tool from [Swarm-to-SQLite](https://github.com/dogsheep/swarm-to-sqlite)

In [6]:
### Pass in your oauth_token from what you store in keys.py

In [7]:
!swarm-to-sqlite data/raw/checkins.db --token={keys.TOKEN}

[?25lImporting 5481 checkins  [###################################-]   98%  00:00:01[?25h
Traceback (most recent call last):
  File "/Users/mhustiles/.pyenv/versions/3.8.11/bin/swarm-to-sqlite", line 8, in <module>
    sys.exit(cli())
  File "/Users/mhustiles/.pyenv/versions/3.8.11/lib/python3.8/site-packages/click/core.py", line 1137, in __call__
    return self.main(*args, **kwargs)
  File "/Users/mhustiles/.pyenv/versions/3.8.11/lib/python3.8/site-packages/click/core.py", line 1062, in main
    rv = self.invoke(ctx)
  File "/Users/mhustiles/.pyenv/versions/3.8.11/lib/python3.8/site-packages/click/core.py", line 1404, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/Users/mhustiles/.pyenv/versions/3.8.11/lib/python3.8/site-packages/click/core.py", line 763, in invoke
    return __callback(*args, **kwargs)
  File "/Users/mhustiles/.pyenv/versions/3.8.11/lib/python3.8/site-packages/swarm_to_sqlite/cli.py", line 77, in cli
    ensure_foreign_keys(db)
  File "/User

### Read the sqlite database we downloaded

In [8]:
import sqlite3

# Create a SQL connection to our SQLite database
cnx = sqlite3.connect("data/raw/checkins.db")

### Create dataframes for various tables in the DB

In [9]:
venues = pd.read_sql_query("SELECT * FROM venues", cnx)
checkins = pd.read_sql_query("SELECT * FROM checkins", cnx)
categories = pd.read_sql_query("SELECT * FROM categories", cnx)
categories_venues = pd.read_sql_query("SELECT * FROM categories_venues", cnx)

### Merge categories and venues so you can attach categories to the venues in your checkins

In [10]:
category_lookup = pd.merge(
    categories_venues, categories, left_on="categories_id", right_on="id"
)

In [11]:
venue_lookup = pd.merge(categories_venues, venues, left_on="venues_id", right_on="id")

### Merge checkins and venues

In [12]:
src = pd.merge(checkins, venue_lookup, left_on="venue", right_on="id")

In [13]:
df = pd.merge(src, category_lookup, on="venues_id")

### We don't need all the fields

In [14]:
df_slim = df[
    [
        "id_x",
        "timeZoneOffset",
        "isMayor",
        "created",
        "name_x",
        "venue",
        "address",
        "crossStreet",
        "postalCode",
        "city",
        "state",
        "country",
        "latitude",
        "longitude",
        "pluralName",
    ]
].copy()

In [15]:
df_slim.sort_values("created", ascending=False).head()

Unnamed: 0,id_x,timeZoneOffset,isMayor,created,name_x,venue,address,crossStreet,postalCode,city,state,country,latitude,longitude,pluralName
1,6112b21a60843366dbead0c2,-420,1,2021-08-10T17:06:34,Los Angeles Times,5b4bd8aacad1b60036b3c14f,2300 E Imperial Hwy,Douglas Street,90245,El Segundo,CA,United States,33.930487,-118.382373,Offices
110,61128b1054fee338b7dbcf2a,-420,0,2021-08-10T14:20:00,Los Angeles International Airport (LAX),439ec330f964a520102c1fe3,1 World Way,,90045,Los Angeles,CA,United States,33.94368,-118.406074,Airports
123,6111ce706ed68d55f4e98345,-420,0,2021-08-10T00:55:12,Sizzler,4a837c4ef964a5204bfb1fe3,5801 Sepulveda Blvd,,90230,Culver City,CA,United States,33.987015,-118.39588,Steakhouses
128,6111afcbf99ad353aae487ac,-420,0,2021-08-09T22:44:27,CVS pharmacy,4b022b0df964a5200e4822e3,13171 Mindanao Way,,90292,Marina del Rey,CA,United States,33.984482,-118.438882,Pharmacies
2,6111323234a951083295d361,-420,1,2021-08-09T13:48:34,Los Angeles Times,5b4bd8aacad1b60036b3c14f,2300 E Imperial Hwy,Douglas Street,90245,El Segundo,CA,United States,33.930487,-118.382373,Offices


### Clean up the dates

In [16]:
df_slim["date"] = pd.to_datetime(df_slim["created"], format="%Y-%m-%d")

In [17]:
df_slim["date"] = pd.to_datetime(df_slim["date"].dt.strftime("%Y-%m-%d"))

### Get segments of dates in case we need them for grouping

In [18]:
df_slim["year"] = df_slim["date"].dt.year.astype(str)
df_slim["quarter"] = df_slim["date"].dt.quarter.astype(str)
df_slim["day"] = df_slim["date"].dt.day.astype(str)
df_slim["month"] = df_slim["date"].dt.month.astype(str)
df_slim["weekday"] = df_slim["date"].dt.day_name()
df_slim["monthname"] = df_slim["date"].dt.month_name()
df_slim["monthday"] = df_slim["date"].dt.strftime("%m-%d")
df_slim["monthyear"] = df_slim["date"].dt.strftime("%m-%Y")

### Clean up columns

In [19]:
df_slim.columns = df_slim.columns.str.lower()

In [20]:
df_slim.rename(columns={"id_x": "checkinid", "name_x": "place"}, inplace=True)

---

### Change over time

In [21]:
places_time_grouped = (
    df_slim.groupby(["year"]).agg({"checkinid": "count"}).reset_index()
)

In [22]:
places_time_grouped.rename(columns={"checkinid": "count"}, inplace=True)

In [23]:
places_time_grouped.head(20)

Unnamed: 0,year,count
0,2010,292
1,2011,538
2,2012,1131
3,2013,683
4,2014,293
5,2015,239
6,2016,419
7,2017,411
8,2018,407
9,2019,624


In [24]:
alt.Chart(places_time_grouped).mark_bar().encode(x="year", y="count").properties(
    width=600, height=400
)

---

### Venue types

In [25]:
df_slim.sort_values("created", ascending=False).head()

Unnamed: 0,checkinid,timezoneoffset,ismayor,created,place,venue,address,crossstreet,postalcode,city,state,country,latitude,longitude,pluralname,date,year,quarter,day,month,weekday,monthname,monthday,monthyear
1,6112b21a60843366dbead0c2,-420,1,2021-08-10T17:06:34,Los Angeles Times,5b4bd8aacad1b60036b3c14f,2300 E Imperial Hwy,Douglas Street,90245,El Segundo,CA,United States,33.930487,-118.382373,Offices,2021-08-10,2021,3,10,8,Tuesday,August,08-10,08-2021
110,61128b1054fee338b7dbcf2a,-420,0,2021-08-10T14:20:00,Los Angeles International Airport (LAX),439ec330f964a520102c1fe3,1 World Way,,90045,Los Angeles,CA,United States,33.94368,-118.406074,Airports,2021-08-10,2021,3,10,8,Tuesday,August,08-10,08-2021
123,6111ce706ed68d55f4e98345,-420,0,2021-08-10T00:55:12,Sizzler,4a837c4ef964a5204bfb1fe3,5801 Sepulveda Blvd,,90230,Culver City,CA,United States,33.987015,-118.39588,Steakhouses,2021-08-10,2021,3,10,8,Tuesday,August,08-10,08-2021
128,6111afcbf99ad353aae487ac,-420,0,2021-08-09T22:44:27,CVS pharmacy,4b022b0df964a5200e4822e3,13171 Mindanao Way,,90292,Marina del Rey,CA,United States,33.984482,-118.438882,Pharmacies,2021-08-09,2021,3,9,8,Monday,August,08-09,08-2021
2,6111323234a951083295d361,-420,1,2021-08-09T13:48:34,Los Angeles Times,5b4bd8aacad1b60036b3c14f,2300 E Imperial Hwy,Douglas Street,90245,El Segundo,CA,United States,33.930487,-118.382373,Offices,2021-08-09,2021,3,9,8,Monday,August,08-09,08-2021


In [26]:
venues_grouped = (
    df_slim.groupby(["pluralname"])
    .agg({"checkinid": "count"})
    .reset_index()
    .sort_values("checkinid", ascending=False)
)

In [27]:
venues_grouped.rename(columns={"checkinid": "count"}, inplace=True)

In [28]:
venues_grouped.head()

Unnamed: 0,pluralname,count
216,Offices,603
73,Coffee Shops,430
5,Airports,331
194,Metro Stations,232
113,Event Spaces,210


---

### Countries

In [29]:
countries_grouped = (
    df_slim.groupby(["country"])
    .agg({"checkinid": "count"})
    .reset_index()
    .sort_values("checkinid", ascending=False)
)

In [30]:
countries_grouped.rename(columns={"checkinid": "count"}, inplace=True)

In [31]:
countries_grouped.head()

Unnamed: 0,country,count
26,United States,4001
21,South Korea,1053
23,Taiwan,55
14,Japan,29
11,Ireland,29


---

### Make it a geodataframe

In [32]:
gdf = gpd.GeoDataFrame(
    df_slim, geometry=gpd.points_from_xy(df_slim.longitude, df_slim.latitude)
)

In [33]:
gdf.plot()

### Export

In [36]:
gdf.to_file("data/processed/checkins_geo.geojson", driver="GeoJSON")