In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
import h3

import time
import datetime

**1 -- home detection based on h3 hexagons**

In [2]:
# parameters
h3_resolution = 10
minimum_checkins = 10

In [3]:
# check-in data
cdf = pd.read_csv("../data/Gowalla_totalCheckins.txt", sep="\t", header=None)
cdf.columns = ["user", "checkin_time", "lat", "lon", "location_id"]

In [4]:
# create an hour column
cdf["checkin_time"] = pd.to_datetime(cdf["checkin_time"])
cdf["hour"] = cdf["checkin_time"].dt.hour

In [5]:
def coords_to_point(lat, lon):
    """create Point geometry from lat, lon columns"""
    geometry = [Point(xy) for xy in zip(lon, lat)]
    return geometry

def point_geometry_to_h3(geodf, geometry_col, h3_resolution):
    """h3 from Point geometry"""
    h3_col = geodf.apply(lambda r: h3.geo_to_h3(r[geometry_col].y, r[geometry_col].x, h3_resolution), axis=1)
    return h3_col


In [6]:
# filter cdf by input parameters
cdf["nr_locations"] = cdf.groupby(["user"])["location_id"].transform("nunique")
lcdf = cdf[cdf["nr_locations"] >= minimum_checkins]

In [7]:
# construct h3 codes
start_time = time.time()
lcdf["point_geom"] = coords_to_point(lcdf["lat"], lcdf["lon"])
lcdf["h3"] = point_geometry_to_h3(lcdf, geometry_col="point_geom", h3_resolution=h3_resolution)
print("--- %s seconds ---" % round((time.time() - start_time), 3))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lcdf["point_geom"] = coords_to_point(lcdf["lat"], lcdf["lon"])


--- 83.141 seconds ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lcdf["h3"] = point_geometry_to_h3(lcdf, geometry_col="point_geom", h3_resolution=h3_resolution)


In [8]:
# select the most visited h3 by user
def home_table(df, min_locations_in_h3, min_visits_in_h3):
    """function to select home h3 of users"""
    
    # variables to support selection
    h3_selection = df.groupby(["user", "h3"]).agg(
        locations_in_h3 = pd.NamedAgg("location_id", "nunique"),
        visits_in_h3 = pd.NamedAgg("location_id", "count")
        ).reset_index()

    h3_selection["max_locations_in_h3"] = h3_selection.groupby(["user"])["locations_in_h3"].transform("max")
    h3_selection["max_visits_in_h3"] = h3_selection.groupby(["user"])["visits_in_h3"].transform("max")

    # filter
    h3_selection = h3_selection[
        (h3_selection["visits_in_h3"] == h3_selection["max_visits_in_h3"]) &
        (h3_selection["max_locations_in_h3"] >= min_locations_in_h3) &
        (h3_selection["max_visits_in_h3"] >= min_visits_in_h3)
    ]
    
    return h3_selection

In [9]:
# no home hours focus
home_df = home_table(lcdf, min_locations_in_h3=1, min_visits_in_h3=5)

# export
home_df.to_csv("../outputs/home_location.csv", sep=";", index=False)

In [10]:
# filter for checkins in home hours -- same as for the twitter data
hh_lcdf = lcdf[
    (lcdf["hour"] >= 20) |
    (lcdf["hour"] <= 7)
]

home_df2 = home_table(hh_lcdf, min_locations_in_h3=1, min_visits_in_h3=5)

# export
home_df2.to_csv("../outputs/home_location.csv", sep=";", index=False)

**add home locations to edgelist**

In [None]:
# edgelist
edf = pd.read_csv("../data/Gowalla_edges.txt", sep="\t")
edf.columns = ["user1", "user2"]