In [123]:
import os
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
import json

import googlemaps
from datetime import datetime
import time 

# Load the variables from a local .env file BEFORE any key is read: on a fresh
# kernel os.getenv("GOOGLE_KEY") would otherwise still return None here.
dotenv_loaded = load_dotenv()

# Google Maps client shared by the geocoding and distance cells below
gmaps = googlemaps.Client(key=os.getenv("GOOGLE_KEY"))

# Show every column and row when a dataframe is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Keep load_dotenv()'s return value as the cell output
dotenv_loaded

True

# Data enrichment

This notebook handles the enrichment of the dataset by retrieving additional information from external APIs.

The main steps include:

- **Gender Imputation**: Uses the OpenAI API to infer missing customer genders based on their first names.
- **Customer Address Enrichment**: Uses the Google Maps API to retrieve full address information for customers based on partial address data.
- **Library Address Enrichment**: Applies the same address enrichment process to libraries using the Google Maps API.
- **Distance Calculation**: Calculates driving distance between each customer and their associated library using the Google Maps Distance Matrix API.
- **Weather Data Retrieval**: Fetches daily weather data (temperature, precipitation, snow) for the years 2018 and 2019 using the Meteostat API.


## Get gender for missing entries in customer table

This code section requires an OpenAI client and API key (the API calls consume OpenAI credits).

In [None]:
# Read the customer table, indexed by customer id
file_path = 'data/customers.csv'
customers_df = pd.read_csv(file_path, index_col='id')

# Keep only the customers whose gender is not recorded
gender_is_missing = customers_df['gender'].isna()
missing_gender_df = customers_df.loc[gender_is_missing]
missing_gender_df.head()

# OpenAI client used by get_gender; the API key is read from the environment
client = OpenAI(api_key=os.getenv("OPEN_API_KEY"))

# Infer a gender label from a customer's name via the OpenAI chat API
def get_gender(name: str) -> str:
    """Ask the chat model whether ``name`` is typically male or female.

    Sends one single-message chat completion request (model ``gpt-4-turbo``,
    temperature 0) through the module-level ``client``.

    Args:
        name: Customer name to classify; it is inserted into the prompt as-is.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace and lower-cased. The prompt asks for 'male' or 'female',
        but the reply is not validated here.
    """
    question = f"Is the name '{name}' typically male or female? Always answer with 'male' or 'female'. Never ourput anything else but 'male' or 'female'"

    completion = client.chat.completions.create(
        model="gpt-4-turbo",
        temperature=0,
        messages=[{"role": "user", "content": question}],
    )
    answer = completion.choices[0].message.content
    return answer.strip().lower()

# Map customer ids to inferred genders for the entries with a missing gender
gender_map_path = "new_data/gender_map.json"

# Create the output directory before the (paid) API calls start, so the results
# are not lost at the very end because the directory does not exist.
os.makedirs(os.path.dirname(gender_map_path), exist_ok=True)

map_dict = {}

# The index of missing_gender_df is the customer id; only the name is needed
for customer_id, customer_name in missing_gender_df['name'].items():
    map_dict[customer_id] = get_gender(customer_name)
    print(f'{map_dict[customer_id]} - {customer_name}')

# Save results to mapping file
with open(gender_map_path, "w") as file:
    json.dump(map_dict, file, indent=2)

male - Ronald Cedillo
female - Stacy Dangelo
male - William  Barrera 
male - Matthew Majszak
female - Mary Riggs
female - Maria Morris
female - PAMELA THOMAS
male - Donald Duran
female - Angela Dipaolo
male - Kevin Fuentes
female - Elizabeth Lugo
male - James Villacres
male - Joel Riordan
female - Debra Stephens
female - Earnestine Bothe
male - Jeffrey Matusz
male - Robert Weber
female - Elisa Venn
male - John Sandoval
female - Florence Rhoads
female - christina GOMEZ
male - Christopher Bohland
female - Angle Taub
male - Sam Summers
male - Ricky Husky
female - Beatrice Murphy
male - Luke Malmquist
male - Vince Farrell
male - Douglas Johnson
male - Willie Fairchild
male - Gonzalo Cox
female - Edith Johna
female - Joline Carraro
female - Wendy Gonzalez
female - Ellen Osborne
female - Ruth Dickes
female - Kaylene Dunn
female - Kasie Rodriguez
male - Frank Parker
female - Darlene Gridley
male - Jeffrey Cabriales
female - Sandra Franklin
female - Mercedes Deniz
female -  Ernestine Pederson 

## Get location data for customers

Uses Google Cloud's Maps API client to retrieve the full address for each client based on available data.

Requires a valid Google Maps API key.

Note: To use the Google Maps API, you must have a Google Cloud account with billing enabled (the first 10k API calls are free).


In [None]:
file_path = 'data/customers.csv'
customers_df = pd.read_csv(file_path, index_col='id')

# Create the result columns up front so they exist (holding missing values)
# even for customers that cannot be geocoded.
customers_df['full_adress'] = None
customers_df['latitude'] = float('nan')
customers_df['longitude'] = float('nan')

for count, idx in enumerate(customers_df.index):

    street = customers_df.loc[idx, 'street_address']
    city = customers_df.loc[idx, 'city']
    state = customers_df.loc[idx, 'state']

    # Build the google maps query from the address parts that are available.
    # Portland is used when the city is missing since it is the most frequent city.
    query_parts = [street, 'Portland' if pd.isna(city) else city, state]
    adress = ' '.join(part for part in query_parts if not pd.isna(part))

    # Query google maps; an empty list means nothing matched the query
    geo_result = gmaps.geocode(adress)

    if geo_result:
        # Get formatted full adress, latitude and longitude of the first match
        best_match = geo_result[0]
        coordinates = best_match['geometry']['location']
        customers_df.loc[idx, 'full_adress'] = best_match['formatted_address']
        customers_df.loc[idx, 'latitude'] = coordinates['lat']
        customers_df.loc[idx, 'longitude'] = coordinates['lng']
    else:
        # Leave the missing values in place instead of failing on geo_result[0]
        print(f'No geocoding result for customer {idx}: {adress!r}')

    # Sleep in order not to exceed allowed rate limit
    time.sleep(0.05)

    # Print progress
    if count % 100 == 0:
        print(f'Progess {count} / {customers_df.shape[0]}')


Progess 0 / 2000
Progess 100 / 2000
Progess 200 / 2000
Progess 300 / 2000
Progess 400 / 2000
Progess 500 / 2000
Progess 600 / 2000
Progess 700 / 2000
Progess 800 / 2000
Progess 900 / 2000
Progess 1000 / 2000
Progess 1100 / 2000
Progess 1200 / 2000
Progess 1300 / 2000
Progess 1400 / 2000
Progess 1500 / 2000
Progess 1600 / 2000
Progess 1700 / 2000
Progess 1800 / 2000
Progess 1900 / 2000


In [None]:
# Persist only the geocoded columns (full address and coordinates), keyed by the dataframe index
customers_adress_df = customers_df.loc[:, ['full_adress', 'latitude', 'longitude']]
customers_adress_df.to_csv(path_or_buf='new_data/customer_location_map.csv')

## Get location data for libraries

In [None]:
# Get full adress data for each library based on provided adress and library name

file_path = 'data/libraries.csv'
libraries_df = pd.read_csv(file_path, index_col='id')

# Make sure the output directory exists before any API call is made
libraries_geo_path = 'data_preprocessed/libraries_with_geo.csv'
os.makedirs(os.path.dirname(libraries_geo_path), exist_ok=True)

# Create the result columns up front so they exist (holding missing values)
# even for libraries that cannot be geocoded.
libraries_df['full_adress'] = None
libraries_df['latitude'] = float('nan')
libraries_df['longitude'] = float('nan')

for count, idx in enumerate(libraries_df.index):

    # Build the query from library name, street address and city, skipping
    # whichever of them is missing
    query_parts = [
        libraries_df.loc[idx, 'name'],
        libraries_df.loc[idx, 'street_address'],
        libraries_df.loc[idx, 'city'],
    ]
    query = ' '.join(part for part in query_parts if not pd.isna(part))

    # Nothing to look up when every part is missing; otherwise an empty list
    # from the API means nothing matched the query
    geo_result = gmaps.geocode(query) if query else []

    if geo_result:
        best_match = geo_result[0]
        coordinates = best_match['geometry']['location']
        libraries_df.loc[idx, 'full_adress'] = best_match['formatted_address']
        libraries_df.loc[idx, 'latitude'] = coordinates['lat']
        libraries_df.loc[idx, 'longitude'] = coordinates['lng']
    else:
        # Leave the missing values in place instead of failing on geo_result[0]
        print(f'No geocoding result for library {idx}: {query!r}')

    # Sleep in order not to exceed allowed rate limit
    time.sleep(0.05)

    if count % 100 == 0:
        print(f'Progess {count} / {libraries_df.shape[0]}')

# Save library with added data
libraries_df.to_csv(libraries_geo_path)


Progess 0 / 18


## Calculate distances between customers and libraries

Join checkouts, customers, and libraries to obtain both customer and library addresses in a single dataset.

These addresses are used to calculate driving distance between each customer and their library.

Driving distance is chosen as a proxy, as it also reflects general accessibility via walking or public transport.


In [None]:
# Combine checkouts with customer and library location data

file_path = 'data/checkouts.csv'
checkouts_df = pd.read_csv(file_path)

# Customer geo columns get a "cust_" prefix, the library address a "lib_" prefix
customer_location_cols = customers_adress_df.add_prefix('cust_')
library_address_col = libraries_df[['full_adress']].add_prefix('lib_')

# Left joins: every checkout row is kept; the lookup keys are patron_id and library_id
checkouts_df = (
    checkouts_df
    .join(customer_location_cols, on="patron_id", how="left")
    .join(library_address_col, on="library_id", how="left")
)

In [None]:
# Iterate dataframe rows
# For each row, get driving distance and duration by using the google maps
# distance matrix api. Rows without a usable route get NaN instead of aborting the run.

distance_output_path = 'new_data/lib_cust_dist.csv'

# Make sure the output directory exists before the API calls start, so the
# collected results cannot be lost when saving at the end
os.makedirs(os.path.dirname(distance_output_path), exist_ok=True)

distance_dict = {}

# Results per (customer address, library address) pair, so repeated checkouts of
# the same pair do not trigger another API call
route_cache = {}
missing = float('nan')

for idx, row in checkouts_df.iterrows():

    origin = row['cust_full_adress']
    destination = row['lib_full_adress']
    route_key = (origin, destination)

    if pd.isna(origin) or pd.isna(destination):
        # The left joins leave NaN when no address is available: nothing to query
        distance, duration = missing, missing
    elif route_key in route_cache:
        distance, duration = route_cache[route_key]
    else:
        gmap_dist = gmaps.distance_matrix(
            origins=[origin],
            destinations=[destination],
            mode='driving',
            units='metric'
        )
        element = gmap_dist['rows'][0]['elements'][0]

        if element.get('status') == 'OK':
            # The API reports metres and seconds; store kilometres and hours
            distance = element['distance']['value'] / 1000
            duration = element['duration']['value'] / 3600
        else:
            # Elements with another status (e.g. NOT_FOUND, ZERO_RESULTS) carry
            # no 'distance' / 'duration' entries
            distance, duration = missing, missing

        route_cache[route_key] = (distance, duration)

        # Sleep in order not to exceed allowed rate limit
        time.sleep(0.05)

    distance_dict[idx] = {
        'customer_id': row['patron_id'],
        'library_id': row['library_id'],
        'distance': distance,
        'duration': duration
    }

    if idx % 100 == 0:
        print(f'Progess {idx} / {checkouts_df.shape[0]}')

cust_lib_distance_df = pd.DataFrame.from_dict(distance_dict, orient='index')
cust_lib_distance_df.to_csv(distance_output_path)

Progess 0 / 2000
Progess 100 / 2000
Progess 200 / 2000
Progess 300 / 2000
Progess 400 / 2000
Progess 500 / 2000
Progess 600 / 2000
Progess 700 / 2000
Progess 800 / 2000
Progess 900 / 2000
Progess 1000 / 2000
Progess 1100 / 2000
Progess 1200 / 2000
Progess 1300 / 2000
Progess 1400 / 2000
Progess 1500 / 2000
Progess 1600 / 2000
Progess 1700 / 2000
Progess 1800 / 2000
Progess 1900 / 2000


## Get weather data

Weather data for Portland was obtained via the Meteostat library.

In [9]:
from datetime import datetime
from meteostat import Daily, Point

# Date range of the daily weather request
start = datetime(year=2018, month=1, day=1)
end = datetime(year=2020, month=1, day=1)

# Point at Portland, OR, given as (latitude, longitude)
location = Point(45.5152, -122.6784)

# Fetch the daily records for that point and date range
weather_data = Daily(location, start, end).fetch()

# Turn the index into a regular column before writing, so it is kept in the CSV
weather_export = weather_data.reset_index()
weather_export.to_csv('data_preprocessed/weather_data.csv', index=False)

In [14]:
# Frequency of each distinct value in the daily 'snow' column
snow_counts = weather_data['snow'].value_counts()
snow_counts

snow
0.0     724
80.0      3
30.0      2
50.0      2
Name: count, dtype: int64