In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The directory walk further down reads its files from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# takeout_metadata.ipynb
# Adapted from ATLAS 4519 (Fall 2023)

# We import the pandas and matplotlib modules
# We can use the keyword "as" to give more convenient names to modules
import pandas as pd
import matplotlib.pyplot as plt
#% matplotlib inline

# Import os and tarfile modules for file management
import os
import tarfile

# Import json module for parsing metadata
import json

from datetime import datetime # https://stackoverflow.com/questions/15707532/import-datetime-v-s-from-datetime-import-datetime
import numpy as np

In [None]:
# Column headers for the metadata table: one entry per field collected from each json file
data_columns = [
    'title',
    'viewCount',
    'takenTime',
    'latitude',
    'longitude',
    'people',
    'origin',
]

# Start from an empty dataframe that carries only those headers
df = pd.DataFrame(columns=data_columns)

# Running total of the non-json files that get removed
# (expected to end up equal to the number of json files)
num_deleted = 0

In [None]:
# Walk the directory, from https://stackoverflow.com/questions/19587118/iterating-through-directories-with-python
#
# For every file under rootdir:
#   * .tgz archives are extracted into an 'uncompressed' sub-folder and then removed
#   * .json files are parsed and contribute one row of metadata to df
#   * .ipynb files are left alone
#   * everything else (the original jpg/png/heic/mp4/mov/etc.) is DELETED
rootdir = '/content/drive/MyDrive/takeout_metadata' # path to the folder that holds your colab and archive files

# Use tarfile's 'data' extraction filter when this Python version provides it;
# it rejects archive members that would be written outside the target folder.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

# Collect one dict per json file and build the dataframe once at the end.
# Appending to the dataframe row by row gets slower with every row and would
# duplicate rows if this cell were run a second time.
records = []

for subdir, dirs, files in os.walk(rootdir):
    # Tip: print(subdir, dirs, files) here to see where you are as you walk through the filepaths
    for file in files:
        filepath = os.path.join(subdir, file) # create a string variable for your filepath

        # Extract archives using the tarfile module
        if filepath.endswith(".tgz"):
            with tarfile.open(filepath) as tar:
                tar.extractall(os.path.join(subdir, 'uncompressed'), **extract_kwargs)
            # Only reached when extraction succeeded, so the archive is never lost unextracted
            os.remove(filepath)
            # os.walk listed this folder's sub-directories before the extraction happened;
            # register the new folder so its contents are visited during this same walk
            if 'uncompressed' not in dirs:
                dirs.append('uncompressed')
            # The archive is gone: skip the remaining checks (they would try to delete it again)
            continue

        # Open all json files using the json module and load into json object
        if filepath.endswith(".json"):
            with open(filepath, encoding='utf-8') as f:
                imgData = json.load(f)

            # Pull each field out of the json object, falling back to a placeholder when it is absent
            has_geo = "geoData" in imgData
            records.append({
                'title': imgData["title"] if "title" in imgData else "none",
                # Placeholder is the string "0" so it converts with astype('int') like the real values
                'viewCount': imgData["imageViews"] if "imageViews" in imgData else "0",
                # Use 'timestamp' instead of 'formatted' if bringing data into mapbox
                'takenTime': imgData['photoTakenTime']['formatted'] if "photoTakenTime" in imgData else "",
                'latitude': imgData['geoData']['latitude'] if has_geo else 0,
                'longitude': imgData['geoData']['longitude'] if has_geo else 0,
                'people': imgData['people'] if "people" in imgData else "none",
                'origin': imgData['googlePhotosOrigin'] if "googlePhotosOrigin" in imgData else "none",
            })
            continue

        # Avoid deleting this file while you are in the same directory
        # Change to just ".py" if this is run through computer terminal
        if filepath.endswith(".ipynb"):
            continue

        # This is the case where the file is the original jpg/png/heic/mp4/mov/etc.
        # BE CAREFUL. IF THE WRONG ROOT DIRECTORY IS USED IT WILL DELETE ALL YOUR FILES.
        # (You can probably still find them in your trash, unless you're running this
        # through the terminal, in which case they are gone for good.)
        os.remove(filepath)
        num_deleted += 1

# Build the dataframe in one go; this replaces the empty frame created earlier
df = pd.DataFrame(records, columns=data_columns)

print(num_deleted)
print(df.shape) # first value should be equal to num_deleted, second value should be 7 (one per entry in data_columns)
print(df.head()) # shows the first 5 lines of the dataframe; most likely a lot of the lat/lon values will be 0

In [None]:
# OPTIONAL
# If location data does not matter to you, write the complete dataframe
# (rows without geodata included) to a csv file
df.to_csv(path_or_buf='photoNoLocation.csv')

In [None]:
# Clean data and export to csv
# Keep only the rows whose latitude is not 0 (0 is the placeholder for "no geodata")
has_latitude = df['latitude'] != 0
filtered_df = df.loc[has_latitude]

# The row count here will be significantly lower than for the unfiltered dataframe
print(filtered_df.shape)

filtered_df.to_csv(path_or_buf='photoLocation.csv')

In [None]:
# TIME DATA PT 1 - YEARS
# Which year were most of your pictures taken?

# Define python list for column headers
time_data_columns = ['month', 'year', 'hour']

# Convert 'takenTime' column to type datetime (skipped when the column was already
# converted, so this cell can safely be run more than once)
if not pd.api.types.is_datetime64_any_dtype(df['takenTime']):
    # NOTE(review): some exports appear to put a narrow no-break space (U+202F) before
    # AM/PM instead of a plain space - normalise it so the format string still matches
    taken_text = df['takenTime'].str.replace('\u202f', ' ', regex=False)
    # utc=True guarantees a timezone-aware datetime column, which the .dt accessor below needs.
    # Empty strings (files without 'photoTakenTime') become NaT.
    df['takenTime'] = pd.to_datetime(taken_text, format='%b %d, %Y, %I:%M:%S %p %Z', utc=True)

# Break every timestamp into month / year / hour in one vectorised step
taken = df['takenTime']
time_df = pd.DataFrame(
    {'month': taken.dt.month, 'year': taken.dt.year, 'hour': taken.dt.hour},
    columns=time_data_columns,
).reset_index(drop=True)

# Create a pandas series from the year/month/hour values
years = time_df['year']
months = time_df['month']
hours = time_df['hour']

# Count how often each year/month/hour occurs and sort chronologically.
# Rows without a timestamp are dropped first so the index stays integer.
countOfYears = years.dropna().astype(int).value_counts().sort_index()
countOfMonths = months.dropna().astype(int).value_counts().sort_index()
countOfHours = hours.dropna().astype(int).value_counts().sort_index()

# Add indexes for missing years if there are any (a missing year means 0 images)
if countOfYears.empty:
    countOfYearsReIndexed = countOfYears
else:
    minIndex = int(countOfYears.index[0])
    maxIndex = int(countOfYears.index[-1])
    newIndex = list(range(minIndex, maxIndex + 1))
    countOfYearsReIndexed = countOfYears.reindex(newIndex, fill_value=0)

# Add indexes for missing months if there are any
newMonths = list(range(1, 13))
countOfMonthsReIndexed = countOfMonths.reindex(newMonths, fill_value=0)

# Add indexes for missing hours if there are any
newHours = list(range(24))
countOfHoursReIndexed = countOfHours.reindex(newHours, fill_value=0)

# Plot the counts of years as a bar chart
if countOfYearsReIndexed.empty:
    print("No photo timestamps found - nothing to plot.")
else:
    countOfYearsReIndexed.plot(
        kind='bar',
        xlabel='Year',
        ylabel='Images',
        figsize=(8,5)
    )

In [None]:
# TIME DATA PT 2 - MONTHS
# Which month were most of your pictures taken?

# Bar chart of the image count for each calendar month
countOfMonthsReIndexed.plot.bar(
    figsize=(6, 4),
    ylabel='Images',
    xlabel='Month',
)

In [None]:
# TIME DATA PT 3 - HOURS
# Which hour were most of your pictures taken?

# Bar chart of the image count for each hour of the day
countOfHoursReIndexed.plot.bar(
    figsize=(8, 5),
    ylabel='Images',
    xlabel='Hour',
)

In [None]:
# GEODATA Y/N
# Do your photos typically have geodata associated with them?

# Split the dataframe in two on the latitude value:
# a latitude other than 0 counts as "has geodata", a latitude of 0 as "no geodata"
latitude_is_set = df['latitude'] != 0
df_geo = df[latitude_is_set]
df_no_geo = df[~latitude_is_set]

# Report how many rows landed in each half
print(f"The number of photos with geodata: {len(df_geo)}")
print(f"The number of photos without geodata: {len(df_no_geo)}")

In [None]:
# VIEW COUNTS
# How often on average do you look back at a photograph once you've stored it in Google Photos? Let's find out.

# Create a pandas Series from the viewCount values
viewCount = df["viewCount"]

# The counts are stored as text, so convert them to integers once up front
viewCountInt = viewCount.astype('int')

if viewCountInt.empty:
    print("No photos loaded - nothing to summarise.")
else:
    print("The average number of times you view an image: " + str(viewCountInt.mean()))

    # Count how often each view count occurs, sorted by view count
    countOfCounts = viewCountInt.value_counts().sort_index()

    maxIndex = int(countOfCounts.index[-1])
    print("The most times you've viewed one image: " + str(maxIndex))

    # Add indexes for missing counts if there are any; a view count nobody has means 0 photos
    newIndex = list(range(maxIndex + 1))
    countOfCountsReIndexed = countOfCounts.reindex(newIndex, fill_value=0)

    # .get() covers the case where the largest view count is 0 and index 1 does not exist
    print("The number of photos with only one view: " + str(countOfCountsReIndexed.get(1, 0)))
    print("The number of photos with no views: " + str(countOfCountsReIndexed[0]))

    # Space roughly eight x-axis ticks across the actual range of view counts
    # (a maximum of 154 gives 0, 22, 44, ..., 154)
    tickStep = max(1, maxIndex // 7)

    # Plot the counts as a bar chart
    countOfCountsReIndexed.plot(
        kind='bar',
        xlabel='View Counts',
        ylabel='Images',
        xticks=list(range(0, maxIndex + 1, tickStep)),
        figsize=(12,6))
# We could even do some distribution analysis on this... but I think the graph already
# gives a pretty clear picture of image viewing habits.

In [None]:
# Print every row whose stored view count is exactly the string '124'
rows_viewed_124 = df[df['viewCount'] == '124']
for row_label, row in rows_viewed_124.iterrows():
  print(row)

In [None]:
# TIME x GEODATA
# Which years were more likely to contain geodata?
#
# Expects df_geo / df_no_geo to exist and their 'takenTime' column to already be of type datetime.

def _time_parts(taken):
    """Return a month/year/hour dataframe with one row per timestamp in `taken`.

    `taken` must be a datetime Series. Missing timestamps (NaT) give NaN in every column.
    """
    parts = pd.DataFrame(
        {'month': taken.dt.month, 'year': taken.dt.year, 'hour': taken.dt.hour},
        columns=time_data_columns,
    )
    return parts.reset_index(drop=True)


def _count_per_year(parts):
    """Return how many rows of `parts` fall in each year, sorted chronologically.

    Rows without a year are ignored so the resulting index stays integer.
    """
    return parts['year'].dropna().astype(int).value_counts().sort_index()


# Images that include geodata
time_df_geo = _time_parts(df_geo['takenTime'])
geo_years = time_df_geo['year']
countOfGeoYears = _count_per_year(time_df_geo)

# Images that do not include geodata
time_df_no_geo = _time_parts(df_no_geo['takenTime'])
no_geo_years = time_df_no_geo['year']
countOfNoGeoYears = _count_per_year(time_df_no_geo)

# Ensure that geodata years and no geodata years counts have aligned/equal index values:
# span from the earliest year to the latest year seen in EITHER series, so that no year
# is cut off from the comparison
allYears = countOfGeoYears.index.union(countOfNoGeoYears.index)
if len(allYears) > 0:
    minIndex = int(allYears.min())
    maxIndex = int(allYears.max())
    newIndex = list(range(minIndex, maxIndex + 1))
else:
    # Neither series has any dated images
    newIndex = []

# A year that is missing from one of the series means 0 images, not "unknown"
countOfGeoYearsReIndexed = countOfGeoYears.reindex(newIndex, fill_value=0)
countOfNoGeoYearsReIndexed = countOfNoGeoYears.reindex(newIndex, fill_value=0)

# Compare dataframes
countOfGeoYearsReIndexed.compare(countOfNoGeoYearsReIndexed, result_names=("Images with geodata", "Images with no geodata"))

In [None]:
# https://www.datacamp.com/tutorial/matplotlib-time-series-line-plot