In [2]:
# Amazon rules the world
# Explore the data to understand more
# ART350 - Dataviz - A4
# data collection overview: https://nijianmo.github.io/amazon/index.html
# create images for your explanatory data poster from these scripts
#-------------------------------------------------------------------------------
# create a movie from saved images, automate the date variation !
# use this to create a data story - by applying it to plots created to visualize static data
#-------------------------------------------------------------------------------
import os
import json
import gzip
import pandas
from urllib.request import urlopen
from IPython.core.display import display, HTML
# stretch the notebook's .container element to the full width of the page
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [None]:
#get the dataset
!wget http://deepyeti.ucsd.edu/jianmo/amazon/sample/meta_Computers.json.gz

In [4]:
#set up your local datapath
from google.colab import drive
drive.mount('/content/drive')
#change this based on your setup
root = '/content/drive/MyDrive/ART/data_visualization/'
datapath =  root + 'data/movie/'
# the images and the movie are written to datapath, so make sure the folder exists
os.makedirs(datapath, exist_ok=True)

Mounted at /content/drive


In [5]:
### load the meta data
# every line of the gzipped file holds one JSON record (one product)
with gzip.open('meta_Computers.json.gz') as handle:
    data = [json.loads(line.strip()) for line in handle]
# len(data) is the total length of the list, which equals the total number of products

# convert list into pandas dataframe
df = pandas.DataFrame.from_dict(data)

In [None]:
#remove some of the columns
unwanted_columns = ['tech1','tech2','details','similar_item','image','also_buy', 'also_view', 'asin']
# errors='ignore' skips names that are not present, so this cell can be re-run safely
df = df.drop(columns=unwanted_columns, errors='ignore')

# len() counts the remaining columns directly, no manual counter loop needed
print('number of columns: ', len(df.columns))

#list the reduced data column names
list(df.columns)

In [None]:
#remove rows with NaN in a subset of the columns
# a row is dropped as soon as any one of these columns is missing
required_columns = ['date','price','rank','main_cat']
df = df.dropna(axis=0, how='any', subset=required_columns)

#get an overview of the records
print('Rows and columns: ', df.shape)
df.head()

In [8]:
#convert dollars to floats
# regex=False makes both patterns literal text: as a regular expression '$' means
# "end of string" and would not remove the dollar sign, and the default value of
# regex is not the same in all pandas versions
df['price'] = (
    df['price']
    .str.replace(',', '', regex=False)   # drop thousands separators
    .str.replace('$', '', regex=False)   # drop the currency symbol
    .astype(float)
)

In [33]:
# now filter the dates (vary start and end to understand change over time)
# change the dates in regular intervals (say every two years)
# then create the plot, keeping all other aspects (x and y axis constant)
#-------------------------------------------------------------------------------
#format the date column of df itself; the plotting cell filters on df['date']
df['date'] = pandas.to_datetime(df['date'], format='%B %d, %Y')
#create a copy of the dataframe df, called df_t
# (a plain `df_t = df` would only give the same dataframe a second name)
df_t = df.copy()

# what are the date ranges?
print('start of datasets: ', df.date.min())
print('end of datasets:   ', df.date.max())

start of datasets:  2001-08-14 00:00:00
end of datasets:    2019-01-08 00:00:00


In [None]:
import seaborn
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

seaborn.set_style("whitegrid")

#-------------------------------------------------------------------------------
# this part is new
# datetime is needed to parse the start date (it was used here without being imported)
from datetime import datetime
from dateutil.relativedelta import relativedelta
# set the date format, year-month-day
format_string='%Y-%m-%d'
#select start and increment
start = '2000-01-01'

#use 2 year intervals
interval = 2

# 10 images at 2 year intervals will cover the 19 years the data covers.
# create 10 images in a loop, starting with image 1 and limit 11
limit = 11

# settings that are identical for every frame
#set limits for the y axis - what makes sense?
min_payment = 0
max_payment = 250
#format for the y axis labels (dollars, no decimals)
fmt = '${x:,.0f}'

# parse the start once; the loop then moves forward using date objects
start_date = datetime.strptime(start, format_string).date()

#automatically increment the start and end dates and create a graph for each dataset
for frame in range (1, limit):
  end_date = start_date + relativedelta(years=interval)

  #limit the temp data set to values between start and end
  df_t = df[(df['date'] >= pandas.to_datetime(start_date)) & (df['date'] <= pandas.to_datetime(end_date))]

  #create a plot with a specific size
  fig, ax = plt.subplots(1, 1, figsize=(10, 10))

  #update the image name
  outputimage = 'amazon' + str(frame) + '.png'


  # this part of the code does not change --------------------------------------

  #pick the variables, color, (c) transparency (alpha), marker style and size (s)
  ax.scatter(df_t.date, df_t.price,  c='orange', alpha=0.6, marker='o', s = 2*df_t.price)
  # turn off the grid lines of the x axis (the vertical lines)
  ax.xaxis.grid(False)
  #rotate the x date ticks
  plt.xticks(rotation='vertical')
  #format the y axis (a new formatter per figure, formatters keep a reference to their axis)
  tick = mtick.StrMethodFormatter(fmt)
  ax.yaxis.set_major_formatter(tick)
  ax.set_ylim([min_payment, max_payment])

  #set title, name of output image and save the figure
  titlename = 'amazon - computer related purchases: ' + str(start_date) +  ' to ' + str(end_date)
  ax.set_title(titlename)
  fig.savefig(datapath+outputimage, dpi=72, bbox_inches = 'tight')
  print('saved figure ' , titlename)
  # show this frame in the notebook, then release the figure so that
  # open figures do not pile up in memory while the loop runs
  plt.show()
  plt.close(fig)

  # the next part changes--------------------------------------------------------
  # make the end the new start
  start_date = end_date

In [None]:
# now lets make the movie
# check the world of FFMPEG
# https://opensource.com/article/17/6/ffmpeg-convert-media-file-formats
# http://ffmpeg.org/documentation.html

!pip install ffmpeg



In [None]:
# move to the directory where the images are
os.chdir(datapath)
# run the ffmpeg command via bash, save to the output amazondatamovie.mp4
# framerate variable f generates 1/f frames per second;  0.25 results in 1/0.5 = 2s/frame
# amazon%d.png gets all the images called 'amazonX.png' in the working directory

! ffmpeg -framerate 0.5 -i amazon%d.png -c:v libx264 -r 30 -pix_fmt yuv420p amazondatamovie.mp4