In [1]:
# Standard python modules
import os, json, time, urllib.parse

# Base path of this project in your local environment.
# Set the DATA512_BASE_PATH environment variable to point at your own checkout;
# when it is not set, the original author's local path below is used unchanged.
BASE_PATH = os.environ.get(
    'DATA512_BASE_PATH',
    '/Users/sravan/PycharmProjects/DATA-512-HCD/data-512-homework_1',
)
# The cells below open their input/output files with relative paths, so the project
# directory is made the current working directory first.
os.chdir(BASE_PATH)
print(f'Current working directory: {os.getcwd()}')
print('List of files in this location')
# Last expression of the cell: the directory listing becomes the cell output.
os.listdir()

Current working directory: /Users/sravan/PycharmProjects/DATA-512-HCD/data-512-homework_1
List of files in this location


['.DS_Store',
 'LICENSE',
 'requirements.txt',
 'intermediate_files',
 '~$dinosaur_genera.cleaned.SEPT.2022.xlsx',
 'JSON_data',
 'README.md',
 'HW1_dino_page_visits_analysis.ipynb',
 '.gitignore',
 'figures',
 '.ipynb_checkpoints',
 'DATA 512 Homework 1 Tharun.ipynb',
 '.git',
 'dinosaur_genera.cleaned.SEPT.2022.xlsx']

In [2]:
# Installing python packages for environment setup requirements
!python -m pip install -r requirements.txt



In [3]:
# The 'requests' module to perform api requests in python
import requests

# The 'pandas' module to perform data aggregations and transformations
import pandas as pd

# The 'matplotlib.pyplot' module to draw and save the line plots
from matplotlib import pyplot as plt 

In [4]:
#    CONSTANTS USED TO BUILD WIKIMEDIA PAGEVIEWS API REQUESTS

# Common prefix (endpoint) shared by every 'pageviews' REST API request
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# Path suffix for a 'per-article' request. It is a format string: every {placeholder}
# is filled from a parameter dictionary right before the request is made.
API_REQUEST_PER_ARTICLE_PARAMS = (
    'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'
)

# The Pageviews API asks clients to stay below 100 requests per second, so each request
# is preceded by a short pause: the per-request budget minus the assumed latency.
API_LATENCY_ASSUMED = 0.002       # roughly 2ms assumed for API + network latency
API_THROTTLE_WAIT = 1.0 / 100.0 - API_LATENCY_ASSUMED

# Wikimedia asks for a "unique ID" with every request so that they can contact the
# caller if something goes wrong (for example when request limits are exceeded).
REQUEST_HEADERS = {'User-Agent': '<sravankr@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022'}

# Values substituted into API_REQUEST_PER_ARTICLE_PARAMS. There is one key per
# placeholder of that format string. Only 'access' and 'article' are filled in per
# request; the remaining values stay constant (they could be changed if necessary).
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project": "en.wikipedia.org",
    "access": "",                 # filled in with the access type of each request
    "agent": "user",
    "article": "",                # filled in with the encoded title of each request
    "granularity": "monthly",
    "start": "2015070100",
    "end": "2022093000",          # NOTE(review): flagged by the author as possibly the wrong end date - confirm
}

In [5]:
# Load the spreadsheet of dinosaur articles; its 'name' column holds the article titles
article_df = pd.read_excel(io='dinosaur_genera.cleaned.SEPT.2022.xlsx')

# Titles to analyse, kept as a plain Python list for the request loop
ARTICLE_TITLES = article_df['name'].tolist()

# Keep a CSV copy of the title column among the intermediate files
article_df['name'].to_csv(path_or_buf='intermediate_files/articles_list.csv')

print(f'No. of wikipedia articles considered for analysis: {len(ARTICLE_TITLES)}')

No. of wikipedia articles considered for analysis: 1423


In [6]:
# Access-type values used for the 'access' part of a per-article request.
# NOTE(review): the cells visible here never read this list; fetch_data spells the
# same four values out literally.
ACCESS_LIST = [
    'mobile-app',
    'mobile-web',
    'desktop',
    'all-access',
]

In [7]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageviews_per_article(article_title = None,
                                  access_type = None,
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT,
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS,
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):
    """Request the pageviews of one article for one access type.

    Parameters
    ----------
    article_title : str
        Article title as written (spaces allowed); it is encoded here.
    access_type : str
        Value substituted for the ``{access}`` placeholder (e.g. ``'desktop'``).
    endpoint_url : str
        Common prefix of the request URL.
    endpoint_params : str
        Format string appended to ``endpoint_url``; filled from ``request_template``.
    request_template : dict
        Values for the placeholders of ``endpoint_params``. It is copied, never
        modified, so the shared module-level template keeps its original values.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    The decoded JSON body of the response, or ``None`` when the title or access
    type is missing, or when the request or the JSON decoding raises (the
    exception is printed).
    """
    # Both the article title and the access type are required
    if not article_title or not access_type:
        return None

    # Titles are supposed to have spaces replaced with "_" and be URL encoded.
    # safe='' also escapes "/", which quote() leaves untouched by default and which
    # would otherwise be read as an extra path segment of the request URL.
    article_title_encoded = urllib.parse.quote(article_title.replace(' ', '_'), safe='')

    # Work on a copy so the caller's (by default: the shared module-level) template
    # is not overwritten by this request's values.
    request_params = dict(request_template)
    request_params['article'] = article_title_encoded
    request_params['access'] = access_type

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url + endpoint_params.format(**request_params)
    print(request_url)

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        # Best effort: report the problem and signal failure with None
        print(e)
        json_response = None
    return json_response

In [None]:
def _strip_access(items):
    """Return copies of the given records without their 'access' field."""
    return [{key: value for key, value in item.items() if key != 'access'}
            for item in items]


def _merge_mobile_views(app_items, web_items):
    """Sum mobile-app and mobile-web 'views' per 'timestamp' into single records."""
    merged = {}
    for item in list(app_items) + list(web_items):
        timestamp = item['timestamp']
        if timestamp in merged:
            merged[timestamp]['views'] += item['views']
        else:
            # First record seen for this timestamp: keep a copy without 'access'
            merged[timestamp] = {key: value for key, value in item.items() if key != 'access'}
    # The timestamps requested here are fixed-width 'YYYYMMDDHH' strings (see the
    # 'start'/'end' template values), so sorting them as strings is chronological.
    return [merged[timestamp] for timestamp in sorted(merged)]


def fetch_data(output_dir = 'JSON_data'):
    """Download the pageviews of every article in ARTICLE_TITLES and save them as JSON.

    For each article four requests are made (mobile-app, mobile-web, desktop,
    all-access). The two mobile series are added together per timestamp, the
    'access' field is removed from every record, and three files are written:
    mobile, desktop and cumulative (all-access).

    If any of an article's four requests fails, or its response has no 'items',
    the error is printed and that article is left out of all three files.

    Parameters
    ----------
    output_dir : str
        Directory the JSON files are written to; created when missing. The default
        is 'JSON_data', the directory the loading cell of this notebook reads from.
    """
    mobile_response_list = []
    desktop_response_list = []
    cumu_response_list = []
    for article in ARTICLE_TITLES:
        try:
            mob_app_views = request_pageviews_per_article(article, 'mobile-app')['items']
            mob_web_views = request_pageviews_per_article(article, 'mobile-web')['items']
            desk_views = request_pageviews_per_article(article, 'desktop')['items']
            cumu_views = request_pageviews_per_article(article, 'all-access')['items']

            # Records are matched by timestamp rather than by list position, so a
            # month missing from one access type cannot shift the others.
            mob_views = _merge_mobile_views(mob_app_views, mob_web_views)
            desk_views = _strip_access(desk_views)
            cumu_views = _strip_access(cumu_views)
        except Exception as e:
            print(e)
            print(f'Data not found for: {article}')
        else:
            # Only extend the result lists once the whole article was processed
            mobile_response_list += mob_views
            desktop_response_list += desk_views
            cumu_response_list += cumu_views

    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, 'dino_monthly_mobile_201507-202209.json'), 'w') as f:
        json.dump(mobile_response_list, f)

    with open(os.path.join(output_dir, 'dino_monthly_desktop_201507-202209.json'), 'w') as f:
        json.dump(desktop_response_list, f)

    with open(os.path.join(output_dir, 'dino_monthly_cumulative_201507-202209.json'), 'w') as f:
        json.dump(cumu_response_list, f)
fetch_data()

https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/%22Coelosaurus%22_antiquus/monthly/2015070100/2022093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-web/user/%22Coelosaurus%22_antiquus/monthly/2015070100/2022093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/desktop/user/%22Coelosaurus%22_antiquus/monthly/2015070100/2022093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/user/%22Coelosaurus%22_antiquus/monthly/2015070100/2022093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/Aachenosaurus/monthly/2015070100/2022093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-web/user/Aachenosaurus/monthly/2015070100/2022093000
https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/desktop/user/Aachenosaur

In [None]:
def read_json_to_df(file):
    """Load a JSON file of pageview records into a DataFrame.

    The 'timestamp' column is read as text (automatic date conversion is switched
    off) and then parsed with the explicit format 'YYYYMMDDHH'.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame whose 'timestamp' column holds datetime values.
    """
    df = pd.read_json(file,
                      convert_dates=False,
                      # a dtype *instance*; the bare class only works where pandas
                      # instantiates it implicitly
                      dtype={'timestamp': pd.StringDtype()})
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H')
    return df

# Load the three saved pageview series (mobile, desktop, cumulative) in that order
mobile_df, desktop_df, cumu_df = map(read_json_to_df, (
    'JSON_data/dino_monthly_mobile_201507-202209.json',
    'JSON_data/dino_monthly_desktop_201507-202209.json',
    'JSON_data/dino_monthly_cumulative_201507-202209.json',
))

# Keep a CSV copy of each frame among the intermediate files
for _frame, _csv_path in (
    (mobile_df, 'intermediate_files/dino_monthly_mobile_201507-202209.csv'),
    (desktop_df, 'intermediate_files/dino_monthly_desktop_201507-202209.csv'),
    (cumu_df, 'intermediate_files/dino_monthly_cumulative_201507-202209.csv'),
):
    _frame.to_csv(_csv_path)

# Cell output: the column dtypes of the mobile frame
mobile_df.dtypes

In [None]:
def get_max_min_avg_articles(df):
    """Select the rows of the articles with the highest and lowest average views.

    Parameters
    ----------
    df : pandas.DataFrame with at least the columns 'article' and 'views'.

    Returns
    -------
    tuple of two DataFrames: all rows of ``df`` for the article with the highest
    mean 'views', and all rows for the article with the lowest mean 'views'.

    Raises
    ------
    ValueError
        Raised by ``idxmax``/``idxmin`` when ``df`` has no rows.
    """
    # Average only the 'views' column; the result is indexed by article title,
    # so idxmax/idxmin directly give the titles.
    avg_views = df.groupby('article')['views'].mean()
    max_view_article = avg_views.idxmax()
    min_view_article = avg_views.idxmin()

    return df[df['article'] == max_view_article], df[df['article'] == min_view_article]

# Rows of the highest- and lowest-average article, first for mobile, then for desktop
(mob_max_df, mob_min_df), (desk_max_df, desk_min_df) = (
    get_max_min_avg_articles(frame) for frame in (mobile_df, desktop_df)
)

def plot_line(df, part_label):
    """Plot 'views' against 'timestamp' as one line via pyplot.

    The legend label is ``part_label`` followed by the 'article' value of the
    first row of ``df``.
    """
    first_article = df['article'].iloc[0]
    plt.plot(
        df['timestamp'],
        df['views'],
        label=part_label + first_article,
    )

# One figure with four lines: highest/lowest average article for mobile and desktop.
plt.figure(figsize=(12, 6), dpi=120)
# plot_line is the helper defined in this notebook; the previously called
# 'plot_min_max' is not defined anywhere and raised a NameError.
plot_line(mob_max_df, 'mobile-max: ')
plot_line(mob_min_df, 'mobile-min: ')
plot_line(desk_max_df, 'desktop-max: ')
plot_line(desk_min_df, 'desktop-min: ')
plt.xlabel('Timestamp')
plt.ylabel('View Count')
plt.title('Article views for minimum and maximum average articles by mobile and desktop')
# loc=(1, 0) anchors the legend at the right edge of the axes
plt.legend(prop={'size': 10}, loc=(1, 0))
plt.savefig('figures/min_max_viewed_articles.png')

In [None]:
def get_top_10(df):
    """Pick the ten articles with the largest single 'views' value.

    Parameters
    ----------
    df : pandas.DataFrame with at least the columns 'article' and 'views'.

    Returns
    -------
    tuple ``(articles, rows)``: a Series with up to ten article titles ordered
    from the highest peak downwards, and all rows of ``df`` belonging to them.
    """
    # Peak per article, computed on the 'views' column only
    peak_views = df.groupby(['article'], as_index=False)['views'].max()
    df_grpd = peak_views.sort_values(by=['views'], ascending=False).iloc[:10]
    return df_grpd['article'], df[df['article'].isin(df_grpd['article'].to_list())]

# Ten articles with the highest peak views, separately for mobile and desktop
mobile_top10, mobile_top10_df = get_top_10(df=mobile_df)
desktop_top10, desktop_top10_df = get_top_10(df=desktop_df)

plt.figure(dpi=200, figsize=(12, 6))

# One line per article; its 1-based position in the ranking is part of the label
for idx, article in enumerate(mobile_top10):
    plot_line(
        mobile_top10_df.loc[mobile_top10_df['article'].eq(article)],
        f'Mobile {idx+1} - ',
    )

for idx, article in enumerate(desktop_top10):
    plot_line(
        desktop_top10_df.loc[desktop_top10_df['article'].eq(article)],
        f'Desktop {idx+1} - ',
    )

plt.title('Top 10 Article by mobile and desktop')
plt.xlabel('Timestamp')
plt.ylabel('View Count')
# loc=(1, 0) anchors the legend at the right edge of the axes
plt.legend(loc=(1, 0), prop={'size': 8})
plt.savefig('figures/top_10_most_viewed_articles.png')

In [None]:
def articles_by_least_view_months(df):
    """Pick the ten articles that have the fewest 'views' records.

    Returns a tuple ``(articles, rows)``: a Series with up to ten article titles
    ordered by ascending record count, and all rows of ``df`` belonging to them.
    """
    # Number of non-null 'views' entries per article
    records_per_article = df.groupby(['article'], as_index=False)['views'].count()
    fewest = (
        records_per_article
        .sort_values(by=['views'])
        .head(10)
        .rename(columns={'views': 'count'})
    )
    chosen = fewest['article']
    keep_row = df['article'].isin(chosen.to_list())
    return chosen, df[keep_row]

# Ten articles with the fewest records, separately for mobile and desktop
mobile_least, mobile_least_df = articles_by_least_view_months(df=mobile_df)
desktop_least, desktop_least_df = articles_by_least_view_months(df=desktop_df)

plt.figure(dpi=200, figsize=(12, 6))

# One line per article; its 1-based position in the ranking is part of the label
for idx, article in enumerate(mobile_least):
    plot_line(
        mobile_least_df.loc[mobile_least_df['article'].eq(article)],
        f'Mobile {idx+1} - ',
    )

for idx, article in enumerate(desktop_least):
    plot_line(
        desktop_least_df.loc[desktop_least_df['article'].eq(article)],
        f'Desktop {idx+1} - ',
    )

plt.title('Top 10 Articles with least view months by mobile and desktop')
plt.xlabel('Timestamp')
plt.ylabel('View Count')
# loc=(1, 0) anchors the legend at the right edge of the axes
plt.legend(loc=(1, 0), prop={'size': 8})
plt.savefig('figures/least_view_month_articles.png')