# Data Gathering for archive

#### Fetching data and creating folder structure where data is stored

In [None]:
# Import statements
import json, glob, os, re, requests, time
from datetime import date

In [None]:
# NYTimes developer console API key, read from the `nyt_archive_key`
# environment variable (None when the variable is not set).
nyt_archive_key = os.getenv("nyt_archive_key")
# Only report whether a key was found: printing the key itself would store the
# secret in the saved notebook output.
print("Using NYTimes API key: {}".format("<set>" if nyt_archive_key else "<missing>"))

# nyt_archive_key = os.getenv("nyt_key_2")
# print(nyt_archive_key)

In [None]:
# List of APIs to fetch data from (currently only the 'archive' API)
apis = ['archive']
# Collects the data folder path of each API; starts empty and is appended to later
data_folders_directory = []

In [None]:
# Function definitions:
## Function to create the folder structure that stores the JSON data of one API
def create_directory_for_data(api, verbose=False):
    """Create the data folder for `api` (if needed) and return its path.

    Args:
        api: Name of the API; used as the last path component.
        verbose: Forwarded to create_subfolders_for_data().

    Returns:
        The path <base>/data/collection/<api> as built by
        create_subfolders_for_data().
    """
    # dirname() is applied to the literal string '__file__', which has no
    # directory part, so the base is '' and the resulting path is relative.
    base_dir = os.path.dirname('__file__')
    return create_subfolders_for_data(base_dir, 'data', 'collection', api, verbose)

## Function to create subfolder as per the path specified and API names
def create_subfolders_for_data(data_folder, data, question, api, verbose=False):
    """Ensure the directory data_folder/data/question/api exists.

    Args:
        data_folder: Base directory ('' gives a relative path).
        data: First sub-folder name.
        question: Second sub-folder name.
        api: Last sub-folder name.
        verbose: When True, print a message if the directory is about to be
            created.

    Returns:
        The joined directory path.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    directory = os.path.join(data_folder, data, question, api)
    if verbose and not os.path.isdir(directory):
        print("create_subfolders_for_data() - creating directory: {}".format(directory))

    # exist_ok=True makes creation idempotent and removes the window between
    # "does it exist?" and "create it" in which another process could create it.
    os.makedirs(directory, exist_ok=True)

    return directory
    
# Function to write data to a JSON file at the given location
def write_to_json_file(file_path, json_data, verbose=False):
    """Serialize `json_data` to `file_path`, replacing any existing content.

    Args:
        file_path: Destination file; opened in 'w' mode, so it is overwritten.
        json_data: JSON-serializable object (its len() is printed when verbose).
        verbose: When True, print the target path and the data size first.
    """
    if verbose:
        print("write_to_json_file() - deleting file and writing JSON data: {}".format(file_path))
        entry_count = len(json_data)
        print("write_to_json_file() - JSON data size: {}".format(entry_count))

    with open(file_path, 'w') as out_handle:
        json.dump(json_data, out_handle, indent=2)

## Builds the list of data folder paths, one entry per API listed in `apis`
for api in apis:
    # Creates the folder for this API if it is missing and returns its path
    data_folder_dir = create_directory_for_data(api, verbose=True)
    data_folders_directory.append(data_folder_dir)
    
# Show the collected folder paths
print(data_folders_directory)

In [None]:
# Function to get the most recent months (6 by default) as 'year,month' strings.
def get_year_and_month_range_for_archives(verbose=False, months=6, today=None):
    """Return the most recent `months` calendar months, newest first.

    The current month is included. Each entry is formatted as 'YYYY,M'
    (month without zero padding), e.g. '2024,3'.

    Args:
        verbose: When True, print the list before returning it.
        months: Number of months to return (default 6). Values <= 0 give [].
        today: Reference date; defaults to date.today(). Mainly useful for
            reproducible runs and tests.

    Returns:
        List of 'YYYY,M' strings ordered from the newest month to the oldest.
    """
    if today is None:
        # Read the clock once so every entry is derived from the same date
        today = date.today()

    # Put months on one linear axis (year * 12 + zero-based month) so that
    # stepping back across a year boundary is plain integer arithmetic.
    newest = today.year * 12 + (today.month - 1)

    range_of_years = []
    for months_back in range(months):
        year, month_index = divmod(newest - months_back, 12)
        range_of_years.append("{},{}".format(year, month_index + 1))

    if verbose:
        print("get_year_and_month_range_for_archives() - returning: {}".format(range_of_years))

    return range_of_years

In [None]:
# Function to fetch response from API provided
## The API name selects how the request is built; only 'archive' is supported
def fetch_response_from_api(page_count, api, year, month, verbose=False, timeout=60):
    """Request one month of data from the NYTimes API named `api`.

    Args:
        page_count: Accepted for interface compatibility; not used by the
            'archive' request.
        api: API name. Only 'archive' triggers a request.
        year: Year component of the URL.
        month: Month component of the URL.
        verbose: When True, print the URL that is requested.
        timeout: Seconds passed to requests.get() so a stalled connection
            raises instead of blocking forever.

    Returns:
        The response object from requests.get(), or None when `api` is not
        'archive' (no request is made in that case).
    """
    if api != 'archive':
        # Nothing was requested, so there is no need to wait either
        return None

    # URL to hit
    url = 'https://api.nytimes.com/svc/{}/v1/{}/{}.json'.format(api, year, month)

    if verbose:
        print("fetch_response_from_api() - URL is {}".format(url))

    # The key is sent as a query parameter, so it is not part of the printed URL
    payload = {'api-key': nyt_archive_key}
    response = requests.get(url, params=payload, timeout=timeout)

    # Add time delay between 2 API calls to fetch response without interruption
    time.sleep(1)

    return response

# Returns response object

In [None]:
# Function to extract only articles from the response and store them in a file
## Articles whose '_id' is already stored in the file are not appended again.
def process_response_from_service(response, api, file_path, page_count, verbose=False):
    """Merge the articles of `response` into the JSON file at `file_path`.

    Args:
        response: Response object exposing `status_code` and `json()`; the
            JSON body is expected to hold the articles under
            ['response']['docs'], each with an '_id' key.
        api: API name, used only in the failure message.
        file_path: JSON file holding the list of articles collected so far.
        page_count: Accepted for interface compatibility; not used here.
        verbose: When True, print progress details (also forwarded to
            write_to_json_file()).

    Returns:
        None. On a non-200 status only an error message is printed and the
        file is left untouched.
    """
    if response.status_code != 200:
        # Show error messages in case an API fails
        print("process_response_from_service() - request failed for '{}' with status code: {}"
              .format(api, response.status_code))
        return

    docs = response.json()['response']['docs']

    if verbose:
        print("process_response_from_service() - res['response']['docs'] size: {}".format(len(docs)))

    if os.path.exists(file_path):
        if verbose:
            print("process_response_from_service() - opening file: {}".format(file_path))

        # Load what previous calls already stored
        with open(file_path) as fil:
            json_data = json.load(fil)

        if verbose:
            print("process_response_from_service() - current data size: {}".format(len(json_data)))

        # Remove duplicates. The stored ids are collected into a set once, so
        # each incoming article costs one constant-time lookup instead of a
        # fresh scan of everything already stored.
        known_ids = {article['_id'] for article in json_data}
        unseen_articles = [article for article in docs if article['_id'] not in known_ids]

        if verbose:
            print("process_response_from_service() - unseen articles size: {}".format(len(unseen_articles)))

        json_data.extend(unseen_articles)
    else:
        if verbose:
            print("process_response_from_service() - file {} doesn't exist".format(file_path))

        # If the file is not present, don't check for duplicates. Just keep the articles as they are
        json_data = docs

    # Write output to JSON format, honouring the caller's verbosity choice
    write_to_json_file(file_path, json_data, verbose=verbose)

In [None]:
# Driver: for every data folder, download the archive months and store the
# articles in one JSON file inside that folder.
print(data_folders_directory)
for data_folder_dir in data_folders_directory:
    # Passed through to the helper functions below
    page_count = 0
    
    # Get the name of API from last component of the path
    api = os.path.split(data_folder_dir)[1]
    print('API to hit -> {}'.format(api))
    
    # Build the JSON file path: <data folder>/<api>_response_pages.json
    file_name = api + '_response_pages.json'
    file_path = os.path.join(data_folder_dir, file_name)

    print("file_path is: {}".format(file_path))

    if api == 'archive':
        # Get the list of 'year,month' strings to download
        year_range_for_archive = get_year_and_month_range_for_archives(verbose=True)
        for time_to_consider in year_range_for_archive:
            # Split the 'YYYY,M' entry: first 4 characters are the year,
            # everything after the comma is the month
            year = time_to_consider[:4]
            month = time_to_consider[5:]
                
            # Fetch response for this year and month
            response = fetch_response_from_api(page_count, api, year, month, verbose=True)
                
            # Save only the articles from the response into the JSON file
            process_response_from_service(response, api, file_path, page_count, verbose=True)
            # Stop at the first failed request instead of trying the remaining months
            if response.status_code != 200:
                break

        # `response` is the last response of the loop above; page_count is
        # reset at the top of the outer loop, so this increment is not read again here
        if response.status_code == 200:
            page_count += 1

In [None]:
# Code to check how much data is present in each file.
#### No need to run. It will just print the number of articles in each response
for data_folder_dir in data_folders_directory:
    # Take the API name from the last path component. A fixed character offset
    # would keep the leading path separator, and os.path.join() would then
    # discard the folder and point at the filesystem root.
    api = os.path.basename(data_folder_dir)
    print('API - ', api)

    file_name = api + '_response_pages.json'
    file_path = os.path.join(data_folder_dir, file_name)

    # The file holds a JSON list, so its length is the number of stored articles
    with open(file_path) as file_to_read:
        present = json.load(file_to_read)
        print('Data Count - ', len(present))

In [None]:
# Show the data folder paths collected for the APIs
print(data_folders_directory)

