In [None]:
import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
from dateutil.relativedelta import relativedelta
from google.colab import drive

In [None]:
# Show the notebook's current working directory.
os.getcwd()


'/content'

# Collect New York Times headlines (NYT Archive API)

In [None]:
# Mount Google Drive at /drive.
# NOTE(review): no cell visible here reads from or writes to /drive — confirm
# the mount is still needed.
drive.mount('/drive')

Mounted at /drive


In [None]:
# Five-year window that ends on the day the notebook is run.
end = datetime.date.today()
start = end + relativedelta(years=-5)
print(f'{end} {start}')

2020-12-26 2015-12-26


In [None]:
# One [year, month] pair of strings for every month start inside the window,
# e.g. ['2016', '1']. The pairs are built from the timestamp fields rather than
# strftime('%-m'), because the '%-m' (no zero padding) directive is a
# platform-specific extension that is not available everywhere.
months_in_range = [
    [str(month_start.year), str(month_start.month)]
    for month_start in pd.date_range(start, end, freq='MS')
]
print(months_in_range)

[['2016', '1'], ['2016', '2'], ['2016', '3'], ['2016', '4'], ['2016', '5'], ['2016', '6'], ['2016', '7'], ['2016', '8'], ['2016', '9'], ['2016', '10'], ['2016', '11'], ['2016', '12'], ['2017', '1'], ['2017', '2'], ['2017', '3'], ['2017', '4'], ['2017', '5'], ['2017', '6'], ['2017', '7'], ['2017', '8'], ['2017', '9'], ['2017', '10'], ['2017', '11'], ['2017', '12'], ['2018', '1'], ['2018', '2'], ['2018', '3'], ['2018', '4'], ['2018', '5'], ['2018', '6'], ['2018', '7'], ['2018', '8'], ['2018', '9'], ['2018', '10'], ['2018', '11'], ['2018', '12'], ['2019', '1'], ['2019', '2'], ['2019', '3'], ['2019', '4'], ['2019', '5'], ['2019', '6'], ['2019', '7'], ['2019', '8'], ['2019', '9'], ['2019', '10'], ['2019', '11'], ['2019', '12'], ['2020', '1'], ['2020', '2'], ['2020', '3'], ['2020', '4'], ['2020', '5'], ['2020', '6'], ['2020', '7'], ['2020', '8'], ['2020', '9'], ['2020', '10'], ['2020', '11'], ['2020', '12']]


In [None]:
def send_request(date, api_key=None, delay=6, timeout=30):
    '''Fetch one month of articles from the NYT Archive API.

    Args:
        date: Two-item sequence of strings, ``[year, month]``
            (e.g. ``['2016', '1']``).
        api_key: NYT API key. When omitted, the key is read from the
            ``NYT_API_KEY`` environment variable so that no credential has to
            be stored in the notebook.
        delay: Seconds to sleep after a successful request (default 6,
            matching the original fixed pause between calls).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If no key was passed and ``NYT_API_KEY`` is not set.
        requests.HTTPError: If the API answers with an error status.
    '''
    key = api_key or os.environ.get('NYT_API_KEY')
    if not key:
        raise RuntimeError(
            'No NYT API key available: pass api_key or set the NYT_API_KEY '
            'environment variable.'
        )

    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url + date[0] + '/' + date[1] + '.json'

    # Pass the key as a query parameter so it is URL-encoded by requests, and
    # bound the wait so a stalled connection cannot hang the whole download.
    response = requests.get(url, params={'api-key': key}, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error payload to the
    # parser.
    response.raise_for_status()
    payload = response.json()

    # Pause between consecutive calls to stay under the API's rate limit.
    time.sleep(delay)
    return payload


def is_valid(article, date):
    '''Return True when an article is worth keeping.

    An article qualifies when its publication date lies strictly between the
    module-level ``start`` and ``end`` dates and it carries a headline dict
    with a ``'main'`` entry.

    Args:
        article: One article record (a dict) from the API response.
        date: Publication date of the article as a ``datetime.date``.

    Returns:
        bool: True if the article is in range and has a main headline.
    '''
    is_in_range = start < date < end
    # Use .get so a record with no 'headline' key is rejected rather than
    # raising KeyError.
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return is_in_range and has_headline


def parse_response(response):
    '''Convert one NYT Archive API response into a pandas data frame.

    Only articles accepted by ``is_valid`` are kept. Optional fields that are
    absent from a record are stored as missing values instead of raising.

    Args:
        response: Decoded JSON response; the article records are read from
            ``response['response']['docs']``.

    Returns:
        pandas.DataFrame with the columns ``headline``, ``date``,
        ``doc_type``, ``material_type``, ``section`` and ``keywords``
        (``keywords`` holds the list of 'subject' keyword values). The columns
        are present even when no article qualifies.
    '''
    # Import the submodule explicitly: a bare ``import dateutil`` does not
    # guarantee that ``dateutil.parser`` has been loaded.
    from dateutil import parser as date_parser

    columns = ['headline', 'date', 'doc_type', 'material_type', 'section', 'keywords']
    rows = []

    for article in response['response']['docs']:
        date = date_parser.parse(article['pub_date']).date()
        # Skip articles outside the date range or without a main headline.
        if not is_valid(article, date):
            continue
        subject_keywords = [
            keyword['value']
            for keyword in (article.get('keywords') or [])
            if keyword.get('name') == 'subject'
        ]
        rows.append({
            'headline': article['headline']['main'],
            'date': date,
            'doc_type': article.get('document_type'),
            'material_type': article.get('type_of_material'),
            'section': article.get('section_name'),
            'keywords': subject_keywords,
        })

    # Passing ``columns`` keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)


def get_data(dates, out_dir='headlines'):
    '''Download, parse and save NYT Archive data for each given month.

    For every entry in ``dates`` the month is requested with ``send_request``,
    converted with ``parse_response`` and written to
    ``<out_dir>/<year>-<month>.csv``.

    Args:
        dates: List of ``[year, month]`` string pairs, e.g. ``['2016', '1']``.
        out_dir: Directory that receives the CSV files; created if missing.
            Defaults to ``'headlines'``.

    Returns:
        None. Progress and the total article count are printed.
    '''
    # An empty list has no first/last element to report and nothing to fetch.
    if len(dates) == 0:
        print('No dates given; nothing to download.')
        return

    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    # makedirs with exist_ok avoids the check-then-create race and also
    # handles nested output directories.
    os.makedirs(out_dir, exist_ok=True)

    total = 0
    for date in dates:
        df = parse_response(send_request(date))
        total += len(df)
        path = os.path.join(out_dir, date[0] + '-' + date[1] + '.csv')
        # Announce the file before writing so a failed write is attributable.
        print('Saving ' + path + '...')
        df.to_csv(path, index=False)
    print('Number of articles collected: ' + str(total))


In [None]:
# Download and save one CSV per month in months_in_range. send_request sleeps
# after every call, so this cell runs for several minutes.
get_data(months_in_range)

Date range: ['2016', '1'] to ['2020', '12']
Saving headlines/2016-1.csv...
Saving headlines/2016-2.csv...
Saving headlines/2016-3.csv...
Saving headlines/2016-4.csv...
Saving headlines/2016-5.csv...
Saving headlines/2016-6.csv...
Saving headlines/2016-7.csv...
Saving headlines/2016-8.csv...
Saving headlines/2016-9.csv...
Saving headlines/2016-10.csv...
Saving headlines/2016-11.csv...
Saving headlines/2016-12.csv...
Saving headlines/2017-1.csv...
Saving headlines/2017-2.csv...
Saving headlines/2017-3.csv...
Saving headlines/2017-4.csv...
Saving headlines/2017-5.csv...
Saving headlines/2017-6.csv...
Saving headlines/2017-7.csv...
Saving headlines/2017-8.csv...
Saving headlines/2017-9.csv...
Saving headlines/2017-10.csv...
Saving headlines/2017-11.csv...
Saving headlines/2017-12.csv...
Saving headlines/2018-1.csv...
Saving headlines/2018-2.csv...
Saving headlines/2018-3.csv...
Saving headlines/2018-4.csv...
Saving headlines/2018-5.csv...
Saving headlines/2018-6.csv...
Saving headlines/201

ModuleNotFoundError: ignored

In [None]:
pip install explainerdashboard


Collecting explainerdashboard
[?25l  Downloading https://files.pythonhosted.org/packages/f8/43/9c9efd4660486dd9fa685e178c5664504c93e2a83e8dc8c11289d545e640/explainerdashboard-0.2.18.1-py3-none-any.whl (196kB)
[K     |████████████████████████████████| 204kB 11.4MB/s 
[?25hCollecting waitress
[?25l  Downloading https://files.pythonhosted.org/packages/26/d1/5209fb8c764497a592363c47054436a515b47b8c3e4970ddd7184f088857/waitress-1.4.4-py2.py3-none-any.whl (58kB)
[K     |████████████████████████████████| 61kB 5.0MB/s 
[?25hCollecting dash-bootstrap-components
[?25l  Downloading https://files.pythonhosted.org/packages/1d/26/17f8b18a79bae9e278b7674ad0aef8007e87b4d2280525a95103fd2a8fe5/dash_bootstrap_components-0.11.1-py2.py3-none-any.whl (187kB)
[K     |████████████████████████████████| 194kB 18.9MB/s 
Collecting dash
[?25l  Downloading https://files.pythonhosted.org/packages/dd/17/55244363969638edd1151de0ea4aa10e6a7849b42d7d0994e3082514e19d/dash-1.18.1.tar.gz (74kB)
[K     |█████████

In [None]:
# explainerdashboard is installed by the preceding cell, so the install command
# is not repeated here. These imports stay in this cell because they can only
# succeed after that install has run.
from sklearn.ensemble import RandomForestClassifier
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from explainerdashboard.datasets import titanic_survive, feature_descriptions

# Example Titanic survival data bundled with explainerdashboard.
X_train, y_train, X_test, y_test = titanic_survive()

# A fixed random_state makes the fitted forest, and therefore every number in
# the dashboard, reproducible across runs.
model = RandomForestClassifier(
    n_estimators=50, max_depth=10, random_state=42
).fit(X_train, y_train)

explainer = ClassifierExplainer(
    model, X_test, y_test,
    cats=['Sex', 'Deck', 'Embarked'],
    descriptions=feature_descriptions,
    labels=['Not survived', 'Survived'],
)
ExplainerDashboard(explainer).run()

Note: shap=='guess' so guessing for RandomForestClassifier shap='tree'...
Note: model_output=='probability', so assuming that raw shap output of RandomForestClassifier is in probability space...
Generating self.shap_explainer = shap.TreeExplainer(model)
Detected RandomForestClassifier model: Changing class type to RandomForestClassifierExplainer...
Building ExplainerDashboard..
Detected google colab environment, setting mode='external'
Generating layout...
Calculating shap values...
Calculating dependencies...
Calculating categorical permutation importances (if slow, try setting n_jobs parameter)...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating predictions...
Calculating prediction probabilities...
Calculating pred_percentiles...
Calculating shap interaction values...
Reminder: TreeShap computational complexity is O(TLD^2), where T is the number of trees, L is the maximum number of leaves in any tree and D the maximal depth of any tree. So r

<IPython.core.display.Javascript object>

In [None]:
# Call describe() to get the summary statistics. Printing the bound method
# (no parentheses) only dumps the method's repr together with the frame.
X_train.describe()

<bound method NDFrame.describe of                                              Fare  ...  Embarked_Unknown
Passenger                                          ...                  
Braund, Mr. Owen Harris                    7.2500  ...                 0
Heikkinen, Miss. Laina                     7.9250  ...                 0
Allen, Mr. William Henry                   8.0500  ...                 0
Moran, Mr. James                           8.4583  ...                 0
McCarthy, Mr. Timothy J                   51.8625  ...                 0
...                                           ...  ...               ...
Rice, Mrs. William (Margaret Norton)      29.1250  ...                 0
Graham, Miss. Margaret Edith              30.0000  ...                 0
Johnston, Miss. Catherine Helen "Carrie"  23.4500  ...                 0
Behr, Mr. Karl Howell                     30.0000  ...                 0
Dooley, Mr. Patrick                        7.7500  ...                 0

[691 rows x 21 c