In [1]:
# Ask IPython to render matplotlib figures inline in the notebook output.
# NOTE(review): the recorded run of this cell failed with ModuleNotFoundError
# (matplotlib not installed), and no cell visible here plots anything -
# confirm whether this magic is still needed.
%matplotlib inline

ModuleNotFoundError: No module named 'matplotlib'


Categorization Example [REST API]
---------------------------------

An example to illustrate binary categorization with FreeDiscovery



In [2]:
from __future__ import print_function

from time import time, sleep
import os.path
from multiprocessing import Process
import requests
import pandas as pd

# pandas display: floats with thousands separators and three decimals,
# and wide frames printed without wrapping columns onto several lines.
pd.set_option('display.float_format', '{:,.3f}'.format)
pd.set_option('display.expand_frame_repr', False)

# Root URL that every request in this notebook is built from.
BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL
# Directory sent to the server as `data_dir` for feature extraction.
DATA_DIR = './data'


In [15]:

# 1. Feature extraction
#
# Creates a feature-extraction resource on the server, runs the extraction
# on the files under DATA_DIR, then reads back the parameters reported for
# the extracted features. Defines `dsid`, which later cells pass as
# `parent_id`.
#
# Every response is checked with raise_for_status() so that a server-side
# failure stops the cell with an HTTPError at the failing request, instead
# of being ignored or showing up later as a KeyError / JSON decoding error.

print("\n1.a Load dataset and initalize feature extraction")
url = BASE_URL + '/feature-extraction'
print(" POST", url)
response = requests.post(url)
response.raise_for_status()
res = response.json()

dsid = res['id']
print("   => received {}".format(list(res.keys())))
print("   => dsid = {}".format(dsid))

print("\n1.b Start feature extraction")

url = BASE_URL+'/feature-extraction/{}'.format(dsid)
print(" POST", url)
response = requests.post(url, json={'data_dir': DATA_DIR,
                                    'document_id_generator': 'infer_file_path'})
# The body of this response is not used, but its status must be checked:
# the steps below are meaningless if the extraction itself failed.
response.raise_for_status()

print("\n1.d. check the parameters of the extracted features")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(' GET', url)
response = requests.get(url)
response.raise_for_status()
res = response.json()

# Entries whose key mentions "filenames" are skipped when printing.
print('\n'.join(['     - {}: {}'.format(key, val)
      for key, val in res.items() if "filenames" not in key]))


1.a Load dataset and initalize feature extraction
 POST http://localhost:5001/api/v0/feature-extraction
   => received ['id']
   => dsid = 966d2314a5d04a0f

1.b Start feature extraction
 POST http://localhost:5001/api/v0/feature-extraction/966d2314a5d04a0f

1.d. check the parameters of the extracted features
 GET http://localhost:5001/api/v0/feature-extraction/966d2314a5d04a0f
     - analyzer: word
     - binary: False
     - chunk_size: 5000
     - data_dir: /home/rth/symerio/c/grossman_labs/IR-github-issues-demo/data
     - max_df: 1.0
     - min_df: 0.0
     - n_features: 100001
     - n_jobs: 1
     - n_samples: 9089
     - n_samples_processed: 9089
     - ngram_range: [1, 1]
     - norm: l2
     - parse_email_headers: False
     - stop_words: None
     - sublinear_tf: True
     - use_hashing: False
     - use_idf: False


In [20]:
# Fetch the id mapping of dataset `dsid` and show it as a table
# (the recorded output has document_id, file_path and internal_id columns).
url = BASE_URL + '/feature-extraction/{}/id-mapping'.format(dsid)
# The request below is a POST; the log line previously said GET.
print(' POST', url)
pd.DataFrame(requests.post(url).json()['data'])

 GET http://localhost:5001/api/v0/feature-extraction/966d2314a5d04a0f/id-mapping


Unnamed: 0,document_id,file_path,internal_id
0,1,00001,0
1,2,00002,1
2,3,00003,2
3,4,00004,3
4,5,00005,4
5,6,00006,5
6,7,00007,6
7,8,00008,7
8,9,00009,8
9,10,00010,9


### 2. Document categorization with LSI (used for Nearest Neighbors method)

In [16]:
# Request an LSI model built on the features of dataset `dsid`.
# `lsi_id` is the id the server returns; later cells pass it as `parent_id`.
url = BASE_URL + '/lsi/'
print("POST", url)

n_components = 150
lsi_request = dict(n_components=n_components, parent_id=dsid)
res = requests.post(url, json=lsi_request).json()

lsi_id = res['id']
print('  => LSI model id = {}'.format(lsi_id))

# The server reports explained variance as a fraction; show it in percent.
explained_pct = res['explained_variance']*100
summary_template = '  => SVD decomposition with {} dimensions explaining {:.2f} % variabilty of the data'
print(summary_template.format(n_components, explained_pct))



POST http://localhost:5001/api/v0/lsi/
  => LSI model id = 776ae1633f4e44b5
  => SVD decomposition with 150 dimensions explaining 59.65 % variabilty of the data


### 3. Semantic Search

In [25]:
# Semantic search: send a query document id to the search endpoint, using
# the LSI model `lsi_id` as parent, and print the returned
# (document_id, score) table.
url = BASE_URL + '/search/'
print(" POST", url)
# This cell only runs a search; the previous "Training..." message was
# copied from the categorization cell and was misleading.
print(' Searching...')

query_document_id = 8833  # document_id of the document used as the query

res = requests.post(url,
                    json={'parent_id': lsi_id,
                          'query_document_id': query_document_id,
                          }).json()
print(pd.DataFrame(res['data']))

 POST http://localhost:5001/api/v0/search/
 Training...
      document_id  score
0            2008  0.628
1            6029  0.622
2            5414  0.600
3            4917  0.588
4            7387  0.582
5            1593  0.581
6            7639  0.562
7             228  0.558
8            8499  0.535
9            2078  0.533
10           2844  0.524
11           5270  0.499
12           6351  0.497
13           7458  0.487
14           4655  0.484
15           6899  0.483
16           8918  0.466
17           2740  0.449
18           4604  0.445
19           6322  0.444
20           3486  0.444
21           7408  0.440
22           3448  0.432
23           4603  0.428
24           4588  0.428
25           3304  0.412
26           1685  0.403
27           2044  0.403
28           3007  0.396
29           3079  0.388
...           ...    ...
9058         8897 -0.170
9059         1098 -0.170
9060         6807 -0.171
9061         7880 -0.171
9062         6587 -0.173
9063         5464 -

In [24]:
# 3.a. Train the categorization model (the banner print for this step is
# disabled in this notebook).
#
# NOTE(review): `input_ds` and `method` are not defined in any cell visible
# here; the recorded run of this cell stopped with
# "NameError: name 'input_ds' is not defined". They must be provided by an
# earlier cell before this one can run.

url = BASE_URL + '/categorization/'
print(" POST", url)
print(' Training...')

# Built after the two prints above so the cell's output order is unchanged.
training_request = {
    'parent_id': lsi_id,
    'data': input_ds['training_set'],
    # one of "LinearSVC", "LogisticRegression", 'xgboost'
    'method': method,
    'training_scores': True,
}
res = requests.post(url, json=training_request).json()

mid = res['id']
print("     => model id = {}".format(mid))
scores_template = '    => Training scores: MAP = {average_precision:.3f}, ROC-AUC = {roc_auc:.3f}, recall @20%: {recall_at_20p:.3f} '
print(scores_template.format(**res['training_scores']))

print("\n3.b. Check the parameters used in the categorization model")
url = BASE_URL + '/categorization/{}'.format(mid)
print(" GET", url)
res = requests.get(url).json()

# Print every reported parameter except the 'index' and 'category' entries.
hidden_keys = ('index', 'category')
print('\n'.join('     - {}: {}'.format(key, val)
                for key, val in res.items()
                if key not in hidden_keys))

print("\n3.c Categorize the complete dataset with this model")
url = BASE_URL + '/categorization/{}/predict'.format(mid)
print(" GET", url)
res = requests.get(url, json={'subset': 'test'}).json()

# Keep only the first entry of each document's `scores` list.
data = []
for row in res['data']:
    record = {'document_id': row['document_id']}
    first_score = row['scores'][0]
    record['category'] = first_score['category']
    record['score'] = first_score['score']
    if method == 'NearestNeighbor':
        # For this method the score entry also carries a document_id.
        record['nearest_document_id'] = first_score['document_id']
    data.append(record)

df = pd.DataFrame(data).set_index('document_id')
print(df)

 POST http://localhost:5001/api/v0/categorization/
 Training...


NameError: name 'input_ds' is not defined