In [2]:
%matplotlib inline


# FreeDiscovery demo: review of GitHub issues.


An example illustrating document review of GitHub issues from the `scikit-learn/scikit-learn` repository.



In [82]:
from __future__ import print_function

import os.path
import requests
import pandas as pd

from IPython.display import HTML

# Pandas display settings for the tables rendered in this notebook.
pd.set_option('display.float_format', '{:,.3f}'.format)  # e.g. 0.629
pd.set_option('display.expand_frame_repr', False)
# Do not truncate long cell contents (the generated <a href=...> links).
# pandas >= 1.0 expects None for "no limit" and deprecates the legacy -1
# sentinel, while older releases only accept an integer, so try None first.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

DATA_DIR = './data'
BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL

def post(url, **pars):
    """Send a POST request to the FreeDiscovery server and return the decoded JSON.

    Parameters
    ----------
    url : str
        Path appended to BASE_URL, e.g. '/feature-extraction'.
    **pars
        Keyword arguments forwarded unchanged to `requests.post`.

    Returns
    -------
    The response body as parsed by `Response.json()`.
    """
    endpoint = BASE_URL + url
    print(" POST", endpoint)
    response = requests.post(endpoint, **pars)
    return response.json()

def get(url, **pars):
    """Send a GET request to the FreeDiscovery server and return the decoded JSON.

    Parameters
    ----------
    url : str
        Path appended to BASE_URL, e.g. '/feature-extraction/<id>'.
    **pars
        Keyword arguments forwarded unchanged to `requests.get`.

    Returns
    -------
    The response body as parsed by `Response.json()`.
    """
    url = BASE_URL + url
    # Log the HTTP verb that is actually issued for this request.
    print(" GET", url)
    return requests.get(url, **pars).json()

# Lookup table that print_github merges on 'document_id'; it reads the
# 'url', 'title' and 'type' columns from it.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# db.pkl that comes from a trusted source.
db = pd.read_pickle('db.pkl')

def print_github(y):
    """Display the first ten rows of `y` as an HTML table with linked titles.

    `y` is merged with the module-level `db` frame on 'document_id'. The
    'url' and 'title' columns coming from that merge are combined into a
    clickable 'name' column, and 'type', 'title' and 'url' are then dropped.

    Parameters
    ----------
    y : pandas.DataFrame
        Must contain a 'document_id' column. The frame passed in is not
        modified.
    """
    try:
        from html import escape
    except ImportError:  # Python 2 has no html.escape
        from cgi import escape

    table = y.merge(db, on='document_id')
    # to_html(escape=False) below emits cell text verbatim so the anchor
    # tags render; escape the values interpolated into them so that a title
    # or URL containing '<', '&' or '"' cannot break or inject markup.
    table['name'] = [
        '<a href="{}"> {}</a>'.format(escape(str(url), quote=True),
                                      escape(str(title), quote=True))
        for url, title in zip(table['url'], table['title'])]
    table = table.drop(['type', 'title', 'url'], axis=1)
    table = table.set_index('document_id')
    display(HTML(table.iloc[:10].to_html(escape=False)))

## 1. Feature extraction

In [97]:
# --- 1.a: create a feature-extraction dataset and keep its id -------------
print("\n1.a Load dataset and initalize feature extraction")
res = post('/feature-extraction')
dsid = res['id']
print("   => dsid = {}".format(dsid))

# --- 1.b: start the extraction, sending DATA_DIR as 'data_dir' ------------
print("\n1.b Start feature extraction")
dataset_url = '/feature-extraction/{}'.format(dsid)
extraction_params = {'data_dir': DATA_DIR,
                     'document_id_generator': 'infer_file_path'}
res = post(dataset_url, json=extraction_params)

# --- 1.d: read the dataset parameters back from the server ----------------
print("\n1.d. check the parameters of the extracted features")
res = get(dataset_url)

# One line per returned parameter; keys containing "filenames" are left out.
param_lines = ['     - {}: {}'.format(key, val)
               for key, val in res.items()
               if "filenames" not in key]
print('\n'.join(param_lines))


1.a Load dataset and initalize feature extraction
 POST http://localhost:5001/api/v0/feature-extraction
   => dsid = a78456b80f3c4cf2

1.b Start feature extraction
 POST http://localhost:5001/api/v0/feature-extraction/a78456b80f3c4cf2

1.d. check the parameters of the extracted features
 POST http://localhost:5001/api/v0/feature-extraction/a78456b80f3c4cf2
     - analyzer: word
     - binary: False
     - chunk_size: 5000
     - data_dir: /home/rth/symerio/c/grossman_labs/IR-github-issues-demo/data
     - max_df: 1.0
     - min_df: 0.0
     - n_features: 100001
     - n_jobs: 1
     - n_samples: 9089
     - n_samples_processed: 9089
     - ngram_range: [1, 1]
     - norm: l2
     - parse_email_headers: False
     - stop_words: None
     - sublinear_tf: True
     - use_hashing: False
     - use_idf: False


### 2. Document categorization with LSI (used for the Nearest Neighbors method)

In [17]:
n_components = 150  # value sent to the server as 'n_components'

# Use the shared `post` helper so this request is logged and decoded the
# same way as the other calls in this notebook.
res = post('/lsi/', json={'n_components': n_components,
                          'parent_id': dsid})

# Keep the model id: later cells pass it as 'parent_id'.
lsi_id = res['id']
print('  => LSI model id = {}'.format(lsi_id))
# 'explained_variance' is multiplied by 100 to print it as a percentage.
print('  => SVD decomposition with {} dimensions explaining {:.2f} % variabilty of the data'.format(
                        n_components, res['explained_variance']*100))



POST http://localhost:5001/api/v0/lsi/
  => LSI model id = 1af437fd719f4695
  => SVD decomposition with 150 dimensions explaining 59.67 % variabilty of the data


### 3. Semantic Search

In [83]:
# Search against the LSI model, using document 8833 as the query document.
search_params = {'parent_id': lsi_id,
                 'query_document_id': 8833}
res = post('/search', json=search_params)

# Tabulate the returned 'data' entries and render them with issue links.
y = pd.DataFrame(res['data'])
print_github(y)

 POST http://localhost:5001/api/v0/search


Unnamed: 0_level_0,score,name
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2008,0.629,[MRG+1] Elkan k-means
6029,0.604,Adding Time Series Regressors
5414,0.601,[MRG+1] Elkans K means
1593,0.588,Joblib saved classifier slow prediction
4917,0.587,LSHForest performance
7387,0.582,Remove training set prediction time from cross-validation timing?
7639,0.572,Timings in crossvalidation
8499,0.552,Memory leak in LogisticRegression
228,0.544,TfidfTransfomer: user-selectable norm
2844,0.535,time.clock() vs time.time()


### 4. Document categorization

In [96]:
print("\n3.a. Train the categorization model")
print(' Training...')

# Two documents labelled 'positive' are sent as the training examples.
training_set = [{'document_id': doc_id, 'category': 'positive'}
                for doc_id in (8833, 7387)]
params = {'parent_id': lsi_id,
          'method': 'NearestNeighbor',
          'data': training_set}

res = post('/categorization/', json=params)
mid = res['id']

print("\n3.c Categorize the complete dataset with this model")
# NOTE: a request body of {'sort_by': 'positive'} was tried here at some
# point and is currently disabled.
res = get('/categorization/{}/predict'.format(mid))

# Keep only the first entry of each document's 'scores' list.
# NOTE(review): presumably that first entry is the top-scoring category --
# confirm against the server's response format.
data = [{'document_id': row['document_id'],
         'category': row['scores'][0]['category'],
         'score': row['scores'][0]['score']}
        for row in res['data']]

y = pd.DataFrame(data)
print_github(y.iloc[:20])


3.a. Train the categorization model
 Training...
 POST http://localhost:5001/api/v0/categorization/

3.c Categorize the complete dataset with this model
 POST http://localhost:5001/api/v0/categorization/b7bddd48e33d44cf/predict


Unnamed: 0_level_0,category,score,name
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7639,positive,0.672,Timings in crossvalidation
2008,positive,0.653,[MRG+1] Elkan k-means
5414,positive,0.638,[MRG+1] Elkans K means
228,positive,0.633,TfidfTransfomer: user-selectable norm
6029,positive,0.604,Adding Time Series Regressors
2844,positive,0.595,time.clock() vs time.time()
1593,positive,0.588,Joblib saved classifier slow prediction
4917,positive,0.587,LSHForest performance
2078,positive,0.585,Refactor euclidean distance metric in Cython
6899,positive,0.583,[MRG] Expose mean_time in results_ for *SearchCV
