In [61]:
from elasticsearch.helpers import bulk as es_bulk
from elasticsearch import Elasticsearch
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [68]:
DATA_HOME = '/Users/sahil/scikit_learn_data/'

In [69]:
es = Elasticsearch()

# 20 News Groups

In [70]:
ngdata = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), data_home=DATA_HOME)

In [71]:
def ng_tuples(from_index, to_index):
    for i in range(from_index, to_index):
        yield {
            'create': {
                '_index': '20ng', 
                '_type': 'document',
                '_id': i
            }
        }
        
        yield {
            'category': ngdata.target_names[ngdata.target[i]],
            'body': ngdata.data[i]
        }


In [72]:
n_documents = len(ngdata.data)
res = []
for i in range(0, n_documents, 1000):
    res.append(es.bulk(body=ng_tuples(i, min(i+1000, n_documents))))
res

[{'took': 1182,
  'errors': False,
  'items': [{'create': {'_index': '20ng',
     '_type': 'document',
     '_id': '0',
     '_version': 1,
     'result': 'created',
     '_shards': {'total': 2, 'successful': 1, 'failed': 0},
     '_seq_no': 0,
     '_primary_term': 1,
     'status': 201}},
   {'create': {'_index': '20ng',
     '_type': 'document',
     '_id': '1',
     '_version': 1,
     'result': 'created',
     '_shards': {'total': 2, 'successful': 1, 'failed': 0},
     '_seq_no': 0,
     '_primary_term': 1,
     'status': 201}},
   {'create': {'_index': '20ng',
     '_type': 'document',
     '_id': '2',
     '_version': 1,
     'result': 'created',
     '_shards': {'total': 2, 'successful': 1, 'failed': 0},
     '_seq_no': 0,
     '_primary_term': 1,
     'status': 201}},
   {'create': {'_index': '20ng',
     '_type': 'document',
     '_id': '3',
     '_version': 1,
     'result': 'created',
     '_shards': {'total': 2, 'successful': 1, 'failed': 0},
     '_seq_no': 0,
     '_prim

# DUC dataset

In [146]:
import os

DUC_DIR = DATA_HOME+'Modified-DUC2001/'

def extract_text(data):
    start = data.find('[TEXT]')
    end = data.find('[/TEXT]')
    if start == -1:
        start = data.find('<TEXT>')
    if end == -1:
        end = data.find('</TEXT>')
    if start == -1:
        start = 0
    else:
        start += 6
    if end == -1:
        end = len(data)
    
    return data[start: end].strip()

def extract_summary(data):
    return data.replace('Abstract:', '').replace('Introduction:', '').strip()

duc_data = []    
for i, filename in enumerate(os.listdir(DUC_DIR)):
    if filename == '.DS_Store' or os.path.isdir(DUC_DIR+filename):
        continue

    body = ''
    summary = ''

    with open(DUC_DIR+filename, 'r', encoding='utf-8', errors='ignore') as file:
        body = extract_text(file.read())

    summary_filename = DUC_DIR+'Summaries/'+filename.lower()+'.txt'
    if os.path.isfile(summary_filename):
        with open(summary_filename, 'r', encoding='utf-8', errors='ignore') as summary_file:
            summary = extract_summary(summary_file.read())

    duc_data.append({
        'summary': summary,
        'body': body
    })

In [147]:
def duc_tuples(from_index, to_index):
    for i in range(from_index, to_index):
        yield {
            'create': {
                '_index': 'duc', 
                '_type': 'document',
                '_id': i
            }
        }
        
        yield duc_data[i]


In [148]:
duc_res = []
for i in range(0, len(duc_data), 1000):
    duc_res.append(es.bulk(body=duc_tuples(i, min(i+1000, len(duc_data)))))
duc_res

[{'took': 503,
  'errors': False,
  'items': [{'create': {'_index': 'duc',
     '_type': 'document',
     '_id': '0',
     '_version': 1,
     'result': 'created',
     '_shards': {'total': 2, 'successful': 1, 'failed': 0},
     '_seq_no': 0,
     '_primary_term': 1,
     'status': 201}},
   {'create': {'_index': 'duc',
     '_type': 'document',
     '_id': '1',
     '_version': 1,
     'result': 'created',
     '_shards': {'total': 2, 'successful': 1, 'failed': 0},
     '_seq_no': 0,
     '_primary_term': 1,
     'status': 201}},
   {'create': {'_index': 'duc',
     '_type': 'document',
     '_id': '2',
     '_version': 1,
     'result': 'created',
     '_shards': {'total': 2, 'successful': 1, 'failed': 0},
     '_seq_no': 0,
     '_primary_term': 1,
     'status': 201}},
   {'create': {'_index': 'duc',
     '_type': 'document',
     '_id': '3',
     '_version': 1,
     'result': 'created',
     '_shards': {'total': 2, 'successful': 1, 'failed': 0},
     '_seq_no': 0,
     '_primary_t