In [96]:
import os
import spacy
import pandas as pd
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.analysis.acore import Composable
from whoosh.analysis import IDAnalyzer, LowercaseFilter, StopFilter, Token

# Root directory of the corpus, relative to the notebook's working directory.
# The cells below read documents from `<path>/data` and build the index under
# `<path>/index`.
path = "../financial-corpus"

In [97]:
# Index schema. Every field except `content` is stored, so it can be read back
# from search hits; `content` is indexed for searching only.
# `agency` and `author` use IDAnalyzer, which keeps the whole value as a single
# token instead of splitting it into words.
schema = Schema(title=TEXT(stored=True),
                filename=ID(stored=True),
                date=DATETIME(stored=True),
                agency=TEXT(stored=True, analyzer=IDAnalyzer()),
                author=TEXT(stored=True, analyzer=IDAnalyzer()),
                content=TEXT)

# create_in() expects the target directory to exist already, so create it on a
# fresh checkout instead of failing.
index_dir = path + "/index"
os.makedirs(index_dir, exist_ok=True)

# NOTE: per the Whoosh documentation, create_in() on a directory that already
# holds an index clears that index, so re-running this cell starts from scratch.
ix = create_in(index_dir, schema)


In [117]:
class SpacyTokenizer(Composable):
    """Whoosh tokenizer that splits text with spaCy's English tokenizer.

    Instances are meant to sit at the head of an analyzer chain, e.g.
    ``SpacyTokenizer() | LowercaseFilter() | StopFilter()``. Whoosh filters
    read and modify attributes of ``Token`` objects (``text``, ``stopped``,
    ...), so this tokenizer yields ``Token`` instances rather than plain
    strings.
    """

    def __init__(self):
        # Load the small English pipeline; only its tokenizer is used below.
        self.tokenizer = spacy.load('en_core_web_sm')

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True, start_pos=0,
                 start_char=0, tokenize=True, mode='', **kwargs):
        """Yield one whoosh ``Token`` per spaCy token found in ``value``.

        The signature mirrors the built-in whoosh tokenizers, because whoosh
        forwards these keyword arguments to the first element of an analyzer
        chain.

        :param value: the text to tokenize.
        :param positions: record each token's ordinal position in ``pos``.
        :param chars: record character offsets in ``startchar``/``endchar``.
        :param keeporiginal: also store the unmodified text in ``original``.
        :param removestops: forwarded to ``Token``; read by stop-word filters.
        :param start_pos: position assigned to the first token.
        :param start_char: character offset added to every token offset.
        :param tokenize: when False, emit ``value`` as one single token.
        :param mode: forwarded to ``Token`` (e.g. ``'index'`` or ``'query'``).
        """
        # Following the whoosh convention, a single Token object is reused and
        # re-filled for every yielded token.
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)

        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
            return

        # make_doc() runs tokenization only; the rest of the spaCy pipeline is
        # not needed to obtain the token texts.
        doc = self.tokenizer.make_doc(value)
        for pos, spacy_token in enumerate(doc):
            t.text = spacy_token.text
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                # spacy_token.idx is the token's character offset in `value`.
                t.startchar = start_char + spacy_token.idx
                t.endchar = t.startchar + len(spacy_token.text)
            yield t
    
# Analyzer chain: spaCy tokenization, then lower-casing, then stop-word removal.
# NOTE(review): nothing in the visible cells passes `analyzer` to a schema
# field, so `content` is presumably still indexed with TEXT's default
# analyzer -- confirm whether this chain was meant to be used there.
analyzer = SpacyTokenizer() | LowercaseFilter() | StopFilter()
# Open a writer on the index; the loop below adds documents through it.
writer = ix.writer()

In [99]:
# Names of the ".txt" entries found directly inside the corpus data directory.
files = [entry for entry in os.listdir(path + "/data") if entry.endswith(".txt")]

In [118]:
from tqdm import tqdm_notebook as tqdm

# Add one index document per text file. The metadata for `<name>.txt` is read
# from the first row of the sibling `<name>.csv`.
for filename in tqdm(files):
    try:
        with open(os.path.join(path, "data", filename), encoding="utf-8") as file:
            text = file.read()

        # The csv module could not cope with escaped commas, so the heavy
        # artillery (pandas) is used to parse the metadata file instead.
        csv_data = pd.read_csv(os.path.join(path, "data", filename[:-4] + '.csv'))
        # Replace missing values with None. The frame is cast to object dtype
        # first: on numeric columns (e.g. a column whose only cell is empty)
        # recent pandas versions turn the None back into NaN.
        csv_data = csv_data.astype(object).where(csv_data.notna(), None)
        row = csv_data.iloc[0]

        # pd.to_datetime(None) returns None, so a missing timestamp is passed
        # to the writer as None rather than as NaT.
        writer.add_document(title=row["title"], filename=filename,
                            date=pd.to_datetime(row["timestamp"]),
                            agency=row["agency"], author=row["author"], content=text)
    except Exception as ex:
        # Best effort: report the file that failed and carry on with the rest.
        print(filename)
        print(ex)


In [119]:
# Commit the documents added through `writer` in the loop above to the index.
writer.commit()