# Description:
In this notebook we use the Vec4IR framework to test several semantic retrieval settings. We implement the following document representations: TF-IDF, the average of word2vec embeddings, and doc2vec.

# TODO:
- Use pre-trained word embeddings in Setting 2 (e.g. via the gensim downloader API): https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html

In [None]:
from src.data.text_preprocessing import CorpusPreprocess
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation
from gensim.models import Word2Vec, Doc2Vec
from sklearn.model_selection import train_test_split
from vec4ir.doc2vec import Doc2VecInference
from vec4ir.core import Retrieval
from vec4ir.base import Tfidf, Matching
from vec4ir.word2vec import WordCentroidDistance

In [None]:
data_path = os.path.join("..", "data", "raw", "bbc")
models_path = os.path.join("..", "models", "saved_models")

# Collect the path of every article, i.e. every file that lives in a
# sub-folder of the dataset root. Files sitting directly in the root are
# skipped explicitly instead of dropping "the first path found", which only
# worked when exactly one such file existed and happened to be listed first.
# NOTE(review): assumes root-level files (e.g. a README) are not articles -- confirm.
data_root = os.path.expanduser(data_path)
all_files = []
for dp, dn, fn in os.walk(data_root):
    # os.walk lists entries in arbitrary, filesystem-dependent order; sorting
    # (in place for the directories, so the traversal itself is affected)
    # keeps the corpus order stable across machines and runs.
    dn.sort()
    if dp == data_root:
        continue
    all_files.extend(os.path.join(dp, f) for f in sorted(fn))

# Reading files into memory
corpus = []
for file in all_files:
    with open(file, 'r', encoding='latin') as f:
        corpus.append(f.read())

# Saving topics from each article: the topic is the name of the folder that
# contains the file. Using os.path keeps this independent of the path
# separator and of how deep data_path is.
topics = [os.path.basename(os.path.dirname(path)) for path in all_files]

# df = pd.read_csv(os.path.join(data_path, "sts-train.csv"))

In [None]:
# Hold out 10% of the articles (together with their topic labels) for testing.
# The fixed random_state makes the shuffle repeatable for a given input order.
train_corpus, test_corpus, train_topics, test_topics = train_test_split(
    corpus,
    topics,
    test_size=0.1,
    random_state=0,
)

# Preprocessing options, gathered in one place so they are easy to tweak.
prep_options = dict(
    stop_words=stopwords.words('english'),
    lowercase=True,
    strip_accents=True,
    strip_punctuation=punctuation,
    stemmer=PorterStemmer(),
    max_df=0.5,
    min_df=3,
)
prep = CorpusPreprocess(**prep_options)

# The preprocessor is fitted on the training split only and then re-applied
# as-is to the test split.
processed_train_corpus = prep.fit_transform(train_corpus, tokenize=False)
processed_test_corpus = prep.transform(test_corpus, tokenize=False)

In [None]:
# Setting 1 - Default Matching | tfidf model | No query expansion
tfidf = Tfidf()
match_op = Matching()
retrieval = Retrieval(matching=match_op, retrieval_model=tfidf)
retrieval.fit(processed_train_corpus)

# Run one query through the fitted pipeline. The query goes through the same
# preprocessor as the documents before it is handed to the retrieval model.
query = "American elections republicans"
processed_query = prep.transform([query], tokenize=False)[0]
idx = retrieval.query(processed_query, k=3)  # indices of the top 3 documents

# Map the returned indices back to the raw (unprocessed) training articles.
results = [train_corpus[doc_id] for doc_id in idx.tolist()]
report_template = "Most similar document to query: \"{}\"\n\n{}"
print(report_template.format(query, results[0]))

In [None]:
# Setting 2 - Default Matching | WordCentroid model | No query expansion
match_op = Matching()

# gensim's Word2Vec expects every sentence to be a list of tokens. Passing raw
# strings makes it iterate over characters and learn character embeddings, so
# the documents are split into tokens first.
# NOTE(review): assumes tokenize=False yields one whitespace-separated string
# per document -- confirm against CorpusPreprocess.
tokenized_train_corpus = [doc.split() for doc in processed_train_corpus]
model = Word2Vec(tokenized_train_corpus, min_count=1)

# Only the trained word vectors (model.wv) are handed to the retrieval model.
wcd = WordCentroidDistance(model.wv)
retrieval = Retrieval(retrieval_model=wcd, matching=match_op)
retrieval.fit(processed_train_corpus)

# Querying using the fitted Retrieval model; the query is preprocessed the
# same way as the documents.
query = "American elections republicans"
idx = retrieval.query(prep.transform([query], tokenize=False)[0], k=3)  # return top 3 documents
results = [train_corpus[i] for i in idx.tolist()]
print("Most similar document to query: \"{}\"\n\n{}".format(query, results[0]))

In [None]:
# Setting 3 - Default Matching | Doc2vec model | No query expansion
match_op = Matching()

# Load the previously trained Doc2Vec model from disk. No fresh Doc2Vec is
# constructed here: it would be discarded immediately, and the hyperparameters
# in effect are the ones stored in the saved model file.
# (To train from scratch instead, the notebook's earlier configuration was
# vector_size=40, min_count=2, epochs=200.)
model = Doc2Vec.load(os.path.join(models_path, "old_doc2vec_model.model"))  # loading pre-trained embeddings

# Documents are split on whitespace before being passed to the model.
doc2vec = Doc2VecInference(model=model, analyzer=lambda x: x.split())
retrieval = Retrieval(retrieval_model=doc2vec, matching=match_op)
retrieval.fit(processed_train_corpus)

# Querying using the fitted Retrieval model; the query is preprocessed the
# same way as the documents.
query = "American elections republicans"
idx = retrieval.query(prep.transform([query], tokenize=False)[0], k=3)  # return top 3 documents
results = [train_corpus[i] for i in idx.tolist()]
print("Most similar document to query: \"{}\"\n\n{}".format(query, results[0]))