# Language

## Preprocessing & Corpus Stats

In [None]:
import pandas as pd
import re
import collections
import csv

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
def preprocess(self, doc):
    """Normalize ``doc`` to lowercase letter-only tokens without stop words.

    Every run of characters outside ``A-Za-z`` is replaced by a single
    space, the text is lowercased and split on whitespace, and tokens that
    are NLTK English stop words or only one character long are dropped.

    Args:
        self: Unused; kept so existing callers keep working.
        doc: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Imported locally because the import cell preceding this definition
    # does not bring ``stopwords`` into scope.
    from nltk.corpus import stopwords

    # A set makes each membership test O(1) instead of scanning a list.
    stop_words = set(stopwords.words('english'))
    tokens = re.sub('[^A-Za-z]+', ' ', doc).lower().split()
    return ' '.join(
        token for token in tokens
        if token not in stop_words and len(token) > 1
    )

In [None]:
def get_unique_tokens(self, file):
    """Return the distinct lowercase alphabetic tokens found in ``file``.

    Tokens are extracted with the same letters-only pattern that
    ``print_corpus_statistics`` uses for counting, so the vocabulary and
    the frequency table are built from the same notion of a word.

    Args:
        self: Unused; kept so existing callers keep working.
        file: Path of a UTF-8 text file.

    Returns:
        A list of unique tokens in order of first appearance.
    """
    seen = set()  # O(1) duplicate check; the list only preserves order
    vocab_tokens = []

    # The context manager closes the handle even if reading fails.
    with open(file, "r", encoding="utf8") as input_file:
        for line in input_file:
            for match in re.findall(r'\b[^\W\d_]+\b', line):
                token = match.lower()
                if token not in seen:
                    seen.add(token)
                    vocab_tokens.append(token)

    return vocab_tokens

In [None]:
def get_most_common(self, content, top_n=20):
    """Return the most frequent tokens across ``content``.

    The strings in ``content`` are joined with spaces, lowercased and split
    into lines; each line is passed through ``preprocess_document`` and the
    resulting tokens are counted.

    Args:
        self: Unused; kept so existing callers keep working.
        content: Iterable of strings.
        top_n: Number of entries to return (defaults to the previous
            hard-coded value of 20).

    Returns:
        A list of ``(token, count)`` pairs, most frequent first.
    """
    # NOTE(review): ``preprocess_document`` is not defined in the visible
    # part of the file; confirm it exists and what it returns.
    counts = collections.Counter()
    for line in ' '.join(content).lower().splitlines():
        processed = preprocess_document(line)
        # A helper that returns one cleaned string must be split into
        # tokens; iterating the string directly would count characters.
        tokens = processed.split() if isinstance(processed, str) else processed
        counts.update(tokens)

    return counts.most_common(top_n)

In [None]:
def print_corpus_statistics(self, file, most_common, preprocessed= False):
    """Print frequency statistics for ``file`` and return its extreme tokens.

    Args:
        self: Object providing ``get_unique_tokens`` and
            ``get_unique_tokens_preprocessed``.
        file: Path of a UTF-8 text file.
        most_common: How many of the most frequent words to print.
        preprocessed: Selects which of the two vocabulary helpers is used
            to compute the printed vocabulary size.

    Returns:
        A pair ``(tokens seen more than 1000 times, tokens seen once)``.
    """
    # ref. http://stackoverflow.com/questions/25985299/create-python-dictionary-from-text-file-and-retrieve-count-of-each-word
    c = collections.Counter()
    with open(file, "r", encoding="utf8") as handle:
        for text_line in handle:
            c.update(
                token.lower()
                for token in re.findall(r'\b[^\W\d_]+\b', text_line))

    vocabulary = (
        self.get_unique_tokens_preprocessed(file)
        if preprocessed
        else self.get_unique_tokens(file)
    )

    print ("Total word occurences: %d" % sum(c.values()))
    print ("Vocabulary size: %d" % len(vocabulary))

    print ('Most common words:')
    for token, frequency in c.most_common(most_common):
        print ('%s: %7d' % (token, frequency))

    # Walk the full frequency ranking so both lists keep that order.
    ranking = c.most_common(len(c))
    tokens_occuring_1000_plus = [token for token, frequency in ranking if frequency > 1000]
    tokens_occuring_once = [token for token, frequency in ranking if frequency == 1]

    print ("There are %d words occuring > 1000 times" % len(tokens_occuring_1000_plus))
    print ("There are %d words occuring once" % len(tokens_occuring_once))

    return tokens_occuring_1000_plus, tokens_occuring_once

In [None]:
def get_preprocessed_docs(self, file, keyword, keep_rare_words = True):
    """Read all lines from the open ``file`` and hand them to ``parse_docs``.

    Args:
        self: Object providing ``parse_docs``.
        file: An open file-like object supporting ``readlines()``.
        keyword: Forwarded to ``parse_docs``.
        keep_rare_words: Forwarded to ``parse_docs``.

    Returns:
        Whatever ``parse_docs`` returns for the file's lines.
    """
    raw_lines = file.readlines()
    return self.parse_docs(raw_lines, keyword, keep_rare_words)

def parse_docs(self, lines, keyword, keep_rare_words):
    """Preprocess the text of every record whose id contains ``keyword``.

    Each line is expected to hold three tab-separated fields: document id,
    URL and text. A line with fewer fields raises ``ValueError``.

    Args:
        self: Object providing ``preprocess_line``.
        lines: Iterable of tab-separated record strings.
        keyword: Substring a document id must contain to be kept.
        keep_rare_words: Forwarded to ``preprocess_line``.

    Returns:
        A list of ``(docid, preprocessed_text)`` tuples in input order.
    """
    parsed = []
    for record in lines:
        # The URL field is parsed only to reach the text; it is not used.
        docid, _url, text = record.split('\t', 2)
        if keyword not in docid:
            continue
        parsed.append((docid, self.preprocess_line(text, keep_rare_words)))
    return parsed

def preprocess_line(self, line, keep_rare_words = True, use_stemming = True):
    """Clean one line of text into a space-joined string of kept tokens.

    The line is stripped of punctuation, split on whitespace and
    lowercased. Tokens that are stop words or are rejected by
    ``has_more_digits`` are dropped; the rest are optionally stemmed and
    optionally filtered against ``rare_words``. The result is finally
    passed through ``clean_digits``.

    Args:
        self: Object providing ``remove_punctuation``, ``stop_words``,
            ``has_more_digits``, ``ps`` (a stemmer with ``stem``),
            ``rare_words`` and ``clean_digits``.
        line: Raw text.
        keep_rare_words: When False, tokens found in ``self.rare_words``
            (compared after stemming) are dropped.
        use_stemming: When True, each kept token is stemmed.

    Returns:
        The value of ``self.clean_digits`` for the joined tokens.
    """
    line = self.remove_punctuation(line)
    words_to_keep = []
    for word in line.split():
        word = word.lower().strip()
        if word in self.stop_words or self.has_more_digits(word):
            continue
        if use_stemming:
            word = self.ps.stem(word)
        if keep_rare_words or word not in self.rare_words:
            words_to_keep.append(word)

    # Joined once after the loop: the result used to be assigned inside the
    # loop body, so a line without any tokens left it undefined.
    return self.clean_digits(' '.join(words_to_keep))

In [None]:
def clean_digits(self, line):
    """Keep only the words of ``line`` that are mostly non-numeric.

    Within a word, digits, ``-`` and ``/`` count as numeric-like
    characters. A word is kept when its other characters strictly
    outnumber the numeric-like ones.

    Args:
        self: Unused.
        line: Whitespace-separated text.

    Returns:
        The kept words joined by single spaces.
    """
    def is_mostly_letters(token):
        numeric_like = sum(
            1 for ch in token if ch.isdigit() or ch in ('-', '/')
        )
        return len(token) - numeric_like > numeric_like

    return ' '.join(token for token in line.split() if is_mostly_letters(token))


def remove_punctuation(self, txt):
    """Replace ``-`` and ``/`` with spaces, then delete other punctuation.

    Args:
        self: Unused.
        txt: Input text.

    Returns:
        ``txt`` with hyphens and slashes turned into spaces and every
        remaining character other than ASCII letters, digits and
        whitespace removed.
    """
    txt = txt.replace('-', ' ').replace('/', ' ')
    # Raw string: ``\s`` in a normal string literal is an invalid escape
    # sequence that newer Python versions warn about.
    return re.sub(r'[^A-Za-z0-9\s]+', '', txt)

def has_more_digits(self, txt):
    """Tell whether ``txt`` contains more than one digit character.

    Args:
        self: Unused.
        txt: String to inspect.

    Returns:
        True when at least two characters of ``txt`` are digits.
    """
    digit_total = sum(1 for ch in txt if ch.isdigit())
    return digit_total > 1

# Gensim

In [None]:
import sys
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import nltk
import re
from tqdm import tqdm
import pandas as pd
from gensim.models import Word2Vec
import gensim
from sklearn.decomposition import PCA
from matplotlib import pyplot

import gensim
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import numpy
from random import shuffle

import timeit
from sklearn.cluster import KMeans
from sklearn import metrics

import timeit
from collections import defaultdict
import pickle

### Word2Vec

In [None]:
def parse_sentence_words(input_file_names):
    """Turn every line of the given files into a list of tokens.

    Each line is stripped, lowercased, cleaned by ``get_words`` and split
    by ``tokenize``. Lines that yield fewer than two tokens are skipped.
    The name of each file is printed as it is processed.

    Args:
        input_file_names: Iterable of paths to UTF-8 text files.

    Returns:
        A list with one token list per kept line, in reading order.
    """
    sentence_words = []
    for file_name in input_file_names:
        print(file_name)
        # The context manager closes each file; they were previously left
        # open until garbage collection.
        with open(file_name, encoding="utf8") as handle:
            for line in handle:
                sent_words = tokenize(get_words(line.strip().lower()))
                if len(sent_words) > 1:
                    sentence_words.append(sent_words)
    return sentence_words

def tokenize(sent):
    """Split ``sent`` into word tokens and punctuation tokens.

    The text is split on runs of non-word characters; because the
    separator is captured, the separators are returned too. Every piece is
    stripped and pieces that are empty after stripping (pure whitespace)
    are discarded.

    Args:
        sent: Text to split.

    Returns:
        A list of non-empty, stripped tokens in order.
    """
    # The separator group must not be optional: a pattern that can match
    # the empty string makes re.split (Python 3.7+) split at every position
    # and emit None for the unmatched group, which then fails on .strip().
    return [piece.strip() for piece in re.split(r'(\W+)', sent) if piece.strip()]

def get_words(line):
    """Drop stop words, punctuation and purely numeric tokens from ``line``.

    Whitespace-separated words found in the module-level
    ``cachedStopWords`` are removed first. The remaining words lose every
    character that is neither a word character nor whitespace, and words
    that end up empty or consist only of digits are discarded.

    Args:
        line: Text to clean.

    Returns:
        The kept words joined by single spaces.
    """
    kept = []
    for word in line.split():
        if word in cachedStopWords:
            continue
        cleaned = re.sub(r'[^\w\s]','',word)
        # An emptied word vanished in the original join/split round trip.
        if cleaned and not cleaned.isdigit():
            kept.append(cleaned)
    return ' '.join(kept)

In [None]:
# Build the training corpus for Word2Vec from the input files.
# NOTE(review): `input_file_names` is not defined in the visible part of the
# notebook; presumably a list of corpus file paths. Confirm before running.
sentences = parse_sentence_words(input_file_names) # should be an array of arrays, where each inner array is the sentence tokens

In [None]:
# Create the Word2Vec model from the tokenized sentences.
# NOTE(review): `size` looks like the pre-4.0 gensim keyword (later renamed
# `vector_size`); confirm against the installed gensim version.
# NOTE(review): passing `sentences` to the constructor presumably already
# builds the vocabulary and trains; verify in the gensim documentation.
model = gensim.models.Word2Vec(
        sentences,
        size=150,
        window=10,
        min_count=2,
        workers=10)

In [None]:
# Run 10 training epochs over the same sentences.
# NOTE(review): if the constructor above already trained the model, this is
# an additional training pass; confirm that is intended.
model.train(sentences, total_examples=len(sentences), epochs=10)

In [None]:
# Sanity check: list the words closest to 'search' in the trained vectors.
model.wv.most_similar(positive='search')

In [None]:
# Persist the trained model to the current working directory.
model.save('model_word2vec.bin')

### Doc2Vec

In [None]:
class Doc2VecTrainer():
    """Train a gensim Doc2Vec model on a tab-separated corpus file.

    The file is read with pandas and must provide an ``id`` column (used
    as the document tag) and a ``text`` column (split on whitespace to
    obtain the training tokens).
    """

    def __init__(self, file_to_data):
        """Read the corpus at ``file_to_data`` and extract ids and texts."""
        self.df_corpus = pd.read_csv(file_to_data, delimiter='\t')
        self.load_data()

    def load_data(self):
        """Copy the ``id`` and ``text`` columns into plain Python lists."""
        # Column-wise extraction replaces a row-by-row ``iterrows`` loop.
        self.docLabels = self.df_corpus['id'].tolist()
        self.data = self.df_corpus['text'].tolist()

    def init_model(self, vec_size, window):
        """Create the Doc2Vec model and build its vocabulary.

        Args:
            vec_size: Dimensionality of the document vectors.
            window: Context window size.
        """
        # Imported locally: ``mp`` was referenced here but never imported.
        import multiprocessing as mp

        self.it = self.LabeledLineSentence(self.data, self.docLabels)
        self.model = gensim.models.Doc2Vec(vector_size=vec_size, window=window, min_count=5, workers=mp.cpu_count(), alpha=0.025,
                                      min_alpha=0.025)  # use fixed learning rate
        self.model.build_vocab(self.it.to_array())

    def train_model(self, number_epochs):
        """Train for ``number_epochs``, save to ``./doc2vec_model.d2v`` and return the model.

        ``init_model`` must have been called first.
        """
        self.model.train(self.it, epochs=number_epochs, total_examples=self.model.corpus_count)
        self.model.save('./doc2vec_model.d2v')
        return self.model

    class LabeledLineSentence(object):
        """Re-iterable view of documents as ``TaggedDocument`` objects."""

        def __init__(self, doc_list, labels_list):
            self.labels_list = labels_list
            self.doc_list = doc_list
            # Filled by to_array(); None until then.
            self.sentences = None

        def __iter__(self):
            """Yield one ``TaggedDocument`` per document, tagged with its label."""
            for idx, doc in enumerate(self.doc_list):
                yield TaggedDocument(doc.split(), [self.labels_list[idx]])

        def to_array(self):
            """Materialize, store and return all tagged documents as a list."""
            self.sentences = list(self)
            return self.sentences

        def sentences_perm(self):
            """Shuffle the materialized documents in place and return them."""
            # Build the list on demand so this no longer fails with an
            # AttributeError when called before to_array().
            if self.sentences is None:
                self.to_array()
            shuffle(self.sentences)
            return self.sentences

In [None]:
class Doc2VecSearch():
    """Query a saved Doc2Vec model and decorate hits with document metadata.

    ``docs_dict`` maps a document id to a sequence whose first element is
    an indexable entry; the result helpers read ``entry[0]`` as the URL
    and ``entry[1]`` as the keywords.
    """

    def __init__(self, path_to_model, file_to_dict):
        """Load the model; remember where the metadata pickle lives.

        Args:
            path_to_model: Path of the saved Doc2Vec model.
            file_to_dict: Path of the pickled ``docs_dict``. It is opened
                lazily, only when a result helper first needs it.
        """
        self.model = Doc2Vec.load(path_to_model)
        self.file_to_dict = file_to_dict
        self._docs_dict = None

    @property
    def docs_dict(self):
        """Document metadata, loaded from ``file_to_dict`` on first access.

        The attribute used to be read without ever being assigned (its
        loading code was commented out), so the result helpers failed with
        an AttributeError.
        """
        if getattr(self, '_docs_dict', None) is None:
            # NOTE: pickle can execute arbitrary code while loading; only
            # point this at files from a trusted source.
            with open(self.file_to_dict, 'rb') as handle:
                self._docs_dict = pickle.load(handle)
        return self._docs_dict

    @docs_dict.setter
    def docs_dict(self, value):
        # Lets callers inject the mapping directly instead of loading it.
        self._docs_dict = value

    def get_most_similar_terms(self, term, k):
        """Return the ``k`` terms most similar to ``term`` in the model."""
        return self.model.wv.most_similar(term, topn=k)

    def _iter_results(self, sim_docs):
        """Yield ``(doc_id, score, entry)`` for every hit in ``sim_docs``."""
        for doc in sim_docs:
            doc_id = doc[0]
            entry = self.docs_dict.get(doc_id)[0]
            yield doc_id, doc[1], entry

    def get_results_as_df(self, sim_docs):
        """Return the hits as a DataFrame.

        Columns are ``doc_id``, ``confidence``, ``userurl`` and ``keywords``.
        """
        columns = ['doc_id', 'confidence', 'userurl', 'keywords']
        # Rows are collected first and the frame is built once:
        # DataFrame.append is gone from current pandas and copied the
        # whole frame on every call.
        rows = [
            {'doc_id': doc_id, 'confidence': score, 'userurl': entry[0], 'keywords': entry[1]}
            for doc_id, score, entry in self._iter_results(sim_docs)
        ]
        return pd.DataFrame(rows, columns=columns)

    def get_results_as_array(self, sim_docs):
        """Return the hits as ``(doc_id, score, url, keywords)`` tuples."""
        return [
            (doc_id, score, entry[0], entry[1])
            for doc_id, score, entry in self._iter_results(sim_docs)
        ]

    def search(self, query, k):
        """Infer a vector for ``query`` and return the ``k`` closest documents."""
        query_vec = self.model.infer_vector(query.split())
        sim_docs = self.model.docvecs.most_similar([query_vec], topn=k)
        return sim_docs

In [None]:
# Clustering, ref. https://www.kaggle.com/sgunjan05/document-clustering-using-doc2vec-word2vec/code
# Fit a 4-cluster k-means model on the document vectors and keep the cluster
# label of every document as a plain list.
# NOTE(review): `saved_model` is not defined in the visible part of the
# notebook; presumably a loaded Doc2Vec model. Confirm before running.
kmeans_model = KMeans(n_clusters=4, init='k-means++', max_iter=100)  
X = kmeans_model.fit(saved_model.docvecs.vectors_docs)
labels=kmeans_model.labels_.tolist()

In [None]:
# Refit k-means on the same vectors and keep the predicted labels in `l`,
# then fit a 2-component PCA and project the document vectors with it.
# NOTE(review): this refits the model that was already fitted in the
# previous cell, so `l` may differ from `labels`; confirm that is intended.
l = kmeans_model.fit_predict(saved_model.docvecs.vectors_docs)
pca = PCA(n_components=2).fit(saved_model.docvecs.vectors_docs)
datapoint = pca.transform(saved_model.docvecs.vectors_docs)

In [None]:
# Project the k-means cluster centres with the fitted PCA and plot them as
# black triangles.
centroids = kmeans_model.cluster_centers_
centroidpoint = pca.transform(centroids)
# The import cell binds matplotlib's plotting module as `pyplot`
# (`from matplotlib import pyplot`); the alias `plt` was never defined.
pyplot.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s = 150, c='#000000')
pyplot.show()

In [None]:
def assign_to_cluster(labels, docs):
    """Group ``docs`` by the cluster label at the same position.

    Args:
        labels: Sequence of cluster labels.
        docs: Sequence of documents, parallel to ``labels``.

    Returns:
        A ``defaultdict(list)`` mapping each label to its documents in
        input order. When the two sequences differ in length a message is
        printed and the mapping is returned empty.
    """
    grouped = defaultdict(list)

    if len(labels) == len(docs):
        for label, doc in zip(labels, docs):
            grouped[label].append(doc)
    else:
        print ('Number of labels must be equal to the number of documents.')

    return grouped

In [None]:
def get_clusters_distribution(clusters):
    """Print the number of documents in each cluster and the overall total.

    Args:
        clusters: Mapping from cluster key to a sized collection of
            documents.
    """
    sizes = {cluster_id: len(members) for cluster_id, members in clusters.items()}
    for cluster_id, size in sizes.items():
        print('Cluster ', str(cluster_id), ' ', size)
    print ('Total docs count ', sum(sizes.values()))

In [None]:
# Group the documents by their k-means cluster label.
# NOTE(review): `data_prep` is not defined in the visible part of the
# notebook; presumably the documents in the same order as `labels`. Confirm.
clusters_dict = assign_to_cluster(labels, data_prep)

In [None]:
def get_most_frequen_words(clusters_dict):
    """Print the most common words of every cluster.

    Args:
        clusters_dict: Mapping from cluster key to the documents assigned
            to that cluster.
    """
    for key, value in clusters_dict.items():
        print('Cluster '+ str(key))
        # get_most_common is declared with a leading ``self`` parameter
        # that its body never reads, so a placeholder is passed for it;
        # calling it with the documents alone raises a TypeError.
        keywords = get_most_common(None, value)
        print(keywords)

# Ranking Results Evaluation

In [None]:
class Evaluation:
    """Ranking metrics (precision@k, average precision, nDCG) against a gold standard.

    ``gold_standard_dict`` maps a query id to a mapping from relevant
    document id to its relevancy score.
    """

    def __init__(self, path_to_gs):
        """Load the pickled gold standard found at ``path_to_gs``."""
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at files from a trusted source.
        with open(path_to_gs, "rb") as handle:
            self.gold_standard_dict = pickle.load(handle)

    def precision_evaluation(self, query_id, predicted_docs):
        """Return the fraction of ``predicted_docs`` listed for ``query_id``.

        This method was called by ``precision_at_k_evaluation`` but was
        missing from the class. A document counts as correct when its id
        is present in the gold standard for the query, the same criterion
        ``ape_evaluation`` uses. An empty prediction list scores 0.
        """
        if len(predicted_docs) == 0:
            return 0
        relevant = self.gold_standard_dict[query_id]
        hits = sum(1 for doc_id in predicted_docs if doc_id in relevant)
        return hits / len(predicted_docs)

    def precision_at_k_evaluation(self, query_id, predicted_docs, k):
        """Return the precision over the first ``k`` predicted documents.

        Args:
            query_id: Key into the gold standard.
            predicted_docs: Ranked sequence of document ids.
            k: Cut-off rank. A plain integer is accepted as well as the
                sequence form the original code indexed with ``k[0]``.
        """
        try:
            limit = k[0]
        except (TypeError, IndexError):  # k is a scalar, not a sequence
            limit = k

        # Slicing tolerates rankings shorter than k.
        top_k_docs = list(predicted_docs[:limit])
        return self.precision_evaluation(query_id, top_k_docs)

    def ape_evaluation(self, query_id, predicted_docs):
        """Return the average precision of ``predicted_docs`` for ``query_id``.

        The precision at each rank holding a relevant document is summed
        and divided by the number of relevant documents retrieved; 0 is
        returned when none was retrieved.
        """
        relevant = self.gold_standard_dict[query_id]
        sum_precisions = 0
        count_correct_predictions = 0

        # enumerate supplies the 1-based rank for lists and arrays alike.
        for rank, doc_id in enumerate(predicted_docs, start=1):
            if doc_id in relevant:
                count_correct_predictions += 1
                sum_precisions += count_correct_predictions / rank

        if count_correct_predictions == 0:
            return 0
        return sum_precisions / count_correct_predictions

    # nDCG idea: the input is the ranked documents for the query.
    #  - for each document, look up whether it is among the documents
    #    assigned as relevant to this query;
    #  - if so, take its relevancy score and discount it based on the rank.

    def compute_dcg_one_query(self, query_id, predicted_docs, best=False):
        """Return the discounted cumulative gain, rounded to two decimals.

        Args:
            query_id: Key into the gold standard.
            predicted_docs: Ranked document ids, or ``(doc_id, score)``
                pairs when ``best`` is True.
            best: When True the scores carried in ``predicted_docs`` are
                used instead of a gold-standard lookup.
        """
        if best:
            relevancies = (float(score) for _doc_id, score in predicted_docs)
        else:
            relevancies = (
                self.get_true_relevancy(query_id, doc_id)
                for doc_id in predicted_docs
            )

        dcg_accumulated = 0
        for rank, relevancy in enumerate(relevancies, start=1):
            # max(rank, 2) gives ranks 1 and 2 the same discount factor of 1.
            disc_factor = 1 / numpy.log2(max(rank, 2))
            dcg_accumulated += disc_factor * relevancy

        return float("{0:.2f}".format(dcg_accumulated))

    def get_true_relevancy(self, query_id, doc_id):
        """Return the gold-standard score of ``doc_id``, or 0.0 if unknown."""
        try:
            score = self.gold_standard_dict[query_id][doc_id]
        except KeyError:
            score = 0
        return float(score)

    def compute_best_dcg_one_query(self, query_id):
        """Return the DCG of the ideal ranking for ``query_id``."""
        true_docs = self.gold_standard_dict[query_id]
        true_docs_best = sorted(true_docs.items(), key=lambda item: item[1], reverse=True)
        return self.compute_dcg_one_query(query_id, true_docs_best, True)

    def ndcg_evaluation(self, query_id, predicted_docs):
        """Return DCG divided by the ideal DCG (divisor 1 when the ideal is 0)."""
        dcg = self.compute_dcg_one_query(query_id, predicted_docs)
        best_dcg = self.compute_best_dcg_one_query(query_id)

        if best_dcg == 0:
            best_dcg = 1

        return dcg / best_dcg

    def evaluate_map_and_ndcg(self, queries, search, nr_docs):
        """Evaluate ``search`` over all ``queries`` and print a summary.

        Args:
            queries: DataFrame with ``id`` and ``query`` columns.
            search: Object exposing ``retrieve(query_text, nr_docs)``.
            nr_docs: Number of documents requested per query.

        Returns:
            ``(mean average precision, mean nDCG, average retrieval time
            per query in seconds)``. All three are 0.0 when ``queries`` is
            empty. The third value used to be the time of the last query
            only; the average is the statistic this method computes and
            prints.
        """
        average_precision_GLOBAL = 0
        count = 0
        ndcg_GLOBAL = 0
        time_GLOBAL = 0

        for _index, row in queries.iterrows():
            q_id = row['id']  # id to lookup in the gold standard
            query_text = row['query']  # query to test the retrieval model against
            # timeit is already imported by the file; `time` was not.
            start_time = timeit.default_timer()
            predicted_docs = search.retrieve(query_text, nr_docs)
            time_GLOBAL += timeit.default_timer() - start_time
            average_precision_GLOBAL += self.ape_evaluation(q_id, predicted_docs)
            ndcg_GLOBAL += self.ndcg_evaluation(q_id, predicted_docs)
            count += 1
            if count % 500 == 0:
                print('Processed doc: ', count)

        if count == 0:
            # Nothing to average; avoid dividing by zero.
            return 0.0, 0.0, 0.0

        mean_ap = average_precision_GLOBAL / count
        mean_ndcg = ndcg_GLOBAL / count
        avg_time = time_GLOBAL / count

        print("Mean Average Precision = {}".format(mean_ap))
        print("NDCG = {}".format(mean_ndcg ))
        print("Time total= {}".format(time_GLOBAL))
        print("Time AVG= {}".format(avg_time))

        return mean_ap, mean_ndcg, avg_time