# Textual Document Vectors

In [1]:
import gensim as gs
import os
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

## Get all the files in a directory

In [2]:
def get_files(directory):
    """Collect the path of every file found beneath ``directory``.

    The tree is walked bottom-up (``topdown=False``), so files in
    sub-directories are listed before files in their parent directory.

    Parameters
    ----------
    directory : str
        Root of the tree to walk.

    Returns
    -------
    list of str
        One ``os.path.join(root, name)`` entry per file; empty when the
        walk yields nothing.
    """
    return [
        os.path.join(parent, entry)
        for parent, _subdirs, entries in os.walk(directory, topdown=False)
        for entry in entries
    ]

## Process the File

In [3]:
def process_file(filename):
    """Read a text file and return its lower-cased, stop-word-free tokens.

    The file's bytes are decoded with the ``unicode_escape`` codec, reduced
    to ASCII (non-ASCII characters are dropped), split into runs of word
    characters, lower-cased, and filtered against NLTK's English stop-word
    list plus a few extra words.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of str or None
        The surviving tokens, or ``None`` when the file cannot be read or
        decoded (callers treat ``None`` as "failed to load").

    Notes
    -----
    Only I/O and decoding errors are turned into ``None``. Other problems,
    such as the NLTK stop-word corpus not being installed, propagate so
    they are not mistaken for a bad input file.
    """
    try:
        # Binary mode: .decode() then works on Python 2 `str` and Python 3
        # `bytes` alike, and `with` closes the handle even if read() raises.
        with open(filename, "rb") as handle:
            raw = handle.read()
        text = (raw.decode('unicode_escape')
                   .encode('ascii', 'ignore')
                   .decode('ascii'))
    except (IOError, OSError, UnicodeError):
        return None

    # A set gives O(1) membership tests for the filter below.
    excluded = set(stopwords.words('english'))
    excluded.update(['i', 'and', 'the', 'a', 'an'])

    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Lower-case BEFORE filtering: the stop list is lower-case, so filtering
    # first would let capitalised stop words ("The", "And") slip through.
    lowered = (token.lower() for token in tokens)
    return [word for word in lowered if word not in excluded]

## Load the documents into a suitable format

In [4]:
def load_documents(directory):
    """Build the Doc2Vec training corpus from every file under ``directory``.

    Each file is tokenised with ``process_file`` and wrapped in a
    ``TaggedDocument`` whose single tag is the file's path. Files for which
    ``process_file`` returns ``None`` are skipped and reported on stdout.

    Parameters
    ----------
    directory : str
        Root directory containing the text files.

    Returns
    -------
    list of gensim.models.doc2vec.TaggedDocument
        One entry per successfully processed file.
    """
    tagged = []
    for path in get_files(directory):
        tokens = process_file(path)
        if tokens is None:
            print(path+" failed to load")
            continue
        tagged.append(
            gs.models.doc2vec.TaggedDocument(words=tokens, tags=[path]))
    return tagged

## Train the model on these documents

In [5]:
def train_model(directory):
    """Train a Doc2Vec model on all documents found under ``directory``.

    Parameters
    ----------
    directory : str
        Root directory of the training corpus.

    Returns
    -------
    gensim.models.doc2vec.Doc2Vec
        Model built from the tagged documents with the fixed
        hyper-parameters listed below.
    """
    corpus = load_documents(directory)
    # Fixed hyper-parameters, passed straight through to the constructor.
    hyperparams = dict(size=100, window=8, min_count=5, workers=4)
    return gs.models.doc2vec.Doc2Vec(corpus, **hyperparams)

In [6]:
# Train on every loadable file under ./masc_500k_texts. The resulting `model`
# is a notebook-level global that most_similar_docs reads.
model=train_model("./masc_500k_texts")

./masc_500k_texts/written/email/174124.txt~ failed to load
./masc_500k_texts/written/email/175841.txt~ failed to load
./masc_500k_texts/written/email/173906.txt~ failed to load
./masc_500k_texts/written/email/175816.txt~ failed to load
./masc_500k_texts/written/email/234267.txt~ failed to load
./masc_500k_texts/written/email/175448.txt~ failed to load
./masc_500k_texts/written/email/173252.txt~ failed to load
./masc_500k_texts/written/email/176581.txt~ failed to load


## Get the most similar documents in the corpus trained on

In [7]:
def most_similar_docs(filename):
    """Print the training documents closest to ``filename``.

    Uses the notebook-level ``model`` global. ``filename`` is passed to
    ``model.docvecs.most_similar`` as the document tag to look up.

    Parameters
    ----------
    filename : str
        Tag of the query document.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    print("Name:%s\n\nNearest documents:\n"%(filename))
    neighbours = model.docvecs.most_similar(filename)
    # Each result is unpacked as a (tag, similarity) pair.
    for tag, score in neighbours:
        print("Name:%s\nSimilarity:%s\n"%(tag, score))

In [8]:
# The argument is looked up as a document tag; load_documents tags each
# document with its file path exactly as get_files produced it.
most_similar_docs("./masc_500k_texts/written/newspaper:newswire/20000410_nyt-NEW.txt")

Name:./masc_500k_texts/written/newspaper:newswire/20000410_nyt-NEW.txt

Nearest documents:

Name:./masc_500k_texts/written/newspaper:newswire/20000419_apw_eng-NEW.txt
Similarity:0.843858599663

Name:./masc_500k_texts/written/newspaper:newswire/20000415_apw_eng-NEW.txt
Similarity:0.820624470711

Name:./masc_500k_texts/written/newspaper:newswire/wsj_1640.mrg-NEW.txt
Similarity:0.81544983387

Name:./masc_500k_texts/written/newspaper:newswire/wsj_2465.txt
Similarity:0.795952379704

Name:./masc_500k_texts/written/newspaper:newswire/20000424_nyt-NEW.txt
Similarity:0.788309693336

Name:./masc_500k_texts/written/govt-docs/chapter-10.txt
Similarity:0.787995398045

Name:./masc_500k_texts/written/newspaper:newswire/NYTnewswire3.txt
Similarity:0.754826664925

Name:./masc_500k_texts/written/newspaper:newswire/wsj_0027.txt
Similarity:0.702040672302

Name:./masc_500k_texts/written/newspaper:newswire/NYTnewswire7.txt
Similarity:0.687466025352

Name:./masc_500k_texts/written/newspaper:newswire/20020731

## Infer trained vectors on an external corpus

In [9]:
def infer_text_vectors(directory,model):
    """Infer a document vector for every file under ``directory``.

    Each file is tokenised with ``process_file``, the same preprocessing
    used to build the training corpus, and the token list is handed to
    ``model.infer_vector``. Passing the raw file contents instead would make
    the model iterate over single characters rather than words.

    Parameters
    ----------
    directory : str
        Root directory of the external corpus.
    model : gensim.models.doc2vec.Doc2Vec
        A trained model used for inference.

    Returns
    -------
    dict
        Maps each file path to its inferred vector. Files for which
        ``process_file`` returns ``None`` are reported on stdout and left
        out of the result.
    """
    text_dict = {}
    for filename in get_files(directory):
        # process_file opens and closes the file itself, so no handle is
        # left open here.
        word_list = process_file(filename)
        if word_list is None:
            print(filename+" failed to load")
            continue
        text_dict[filename] = model.infer_vector(word_list)
    return text_dict

## Find nearest document in a corpus characterised by a dictionary[document,vector]

In [10]:
def find_nearest_document(vector_dict,string,model):
    """Find the document whose vector is closest to that of ``string``.

    Parameters
    ----------
    vector_dict : dict
        Maps a document name to its vector (for example the output of
        ``infer_text_vectors``).
    string : str or list of str
        The query text. A string is split into lower-cased word tokens with
        English stop words removed before inference; a list or tuple is
        treated as already tokenised and used as is.
    model : gensim.models.doc2vec.Doc2Vec
        A trained model used to infer the query vector.

    Returns
    -------
    tuple
        ``(distance, name)`` of the nearest document by Euclidean norm, or
        ``(inf, "")`` when ``vector_dict`` is empty. The same pair is also
        printed, as before.
    """
    if isinstance(string, (list, tuple)):
        words = list(string)
    else:
        # infer_vector expects a list of word tokens; a raw string would be
        # iterated character by character.
        excluded = set(stopwords.words('english'))
        words = [token
                 for token in RegexpTokenizer(r'\w+').tokenize(string.lower())
                 if token not in excluded]
    query = model.infer_vector(words)

    # Start from infinity so any real distance, however large, wins.
    min_norm = float('inf')
    nearest_file = ""
    for name, vector in vector_dict.items():
        distance = np.linalg.norm(vector - query)
        if distance < min_norm:
            min_norm = distance
            nearest_file = name
    print(min_norm,nearest_file)
    return min_norm, nearest_file

### From here:
#### 1. Given a training corpus and an external corpus, a model can be trained on the training corpus and used to infer the document vectors of the external corpus (keeping word vector representations the same), thus allowing us to cluster documents based on vector norms
#### 2. We can also train another model on the ekstep corpus and use it to find the nearest document in that corpus as well