In [1]:
import os

# Project root on the cluster; the relative paths used later in the notebook
# ('./inputs/...', './model') are resolved against this directory.
PROJECT_DIR = '/scratch/ss15592/NLP-Project'

# Pure-Python replacement for `!pwd` / `%cd`: a shell capture such as
# `path = !pwd` yields a list of output lines, so re-assigning it after the
# directory change left `path` as a list and broke the later
# os.path.join(path, ...) call. os.getcwd() always returns a plain string.
path = os.getcwd()

if path != PROJECT_DIR:
    os.chdir(PROJECT_DIR)
    path = os.getcwd()

print('Working Directory: ', path)

Working Directory:  /scratch/ss15592/NLP-Project


In [2]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
import pathlib, sys, os, glob, pickle, math, shutil
import nltk
import numpy as np
nltk.download('punkt')
import gensim
from operator import itemgetter
import re
# Name of the BEIR dataset to fetch; it is substituted into the download URL below.
dataset = "nfcorpus"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
# Local target directory handed to the download helper, located under the working directory `path`.
# NOTE(review): the folder name is hard-coded here and does not follow `dataset` — keep the two in sync.
out_dir = os.path.join(path, 'nfcorpus')

  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package punkt to /home/ss15592/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def create_working_directory():
    """Reset the notebook's working directories and return their paths.

    Every directory is removed together with its contents if it already
    exists, and is then created again empty.

    Returns:
        list[str]: The contents, tokens, centroids, doc-number and model
        directory paths, in that order.
    """
    working_dirs = [
        './inputs/contents/',
        './inputs/tokens/',
        './inputs/centroids/',
        './inputs/doc_num/',
        './model',
    ]

    for directory in working_dirs:
        # Drop whatever a previous run left behind before recreating the folder.
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.makedirs(directory)

    return working_dirs

In [4]:
def preprocessing(preprocess):
    """Download the corpus and write per-document content, id and token files.

    The working directories are always reset and the dataset is always
    downloaded/loaded (train split). When ``preprocess`` is truthy, every
    corpus entry whose text has at least 30 whitespace-separated words is
    assigned a running number ``n`` (starting at 1) and three files are written:

    * ``<contents dir>/n.txt``   - the raw document text,
    * ``<doc_num dir>/n.txt``    - the corpus key of the document,
    * ``<tokens dir>/n.tokens``  - the text split into sentences, lower-cased,
      stripped of punctuation, one sentence per line.

    Finally the number of documents written is printed.

    NOTE(review): the directories (including ``./model``) are wiped even when
    ``preprocess`` is falsy — confirm that this is intended.

    Args:
        preprocess: If truthy, write the per-document files; otherwise only
            reset the directories and load the dataset.
    """
    PATH = create_working_directory()
    data_path = util.download_and_unzip(url, out_dir)
    corpus, queries, qrels = GenericDataLoader(data_path).load(split="train")

    count = 0
    if preprocess:
        CONTENT_PATH = PATH[0]
        TOKENS_PATH = PATH[1]
        DOC_NUMBER_PATH = PATH[3]

        for doc_num, document in corpus.items():
            content = document['text']

            # Skip very short documents (fewer than 30 whitespace-separated words).
            if len(content.split()) < 30:
                continue

            count += 1
            content_name = CONTENT_PATH + str(count) + '.txt'
            token_name = TOKENS_PATH + str(count) + '.tokens'
            doc_name = DOC_NUMBER_PATH + str(count) + '.txt'

            # `with` guarantees the handles are closed even if a write fails.
            with open(content_name, 'w', encoding="utf-8") as cf:
                cf.write(content)

            with open(doc_name, 'w', encoding="utf-8") as df:
                df.write(doc_num)

            with open(token_name, 'w', encoding="utf-8") as tf:
                for sentence in nltk.sent_tokenize(content):
                    sentence = re.sub(r'[^\w\s]', '', sentence.lower())
                    # One sentence per line: without the separator the last word of
                    # a sentence is glued to the first word of the next one, and the
                    # line-based reader used for training sees one giant sentence.
                    tf.write(sentence + '\n')

    print("Total Document count is: ", count)

## **Train Word2Vec model**

In [5]:
def Train(train, path):
    """Train a Word2Vec model on the token files and save it under ``./model``.

    Args:
        train: When falsy nothing is trained and ``None`` is returned.
        path: Location of the line-based sentence files to train on.

    Returns:
        The trained model, or ``None`` when ``train`` is falsy.
    """
    if not train:
        return None

    sentences = gensim.models.word2vec.PathLineSentences(path)
    w2v = gensim.models.Word2Vec(
        sentences,
        vector_size=200,
        min_count=1,
        workers=6,
    )

    # Persist both the full model and the vectors in binary word2vec format.
    w2v.save('./model/w2v-lc.model')
    w2v.wv.save_word2vec_format('./model/w2v-lc.model.bin', binary=True)

    return w2v

In [6]:
def get_embedding(x, vector_size, model, out=False):
    """Return the embedding of token ``x``, or a zero vector if it is unknown.

    Args:
        x: Token to look up.
        vector_size: Length of the zero vector returned for unknown tokens.
        model: Object exposing ``wv`` (membership test, item lookup and
            ``key_to_index``) and, when ``out`` is truthy, ``syn1neg``.
        out: If truthy, return the token's row of ``model.syn1neg``;
            otherwise return ``model.wv[x]``.

    Returns:
        The selected embedding row for a known token, else
        ``np.zeros(vector_size)``.

    NOTE(review): ``syn1neg`` is presumably only present on models trained
    with negative sampling — confirm before using ``out=True`` with other
    training setups.
    """
    if x not in model.wv:
        return np.zeros(vector_size)

    # Plain truthiness instead of `== True` / `== False`: a truthy non-bool flag
    # used to match neither comparison and silently produced a zero vector.
    if out:
        return model.syn1neg[model.wv.key_to_index[x]]

    return model.wv[x]

In [7]:
def centroid(model):
    """Compute and pickle the embedding centroids of every token file.

    For each ``./inputs/tokens/*.tokens`` file all tokens are collected, the
    mean of their input embeddings and the mean of their output embeddings
    (``get_embedding(..., out=True)``) are computed, and
    ``{token_file_path: (centroid_in, centroid_out)}`` is pickled to
    ``./inputs/centroids/<same base name>.p``.

    Tokens unknown to the model contribute zero vectors (see ``get_embedding``).
    A file without any tokens gets zero-vector centroids.

    Args:
        model: Trained model providing ``vector_size`` and the attributes
            required by ``get_embedding``.
    """
    for fname in glob.iglob('./inputs/tokens/*.tokens', recursive=False):
        document = []
        for line in gensim.models.word2vec.PathLineSentences(fname):
            document += line

        if document:
            centroid_in = np.mean(
                np.array([get_embedding(x, model.vector_size, model) for x in document]),
                axis=0)
            centroid_out = np.mean(
                np.array([get_embedding(x, model.vector_size, model, out=True) for x in document]),
                axis=0)
        else:
            # np.mean over an empty array gives a scalar NaN (plus a RuntimeWarning)
            # rather than a vector; use explicit zero vectors of the right size.
            centroid_in = np.zeros(model.vector_size)
            centroid_out = np.zeros(model.vector_size)

        out_dict = {fname: (centroid_in, centroid_out)}
        pickle_file = './inputs/centroids/' + os.path.basename(fname).replace('.tokens', '.p')
        # Context manager so the handle is flushed and closed for every file.
        with open(pickle_file, "wb") as handle:
            pickle.dump(out_dict, handle)

In [8]:
# Reset ./inputs/* and ./model, download the corpus and write the per-document
# content, document-id and token files.
preprocessing(True)

  0%|          | 0/3633 [00:00<?, ?it/s]

Total Document count is:  3624


In [9]:
# Train Word2Vec on the token files written above; the model is also saved under ./model/.
model = Train(True, path='./inputs/tokens/')

In [10]:
# Reload the model saved by Train() (presumably so this cell can be run without
# re-training), then write one centroid pickle per token file.
model = gensim.models.Word2Vec.load("./model/w2v-lc.model")
centroid(model)