In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import string
import re

In [2]:
# Fetch the NLTK resources this notebook asks for: the stopword corpus
# (read via nltk.corpus.stopwords in gen_token) and the punkt tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /nfshome/tn1050/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /nfshome/tn1050/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [86]:
# Shell escape: list whatever matches data/train_test/files/text*subset.
!ls data/train_test/files/text*subset

1938.txt 1940.txt 348.txt  381.txt  385.txt  485.txt
1939.txt 347.txt  349.txt  384.txt  483.txt  486.txt


### Word2Vec

In [207]:
def gen_token(file, word2vec_model, manual_stopwords = ['www', 'org', 'vol']):
    '''
    Read a text file and return its cleaned, stemmed tokens that are present
    in the word2vec vocabulary.

    INPUT:
    file - name of a text file; it is appended to the module-level `file_path`,
           which must be defined before this function is called
    word2vec_model - word-vector model whose vocabulary is used to filter tokens
    manual_stopwords - additional stopwords, set manually (never mutated here)

    OUTPUT:
    Preprocessed and tokenized text (list of tokens)
    '''
    # Imported locally because no visible notebook cell imports PorterStemmer.
    from nltk.stem import PorterStemmer

    #read text file; the context manager guarantees the handle is closed
    with open(file_path+file, 'r') as raw:
        # re-join words hyphenated across line breaks, then flatten newlines
        text = raw.read().replace("-\n", "").replace("\n", " ")

    #lower all letters
    lowers = text.lower()

    #remove all numbers
    no_num = re.sub(r'\d+', '', lowers)

    #tokenize text
    tokenizer = RegexpTokenizer(r'\w+')
    text_tokens = tokenizer.tokenize(no_num)

    #remove stopwords and tokens of 2 characters or fewer
    # Build the stopword set once instead of rebuilding the list per token.
    stop_set = set(stopwords.words('english')).union(manual_stopwords)
    filtered = [w for w in text_tokens if w not in stop_set and len(w) > 2]

    #stemming
    # NOTE(review): stem_tokens is not defined in the visible cells; it must
    # already exist in the kernel - confirm.
    stemmer = PorterStemmer()
    stemmed = stem_tokens(filtered, stemmer)

    #removing words not in word2vec
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
    # expose it as `vocab`. Support both.
    # NOTE(review): the lookup runs on stems, which may not match the surface
    # forms stored in the model - confirm stemming before lookup is intended.
    vocab = getattr(word2vec_model, 'key_to_index', None)
    if vocab is None:
        vocab = word2vec_model.vocab
    output = [w for w in stemmed if w in vocab]

    return output

def VectorizeDoc(filename, model):
    '''
    Represent a document as the mean of its word vectors.

    INPUT:
    filename - for instance "384.txt"
    model - word2vec model

    OUTPUT:
    Vectorized document (mean of all word vectors in the document).
    If no token of the document survives preprocessing, a zero vector of
    length model.vector_size is returned.
    '''

    doc = gen_token(filename, model)
    if not doc:
        # np.mean([]) would emit a RuntimeWarning and yield a scalar NaN;
        # return an explicit "no information" vector instead.
        return np.zeros(model.vector_size)

    word_vecs = [model[word] for word in doc]
    return np.mean(word_vecs, axis = 0)

def cosine_sim(vecA, vecB):
    """Find the cosine similarity between two vectors.

    Returns 0 when the similarity is undefined: either input has zero norm,
    or the computation involves NaN.
    """
    denom = np.linalg.norm(vecA) * np.linalg.norm(vecB)
    # Guard explicitly rather than dividing 0/0 and relying on NaN propagation,
    # which warns (or raises under np.errstate(invalid='raise')).
    if denom == 0 or np.isnan(denom):
        return 0
    csim = np.dot(vecA, vecB) / denom
    # np.sum collapses an array-valued result so one NaN check covers it.
    if np.isnan(np.sum(csim)):
        return 0
    return csim

def doc_sim(doc1, doc2):
    """
    Compare two documents by file name.

    INPUT:
    doc1, doc2 - file names, passed to VectorizeDoc together with the
                 module-level `model`

    OUTPUT:
    cosine similarity between the two document vectors
    """
    vectors = [VectorizeDoc(name, model) for name in (doc1, doc2)]
    return cosine_sim(*vectors)
    

In [4]:
from gensim.models.keyedvectors import KeyedVectors

# Location of the binary word2vec file (GoogleNews vectors, per the file name).
googlenews_model_path = 'data/GoogleNews-vectors-negative300.bin'
# Load the vectors into the global `model`, which doc_sim reads.
model = KeyedVectors.load_word2vec_format(googlenews_model_path, binary=True)

from DocSim import DocSim
# Wrap the loaded vectors in a DocSim helper; `ds.calculate_similarity` is
# called by the pairwise loop further down.
ds = DocSim(model)

In [7]:
import pandas as pd
from tqdm import tqdm

# Load the publication metadata; later cells read its 'text_file_name' column.
pubs = pd.read_json('data/train_test/publications.json')

In [15]:
# Sanity check: print the frame's dimensions, then display its first row.
print(pubs.shape)
pubs.head(1)

(5000, 6)


Unnamed: 0,pdf_file_name,pub_date,publication_id,text_file_name,title,unique_identifier
0,105.pdf,1969-01-01,105,105.txt,Cheap Talk? Financial Sanctions and Non-Financ...,bbk-4


In [26]:
import logzero
from logzero import logger

# Also send log records to tmp/output.log; backupCount=3 is passed to logzero's
# log-file handler (TODO confirm rotation applies without a maxBytes setting).
logzero.logfile('tmp/output.log', backupCount=3)

In [31]:
# Smoke test of DocSim on the pairwise loop: both `break`s stop after the very
# first (i, j) pair, so only one similarity is computed and printed.
path = 'data/train_test/files/text/'
text_files = pubs['text_file_name']
# Derive the bound from the data instead of hard-coding the row count.
n_pubs = len(text_files)
for i in tqdm(range(n_pubs)):
    for j in range(i+1, n_pubs):
        tmp = ds.calculate_similarity(path+text_files[i],
                                      path+text_files[j])
        print([text_files[i], text_files[j], tmp])
        break
    break

  0%|          | 0/5000 [00:00<?, ?it/s]

['105.txt', '109.txt', []]





In [None]:
import os
from itertools import combinations

# Collect every file under file_path and prepare all unordered pairs.
file_list = []
file_path = 'data/train_test/files/text/'
for subdir, dirs, files in os.walk(file_path):
    for file in files:
        # Store the location relative to file_path so that file_path+name
        # (as built in gen_token) still resolves for files in nested folders.
        # For files directly in file_path this is just the file name.
        file_list.append(os.path.relpath(os.path.join(subdir, file), file_path))

# os.walk order depends on the filesystem; sort so the pair order is reproducible.
file_list.sort()

# NOTE: combinations() is a one-shot iterator - rerun this cell before
# iterating over file_comb a second time.
file_comb = combinations(file_list, 2)

In [205]:
import time
# Compute the cosine similarity for every document pair in file_comb.
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed time.
start_time = time.perf_counter()
doc_sims_output = []
processing = 0
# Each document appears in many pairs: vectorize it once and reuse the vector
# instead of re-reading and re-tokenizing the file for every pair. The result
# matches doc_sim(a, b), i.e. cosine_sim of the two VectorizeDoc outputs.
doc_vec_cache = {}
for first, second in file_comb:
    for name in (first, second):
        if name not in doc_vec_cache:
            doc_vec_cache[name] = VectorizeDoc(name, model)
    similarity = cosine_sim(doc_vec_cache[first], doc_vec_cache[second])
    doc_sims_output.append([first, second, similarity])
    processing += 1
    if processing%10000 == 0:
        # elapsed = now - start (the reverse order would print a negative runtime)
        print("Runtime: {}s\nProcessing: {}".format(time.perf_counter() - start_time, processing))




KeyboardInterrupt: 