In [None]:
import pandas as pd
import numpy as np
import pickle
import re
import math
import scipy
from nltk.corpus import stopwords, words
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from scipy import linalg
from textblob import TextBlob
from gensim import corpora, models, similarities, matutils
from nltk.tokenize import RegexpTokenizer

`total_actual_docs` consists of cleaned and tagged English and Spanish text. It is the input to the model.

e.g.: `but thou hast a few names in sardis that did not defile their garments and they shall walk with me in white for they are worthy __mas__ __tienes__ __unas__ __pocas__ __personas__ __en__ __sardis__ __que__ __no__ __han__ __ensuciado__ __sus__ __vestiduras__ __y__ __andarán__ __conmigo__ __en__ __vestiduras__ __blancas__ __porque__ __son__ __dignos__`

In [None]:
# Read the pickled documents from disk.
# NOTE(review): pickle.load can execute arbitrary code - only load a trusted file.
with open("total_actual_docs.pkl", 'rb') as pkl_file:
    total_actual_docs = pickle.load(pkl_file)

### Now that we have all the data, build an LSI model to generate the term matrix
We can use either sklearn's CountVectorizer or TfidfVectorizer to vectorize the text. Here we use CountVectorizer to get raw term counts (the TF-IDF weighting is applied later with gensim) and set min_df=10 because we want to ignore words that appear in fewer than 10 documents.

In [None]:
# Word-level count vectorizer; min_df=10 drops terms found in fewer than 10 documents.
count_vectorizer = CountVectorizer(
    analyzer='word',
    encoding='utf-8',
    decode_error='ignore',
    min_df=10,
)

In [None]:
# Fit the vectorizer on all documents and count term occurrences.
# The result is flipped so that terms are the rows and documents the columns.
doc_vecs = count_vectorizer.fit_transform(total_actual_docs).T
doc_vecs.shape

In [None]:
# Wrap the sparse count matrix as a gensim corpus; each column is one document
# (documents_columns=True is also gensim's default).
corpus = matutils.Sparse2Corpus(doc_vecs, documents_columns=True)

In [None]:
# Invert the vectorizer's vocabulary (term -> column index) into the
# index -> term mapping that is passed to the LSI model as id2word.
# Uses `count_vectorizer` (the name the vectorizer is actually bound to) and
# `.items()`, which works on both Python 2 and Python 3.
id2word = {index: term for term, index in count_vectorizer.vocabulary_.items()}

In [None]:
# Build a gensim TF-IDF model from the count corpus
# (the counterpart of calling "fit" in sklearn)
tfidf = models.TfidfModel(corpus)

In [None]:
# Apply the TF-IDF model to every document of the original count corpus
# (the counterpart of calling "transform" in sklearn)
tfidf_corpus = tfidf[corpus]

#### Now that we have the TF-IDF corpus and the id2word mapping, we can build the LSI model. Since my data is huge, I am going to use gensim in distributed mode on a 32-core machine on AWS, with a chunksize of 50000, to build the LSI vector space

In [None]:
# Train a 100-topic LSI model on the TF-IDF corpus, using gensim's
# distributed mode with a chunksize of 50000.
lsi = models.LsiModel(
    tfidf_corpus,
    id2word=id2word,
    num_topics=100,
    chunksize=50000,
    distributed=True,
)

In [None]:
# Project the original TF-IDF corpus into the LSI space
# (the counterpart of calling "transform" in sklearn)
lsi_corpus = lsi[tfidf_corpus]

#### Separate English and Spanish terms from the original terms to get the term vectors and to calculate a similarity matrix for each. id2word consists of all the terms in the original data with their corresponding indices.

In [None]:
# Collect every term of the vocabulary (the values of the index -> term map).
# `.values()` replaces the Python-2-only `.iteritems()` loop and yields the
# terms in the same dictionary iteration order.
text_blobs = list(id2word.values())

len(text_blobs)

In [None]:
# Split the vocabulary by language: a term that starts or ends with a
# double underscore goes to the Spanish list, everything else to the English list.
sp_list, en_list = [], []
for token in text_blobs:
    is_tagged = token.startswith('__') or token.endswith('__')
    target = sp_list if is_tagged else en_list
    target.append(token)

In [None]:
# Build similarity indexes for the Spanish and English terms.
# Each term is treated as a one-word document and pushed through the same
# counts -> TF-IDF -> LSI chain that was fitted on the full corpus.
# The vectorizer is referenced as `count_vectorizer`, the name it is bound to
# when it is created (`count_vec` is never defined in this notebook).

# --- Spanish terms ---
# Matrix of counts, transposed so that every column is one term "document"
test_vecs_sp = count_vectorizer.transform(sp_list).transpose()
# Convert to gensim corpus
test_corpus_sp = matutils.Sparse2Corpus(test_vecs_sp)
# TFIDF transformation
test_tfidf_sp = tfidf[test_corpus_sp]
# LSI transformation
test_lsi_sp = lsi[test_tfidf_sp]
# Create an index transformer that calculates similarity based on our space
test_index_sp = similarities.MatrixSimilarity(test_lsi_sp)
# Only report the 5 most similar entries per query
test_index_sp.num_best = 5

# --- English terms (same steps) ---
test_vecs_en = count_vectorizer.transform(en_list).transpose()
test_corpus_en = matutils.Sparse2Corpus(test_vecs_en)
test_tfidf_en = tfidf[test_corpus_en]
test_lsi_en = lsi[test_tfidf_en]
test_index_en = similarities.MatrixSimilarity(test_lsi_en)
test_index_en.num_best = 5

#### The `term_folding_pipeline` function is used to fold new terms into the existing LSI space to get the corresponding vectors

In [None]:
def term_folding_pipeline(term):
    """Fold term(s) into the existing LSI space.

    The input goes through the already-fitted chain: count vectorizer ->
    gensim corpus -> TF-IDF model -> LSI model.

    Parameters
    ----------
    term : str or iterable of str
        Term(s) to project. A bare string is treated as one single term;
        any other iterable is passed to the vectorizer unchanged.

    Returns
    -------
    The result of applying ``lsi`` to the TF-IDF-weighted counts of ``term``.
    """
    # CountVectorizer.transform expects an iterable of documents and rejects a
    # bare string, so wrap a single term into a one-element list.
    if isinstance(term, str):
        term = [term]
    # Get matrix of counts; `count_vectorizer` is the fitted vectorizer
    # (`count_vec` is never defined in this notebook).
    test_vecs1 = count_vectorizer.transform(term).transpose()
    # Convert to gensim corpus
    test_corpus1 = matutils.Sparse2Corpus(test_vecs1)
    # TFIDF transformation
    test_tfidf1 = tfidf[test_corpus1]
    # LSI transformation
    test_lsi1 = lsi[test_tfidf1]
    return test_lsi1

##### Create an English-to-Spanish dictionary by taking, for each English term, the Spanish terms with the highest similarity scores. A Spanish-to-English dictionary can be computed in the same way.

In [None]:
# English -> Spanish lookup table: fold each English term into the LSI space,
# query the Spanish similarity index and keep the returned Spanish terms
# with their underscores stripped.
translation_new_en = defaultdict(list)
for word in en_list:
    if not word:
        continue
    # The query holds a single term, so the first result row is the one we want;
    # every entry in it starts with the position of the match in sp_list.
    matches = test_index_sp[term_folding_pipeline([word])][0]
    translation_new_en[word] = [sp_list[match[0]].strip('__') for match in matches]