In [32]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import pickle
import re
import string
from numpy import argmax

path_to_vectorizer = 'vectorizer'
path_data_frame = 'parallelCorpus'
path_to_document_tfidf = 'source-tfidf'
threshold = .7

def removeSingleDoubleCharacterWordStopWordsAndStemming(tokens):
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens = [ps.stem(token) for token in tokens if token not in stop_words] 
#     print(tokens) 
    return ' '.join([token for token in tokens if len(token)>2 and token not in stop_words])

def removePunctuationAndGetTokens(lines):
    pattern = re.compile('[0-9].*')
    lines = re.sub(pattern,' ',lines)
    translator = str.maketrans(string.punctuation,' '*len(string.punctuation))
    lines = lines.translate(translator)
    tokens = lines.split()
    return tokens

def calculate_tfidf_of_documents(dataframe_file):
    df = load_dataFrame(dataframe_file)
    tfidf_vectorizer=TfidfVectorizer(use_idf=True)
    tfidf_documents=tfidf_vectorizer.fit_transform(df['English'].apply(removeSingleDoubleCharacterWordStopWordsAndStemming).values.astype('U'))
    return tfidf_vectorizer,tfidf_documents

def load_dataFrame(filePath):
    return pd.read_json(filePath+'.json')

def calculate_tfidf_of_query(query,tfidf_vectorizer):
    processed_query = removeSingleDoubleCharacterWordStopWordsAndStemming(removePunctuationAndGetTokens(query))
    return tfidf_vectorizer.transform([query])

def main():
    vectorizer, doc_tf_idf = calculate_tfidf_of_documents(path_data_frame)
    save_vectors(vectorizer,path_to_vectorizer)
    save_vectors(doc_tf_idf,path_to_document_tfidf)
    
def save_vectors(vector,filename):
    pickle.dump(vector, open(filename+'.pk', 'wb'),protocol=pickle.HIGHEST_PROTOCOL)
    
def get_relevant_document_list(query):
    docu_df = load_dataFrame(path_data_frame)
    vectorizer = load_vectors(path_to_vectorizer)
    query_tfidf = calculate_tfidf_of_query(query,vectorizer)
    doc_tfidf = load_vectors(path_to_document_tfidf)
    similarity = cosine_similarity(doc_tfidf,query_tfidf)
    flat = similarity.flatten()
    index = argmax(flat)
    probability = flat[index])
    return ' '.join(docu_df.iloc[index,1]),probability

def load_vectors(filename):
    with open(filename+'.pk', 'rb') as fp:
        return pickle.load(fp)

def talk_like_shakespeare():
    input_sentence = input('English: ')
    output_sentence, probability = get_relevant_document_list(input_sentence)
    if(probability > threshold):
        print('Shakespeare: {}'.format(output_sentence))
    else:
        # call your model here and display output
    
    
# query = input()
get_relevant_document_list(query)

In [28]:
def load_dataFrame(filePath):
    return pd.read_json(filePath+'.json')

df = load_dataFrame('parallelCorpus')

df.iloc[999,:]
vectorizer = load_vectors(path_to_vectorizer)
calculate_tfidf_of_query('a jumbled confession can only receive a jumbled absolution',vectorizer)


<1x1096 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>