In [1]:
import docx2txt
import re
import numpy as np
import os

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import h5py
import multiprocessing

import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
import os

from flask import Flask, render_template, request, redirect, url_for, jsonify, make_response
from werkzeug.utils import secure_filename



In [2]:
def txt_process(word_document):
    """Extract the text of a Word document and reduce it to cleaned tokens.

    The text returned by ``docx2txt.process`` is whitespace-normalised,
    stripped of punctuation and digit characters, tokenised with
    ``word_tokenize``, lower-cased, and finally filtered to drop entries of
    the module-level ``stop_words`` list and tokens shorter than three
    characters.

    Parameters
    ----------
    word_document
        Passed straight to ``docx2txt.process`` (the upload route hands in
        the uploaded file object).

    Returns
    -------
    list of list of str
        A one-element list whose single item is the token list of the
        document. The outer list is kept so callers that flatten the result
        continue to work.
    """
    raw_text = docx2txt.process(word_document)

    # Collapse every run of whitespace (newlines, tabs, ...) to one space.
    text = ' '.join(raw_text.split())
    # Punctuation becomes a space so adjacent words are not glued together.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Digits are removed outright (no space inserted), e.g. "fox42" -> "fox".
    text = ''.join(ch for ch in text if not ch.isdigit())
    # The two steps above can leave doubled spaces; normalise again.
    text = ' '.join(text.split())

    # A set gives constant-time membership tests; the original scanned the
    # whole stop-word list once per token.
    stop_set = set(stop_words)

    tokens = [
        word
        for word in (tok.lower() for tok in word_tokenize(text))
        if word not in stop_set and len(word) >= 3
    ]

    return [tokens]

In [3]:
#inference hyper-parameters
# Passed as the `alpha` argument of every model.infer_vector call in this notebook.
start_alpha=0.01
# Passed as the `steps` argument of every model.infer_vector call in this notebook.
infer_epoch=1000

In [4]:
#loading pre-trained model
# The path is assembled with os.path.join rather than a backslash literal:
# "\D", "\m" and "\d" are invalid escape sequences in a normal string, and a
# hard-coded "\" separator only resolves on Windows.
gen_model = os.path.join(os.getcwd(), "Doc2Vec pretrained", "model", "doc2vec.bin")
model = gensim.models.Doc2Vec.load(gen_model)

In [5]:
#Creating a H5 file for file names or labels

# A single placeholder entry seeds the dataset.
dummyLabel = ["Dummy"]
labelVec = np.array(dummyLabel, dtype=h5py.special_dtype(vlen=str))

# Mode 'w' creates the file, truncating any existing one. The context manager
# guarantees the handle is closed even if create_dataset raises.
# maxshape=(None,) leaves the first axis unbounded so the dataset can be
# resized when labels are appended later.
with h5py.File('label_h5f.h5', 'w') as label_h5f:
    label_h5f.create_dataset('labelDataset', data=labelVec, maxshape=(None,), chunks=True)

In [6]:
# Read every stored label back into memory; the context manager closes the
# file even if the dataset lookup fails.
with h5py.File('label_h5f.h5','r') as label_h5f:
    existLabel = label_h5f['labelDataset'][:]

In [7]:
#Creating a H5 file for document vectors

# A placeholder vector seeds the dataset; reshape(1,-1) turns the inferred
# vector into a single row so later vectors can be appended as extra rows.
dummyTxt = "This is dummy text to initiate H5 File"
dummyVec = model.infer_vector(dummyTxt.split(), alpha=start_alpha, steps=infer_epoch).reshape(1,-1)

# Mode 'w' creates the file, truncating any existing one. The context manager
# guarantees the handle is closed even if create_dataset raises.
# maxshape=(None, None) leaves both axes unbounded so the dataset can be
# resized when vectors are appended later.
with h5py.File('vec_h5f.h5', 'w') as vec_h5f:
    vec_h5f.create_dataset('vecDataset', data=dummyVec, maxshape=(None, None), chunks=True)

In [8]:
# Read every stored vector back into memory; the context manager closes the
# file even if the dataset lookup fails.
with h5py.File('vec_h5f.h5','r') as vec_h5f:
    existVec = vec_h5f['vecDataset'][:]

In [9]:
# The template directory used to be fixed to an absolute path on one machine.
# It can now be overridden with the AUTOCODEFS_TEMPLATE_DIR environment
# variable; when the variable is unset the original path is used unchanged.
app = Flask(
    __name__,
    template_folder=os.environ.get(
        'AUTOCODEFS_TEMPLATE_DIR',
        'C:/Users/gkottur/Documents/FI/AutoCodeFS/Scripts/',
    ),
)

@app.route('/')
def index():
    """Serve the root URL by rendering the ``index.html`` template."""
    page = render_template("index.html")
    return page

@app.route('/upload', methods=['POST'])
def upload():
    """Vectorise an uploaded Word document and append it to the H5 stores.

    Reads the ``file`` part of the POSTed form, turns it into tokens with
    ``txt_process``, infers a document vector with the loaded model, and then
    appends the file name to ``label_h5f.h5`` and the vector to
    ``vec_h5f.h5``.

    Returns
    -------
    str or (str, int)
        ``"H5 successfully appended"`` on success, or a message with HTTP
        status 400 when the form was submitted without selecting a file.
    """
    file = request.files['file']
    if not file.filename:
        # A form posted with no file chosen arrives with an empty filename;
        # reject it before anything is written.
        return "No file selected", 400

    # Parse and vectorise BEFORE touching either H5 file. Previously the label
    # was appended first, so a document that failed to parse left a label with
    # no matching vector and shifted every later row out of alignment.
    docPreproc = txt_process(file)
    docTokens = [item for sublist in docPreproc for item in sublist]
    newVec = model.infer_vector(docTokens, alpha=start_alpha, steps=infer_epoch).reshape(1,-1)
    newLabel = np.array([file.filename], dtype=h5py.special_dtype(vlen=str))

    #Appends the file name of the file uploaded to label_h5f.h5
    with h5py.File('label_h5f.h5', 'r+') as label:
        labelDataset = label["labelDataset"]
        labelDataset.resize(labelDataset.shape[0] + newLabel.shape[0], axis=0)
        labelDataset[-newLabel.shape[0]:] = newLabel

    #Appends the vector of the uploaded document to vec_h5f.h5
    with h5py.File('vec_h5f.h5', 'r+') as vec:
        vecDataset = vec["vecDataset"]
        vecDataset.resize(vecDataset.shape[0] + newVec.shape[0], axis=0)
        vecDataset[-newVec.shape[0]:] = newVec

    # NOTE(review): the two appends are still not atomic, and the server is
    # started with threaded=True - concurrent uploads may need a lock; confirm.
    return "H5 successfully appended"


# Start Flask's built-in server on localhost:5000 when this code is executed
# as the main module. threaded=True is passed through to app.run.
if __name__ == '__main__':
    app.run(host='localhost', port=5000, threaded=True)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://localhost:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [26/Jun/2018 11:36:06] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Jun/2018 11:36:13] "POST /upload HTTP/1.1" 200 -
