In [None]:
from flask import Flask, render_template, request
from wtforms import (Form, FileField, StringField, TextAreaField, validators, SubmitField, IntegerField, MultipleFileField)
from keras.models import load_model
import tensorflow as tf
import html
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pdfplumber
from docx2pdf import convert
import os
import pandas as pd
import pythoncom

# Create the Flask application object; the @app.route decorators further
# down register their view functions on this instance.
app = Flask(__name__)

class ReusableForm1(Form):
    """Form for classifying one document whose text is typed or pasted in.

    Fields:
        docName: required single-line text input.
        docContent: required multi-line text input.
        submit1: submit button.
    """

    # Required single-line input for the document's name.
    docName = StringField(
        label='Enter document name:',
        validators=[validators.InputRequired()],
    )

    # Required multi-line input holding the document's text.
    docContent = TextAreaField(
        label='Enter document content:',
        validators=[validators.InputRequired()],
    )

    # Button that submits the form.
    submit1 = SubmitField(label="Submit")

class ReusableForm2(Form):
    """Form for classifying several documents selected in one go.

    Fields:
        docFiles: required multiple-file input, rendered with an empty label.
        submit2: submit button.
    """

    # Required multi-file selector (its label is intentionally empty).
    docFiles = MultipleFileField(
        label='',
        validators=[validators.InputRequired()],
    )

    # Button that submits the form.
    submit2 = SubmitField(label="Submit")
    
def load_dcnn_model():
    """Load the pre-trained model and store it in the global ``cnnmodel``.

    The model is read from the relative path ``DCNN_Model/dcnnmodel``; the
    result replaces whatever ``cnnmodel`` previously referred to.
    """
    global cnnmodel
    model_location = 'DCNN_Model/dcnnmodel'
    cnnmodel = load_model(model_location)
    

def cleanTextAndEmbedings(docContent, tokenizer=None, maxlen=10):
    """Clean raw document text and encode it as one padded integer sequence.

    Cleaning steps, in order: strip newline characters, replace a fixed set
    of punctuation characters with spaces, lowercase, and drop English stop
    words (NLTK list). The cleaned text is then turned into word indices and
    post-padded / truncated to ``maxlen``.

    Args:
        docContent: Raw document text. ``None`` is treated as empty text.
        tokenizer: Optional, already fitted Keras ``Tokenizer``. When omitted
            a new ``Tokenizer(num_words=5000, lower=False)`` is fitted on
            this document alone, which is the original behaviour.
        maxlen: Length of the returned sequence (default 10, the value that
            was previously hard-coded).

    Returns:
        The array produced by ``pad_sequences`` for the single cleaned
        document (one row of ``maxlen`` integers).
    """
    text = '' if docContent is None else str(docContent)

    # 1. Remove newline characters (they are dropped, not replaced).
    text = text.replace('\n', '')

    # 2. Replace special characters with a space. A translation table does
    #    this literally and in one pass, so no character is ever interpreted
    #    as a regular expression.
    spec_chars = '!"#%&\'()*+,-./:;<=>?@[\\]^_`~{|}'
    text = text.translate(str.maketrans(dict.fromkeys(spec_chars, ' ')))

    # HTML escaping is not applied: every character it would escape
    # (& < > " ') has already been replaced by a space in step 2.

    # 3. Convert all letters to lowercase.
    text = text.lower()

    # 4. Remove stop words and re-join the remaining tokens.
    stop_words = set(stopwords.words('english'))
    kept_words = [w for w in word_tokenize(text) if w not in stop_words]
    cleaned_texts = [" ".join(kept_words)]

    if tokenizer is None:
        # NOTE(review): fitting on the input itself means the word indices
        # depend only on this document's own word frequencies. Presumably
        # the model expects the vocabulary built at training time -- confirm
        # and pass that tokenizer in via the ``tokenizer`` argument.
        tokenizer = Tokenizer(num_words=5000, lower=False)
        tokenizer.fit_on_texts(cleaned_texts)

    sequences = tokenizer.texts_to_sequences(cleaned_texts)
    return pad_sequences(sequences, padding='post', maxlen=maxlen)

def extractTextFromPDF(fileName):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        fileName: Path of the PDF to open with pdfplumber.

    Returns:
        A single string with the pages' text joined without a separator;
        the empty string when no page yields any text.
    """
    with pdfplumber.open(fileName) as pdf:
        # extract_text() may yield None for a page without a text layer
        # (e.g. a scanned image); count such a page as empty instead of
        # failing on str + None.
        return ''.join(page.extract_text() or '' for page in pdf.pages)

def covertDOCXToPDF(filename):
    """Convert a DOCX file to PDF with docx2pdf and return the PDF's path.

    ``convert`` is called with the input path only, so the returned path
    assumes the PDF is written next to the source file under the same base
    name with a ``.pdf`` extension.

    NOTE(review): a PDF already present at that path would presumably be
    overwritten by the conversion -- confirm before pointing this at a
    directory that holds both ``name.docx`` and ``name.pdf``.

    Args:
        filename: Path of the DOCX file to convert.

    Returns:
        ``filename`` with its extension replaced by ``.pdf``.
    """
    pdf_path = os.path.splitext(filename)[0] + ".pdf"
    # COM has to be initialised on the calling thread before the conversion;
    # every successful CoInitialize must be paired with CoUninitialize, so
    # release it even when the conversion raises.
    pythoncom.CoInitialize()
    try:
        convert(filename)
    finally:
        pythoncom.CoUninitialize()
    return pdf_path

def extractTextFromExcel(filename):
    """Read an Excel workbook and return its table rendered as plain text.

    The file is loaded with ``pandas.read_excel`` using the first column as
    the index, and the resulting DataFrame is rendered with ``to_string``.
    """
    return pd.read_excel(filename, index_col=0).to_string()

# Home page
@app.route("/index", methods=['GET', 'POST'])
def index():
    """Serve the single-document form and classify a valid submission.

    Any request that is not a valid POST renders ``index.html`` with the
    form. A valid POST runs the submitted text through the model and renders
    ``classification_result.html`` with the predicted category.
    """
    form1 = ReusableForm1(request.form)

    # Guard clause: anything but a valid POST just shows the form.
    if not (request.method == 'POST' and form1.validate()):
        return render_template('index.html', form=form1)

    docName = request.form['docName']
    docContent = request.form['docContent']
    if docName is None or docContent is None:
        return render_template('index.html', form=form1)

    X_test = cleanTextAndEmbedings(docContent)
    y_pred = cnnmodel.predict(X_test)
    print("y_pred: ", y_pred)

    # Translate the index of the highest score into its category name; an
    # index outside the table leaves the category empty.
    category_by_index = {0: 'Public', 1: 'Private', 2: 'Confidential'}
    docCategory = category_by_index.get(int(y_pred.argmax()), '')
    print("docCategory: "+docCategory)
    return render_template('classification_result.html', input=docCategory)

# Category name for each arg-max index of the model output.
_DOC_CATEGORIES = {0: 'Public', 1: 'Private', 2: 'Confidential'}


def _read_document_text(path):
    """Return the text of the document at *path*, or None if its type is unsupported."""
    import shutil
    import tempfile

    # Compare extensions case-insensitively so e.g. ".PDF" is recognised.
    extension = os.path.splitext(path)[1].lower()
    if extension == '.pdf':
        return extractTextFromPDF(path)
    if extension == '.xlsx':
        return extractTextFromExcel(path)
    if extension == '.docx':
        # Convert a private copy inside a throw-away directory: the
        # intermediate PDF can then never overwrite (or cause the deletion
        # of) a same-named PDF next to the original, and it is cleaned up
        # even when conversion or extraction raises.
        with tempfile.TemporaryDirectory() as work_dir:
            docx_copy = shutil.copy(path, work_dir)
            return extractTextFromPDF(covertDOCXToPDF(docx_copy))
    return None


@app.route('/selectfiles', methods=['GET', 'POST'])
def selectfiles():
    """Serve the multi-file form and classify every submitted file name.

    Each submitted name is looked up inside ``./Docs/``; ``.pdf``, ``.docx``
    and ``.xlsx`` files are read, cleaned and passed to the model. A valid
    POST renders ``multiclassification_result.html`` with a table of file
    name and category; any other request renders ``selectfiles.html``.

    Files of another type, and files from which no text could be read, are
    reported as such instead of being sent to the model, because an empty
    input would otherwise still receive a category.
    """
    form2 = ReusableForm2(request.form)
    repName = './Docs/'

    if request.method == 'POST' and form2.validate():
        files = form2.docFiles.data
        if files:
            data = []
            for file in files:
                # The name comes from the client: keep only its final
                # component so it cannot point outside the documents folder.
                safe_name = os.path.basename(file.replace('\\', '/'))
                path = os.path.join(repName, safe_name)
                print(os.path.splitext(safe_name)[1])

                docContent = _read_document_text(path)
                if docContent is None:
                    docCategory = 'Unsupported file type'
                elif not docContent.strip():
                    docCategory = 'No text extracted'
                else:
                    X_test = cleanTextAndEmbedings(docContent)
                    y_pred = cnnmodel.predict(X_test)
                    print("y_pred: ", y_pred)
                    # Resolve the category afresh for every file so a value
                    # from the previous iteration can never leak through.
                    docCategory = _DOC_CATEGORIES.get(int(y_pred.argmax()), '')
                print("docCategory: "+docCategory)
                data.append([file, docCategory])

            df = pd.DataFrame(data, columns=['File name', 'Category'])
            print(df)
            return render_template(
                'multiclassification_result.html',
                tables=[df.to_html(classes='data')],
                titles=df.columns.values,
            )

    return render_template('selectfiles.html', form=form2)

if __name__ == "__main__":
    # Load the model first: the route handlers read the global `cnnmodel`
    # that load_dcnn_model() sets, and nothing else in this file sets it.
    load_dcnn_model()
    # NOTE(review): debug=True turns on Flask's debug mode, which includes an
    # interactive debugger -- confirm this entry point is only ever used on a
    # development machine.
    app.run(debug=True)

Loading DCNN and Flask starting server...please wait until server has fully started
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on all addresses.
 * Running on http://192.168.0.197:9090/ (Press CTRL+C to quit)
192.168.0.197 - - [13/Apr/2022 07:52:03] "GET / HTTP/1.1" 404 -
192.168.0.197 - - [13/Apr/2022 07:52:04] "GET /favicon.ico HTTP/1.1" 404 -
192.168.0.197 - - [13/Apr/2022 07:52:10] "GET /selectfiles HTTP/1.1" 200 -
192.168.0.197 - - [13/Apr/2022 07:52:10] "GET /static/css/main.css HTTP/1.1" 200 -
192.168.0.197 - - [13/Apr/2022 07:52:10] "GET /static/images/script_background.png HTTP/1.1" 404 -


.docx


  0%|          | 0/1 [00:00<?, ?it/s]

  df['content'] = df['content'].str.replace(char, ' ')


y_pred:  [[0.5726998  0.45695817 0.4841933 ]]
docCategory: Public
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.28622955 0.26376683 0.5990296 ]]
docCategory: Confidential
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.5076379  0.38886797 0.50690025]]
docCategory: Public
.pdf
y_pred:  [[0.45981643 0.4621031  0.55954236]]
docCategory: Confidential
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.51635855 0.4093312  0.4989858 ]]
docCategory: Public
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.52156675 0.37169415 0.47824478]]
docCategory: Public
.xlsx


  df['content'] = df['content'].str.replace(char, ' ')


y_pred:  [[0.41628128 0.540142   0.59522957]]
docCategory: Confidential
.pdf
y_pred:  [[0.39093205 0.3802577  0.55971223]]
docCategory: Confidential
.pdf
y_pred:  [[0.5395212  0.45389116 0.5037206 ]]
docCategory: Public
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.5892679 0.4407937 0.4716242]]
docCategory: Public
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.5323969  0.4280356  0.49558035]]
docCategory: Public
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.5021175  0.44226158 0.5244861 ]]
docCategory: Confidential
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.55520487 0.64327836 0.6016597 ]]
docCategory: Private
.pdf
y_pred:  [[0.50041246 0.4357291  0.52404505]]
docCategory: Confidential
.pdf
y_pred:  [[0.4503647  0.37389487 0.52429354]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.5447906  0.41477883 0.48454067]]
docCategory: Public
.pdf
y_pred:  [[0.40309203 0.40951282 0.574381  ]]
docCategory: Confidential
.doc
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.46792594 0.41831142 0.5285881 ]]
docCategory: Confidential
.pdf
y_pred:  [[0.52797765 0.4504402  0.5177697 ]]
docCategory: Public
.pdf
y_pred:  [[0.28741038 0.2439821  0.58455396]]
docCategory: Confidential
.pdf
y_pred:  [[0.7794057  0.37714878 0.3352645 ]]
docCategory: Public
.pdf
y_pred:  [[0.50595886 0.43505484 0.5189936 ]]
docCategory: Confidential
.xlsx


  df['content'] = df['content'].str.replace(char, ' ')


y_pred:  [[0.5354158  0.42827675 0.4936454 ]]
docCategory: Public
.xlsx


  df['content'] = df['content'].str.replace(char, ' ')


y_pred:  [[0.31576186 0.469736   0.6502137 ]]
docCategory: Confidential
.pdf
y_pred:  [[0.4145329 0.4114222 0.5632884]]
docCategory: Confidential
.pdf
y_pred:  [[0.46807706 0.42188954 0.542937  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.5284669  0.39672276 0.49738342]]
docCategory: Public
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.57740426 0.48186782 0.49724647]]
docCategory: Public
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.38169354 0.3439547  0.5606374 ]]
docCategory: Confidential
.pdf
y_pred:  [[0.33804193 0.4826333  0.63526845]]
docCategory: Confidential
.pdf
y_pred:  [[0.33804193 0.4826333  0.63526845]]
docCategory: Confidential
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.3736049  0.38528368 0.59563637]]
docCategory: Confidential
.pdf
y_pred:  [[0.56074995 0.5075308  0.51848274]]
docCategory: Public
.pdf
y_pred:  [[0.4251212  0.40071923 0.55702937]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.3537051  0.460615   0.64335054]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.5504359  0.56321067 0.55313474]]
docCategory: Private
.pdf
y_pred:  [[0.5504359  0.56321067 0.55313474]]
docCategory: Private
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.52646303 0.40881512 0.4862433 ]]
docCategory: Public
.pdf
y_pred:  [[0.46361524 0.40382078 0.52485436]]
docCategory: Confidential
.pdf
y_pred:  [[0.7121802  0.6860228  0.54205257]]
docCategory: Public
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.484

  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.2513851  0.26724997 0.623335  ]]
docCategory: Confidential
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.29828066 0.3555061  0.625556  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.55716777 0.44321048 0.48742115]]
docCategory: Public
.pdf
y_pred:  [[0.3651772  0.47655278 0.66026014]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.33804193 0.4826333  0.63526845]]
docCategory: Confidential
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.5331139  0.57268614 0.5834077 ]]
docCategory: Confidential
.pdf
y_pred:  [[0.53126556 0.50231165 0.5442172 ]]
docCategory: Confidential
.rtf
y_pred:  [[0.44763243 0.48496914 0.549181  ]]
docCategory: Confidential
.pdf
y_pred:  [[0.21373406 0.24511448 0.6410127 ]]
docCategory: Confidential
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

y_pred:  [[0.58706754 0.4300145  0.4672076 ]]
docCategory: Public
.pdf
y_pred:  [[0.3183018 0.4518428 0.6425329]]
docCategory: Confidential
                                            File name      Category
0        0. DL-2021-22-Odd-sem-Course File index.docx        Public
1                    1. Institute Vision Mission.docx  Confidential
2                                 2. IT MV Final.docx        Public
3                                  3. AT Syllabus.pdf  Confidential
4   5. Automata Theory University Question Papers....        Public
..                                                ...           ...
66              TEIT Roll List Batch wise 2021-22.pdf  Confidential
67                       The Nationalist Movement.rtf  Confidential
68                                     Time Table.pdf  Confidential
69              Updated Institute Mission Vision.docx        Public
70                              VIII NEW PDF 2021.pdf  Confidential

[71 rows x 2 columns]


192.168.0.197 - - [13/Apr/2022 08:06:21] "POST /selectfiles HTTP/1.1" 200 -
192.168.0.197 - - [13/Apr/2022 08:06:21] "GET /static/images/script_background.png HTTP/1.1" 404 -
192.168.0.197 - - [13/Apr/2022 08:10:55] "GET /selectfiles HTTP/1.1" 200 -


.pdf


  df['content'] = df['content'].str.replace(char, ' ')


y_pred:  [[0.7794057  0.37714878 0.3352645 ]]
docCategory: Public
.pdf
y_pred:  [[0.5504359  0.56321067 0.55313474]]
docCategory: Private
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

192.168.0.197 - - [13/Apr/2022 08:12:04] "POST /selectfiles HTTP/1.1" 200 -
192.168.0.197 - - [13/Apr/2022 08:12:04] "GET /static/images/script_background.png HTTP/1.1" 404 -


y_pred:  [[0.2513851  0.26724997 0.623335  ]]
docCategory: Confidential
                           File name      Category
0  Deep Learning Theory Syllabus.pdf        Public
1             Prerequisite Marks.pdf       Private
2                 Salary-Slip-2.docx  Confidential


192.168.0.197 - - [13/Apr/2022 08:12:18] "GET /selectfiles HTTP/1.1" 200 -


.pdf


  df['content'] = df['content'].str.replace(char, ' ')


y_pred:  [[0.7794057  0.37714878 0.3352645 ]]
docCategory: Public
.pdf
y_pred:  [[0.5504359  0.56321067 0.55313474]]
docCategory: Private
.docx


  0%|          | 0/1 [00:00<?, ?it/s]

192.168.0.197 - - [13/Apr/2022 08:15:23] "POST /selectfiles HTTP/1.1" 200 -
192.168.0.197 - - [13/Apr/2022 08:15:23] "GET /static/images/script_background.png HTTP/1.1" 404 -


y_pred:  [[0.2513851  0.26724997 0.623335  ]]
docCategory: Confidential
                           File name      Category
0  Deep Learning Theory Syllabus.pdf        Public
1             Prerequisite Marks.pdf       Private
2                 Salary-Slip-2.docx  Confidential


192.168.0.197 - - [13/Apr/2022 08:31:51] "GET /selectfiles HTTP/1.1" 200 -
192.168.0.197 - - [13/Apr/2022 08:31:51] "GET /static/images/script_background.png HTTP/1.1" 404 -
