In [1]:
%%writefile models_summarizer.py


import nltk
import json
import string 
import re
import en_core_web_sm
import spacy
import emoji
import torch 

from transformers import AutoModelWithLMHead, AutoTokenizer

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS


# MAX_LEN is the max_length handed to the tokenizer when encoding an article.
# SUMMARY_LEN is presumably the intended length cap for generated summaries
# (TODO confirm: it matches the 150 used at generation time).
MAX_LEN = 512
SUMMARY_LEN = 150

# spaCy English model and its stop-word list (the list is used by clean_text).
nlp = en_core_web_sm.load()
stopwords = list(STOP_WORDS)

# Punctuation characters and lemmatizer used by clean_text.
punctuations = string.punctuation
wordnet_lemmatizer = WordNetLemmatizer()

# Load the public DistilBART checkpoint and its tokenizer, then replace the
# model weights with the state dict stored under ./Trained_Models.
model1 = AutoModelWithLMHead.from_pretrained("sshleifer/distilbart-cnn-12-6", )
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
path = "./Trained_Models/summarization_bart_model.pt"
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA; model1 itself is never moved off the CPU here.
model1.load_state_dict(torch.load(path, map_location="cpu"))
    

def clean_text(sent):
    """Tokenize a sentence, drop punctuation/stop words and lemmatize the rest.

    The input is stripped of surrounding whitespace (it is not lower-cased),
    split with ``nltk.word_tokenize``, and every token found in
    ``punctuations`` or ``stopwords`` is discarded. The surviving tokens are
    lemmatized with ``wordnet_lemmatizer`` and joined with single spaces.

    Args:
        sent: The raw sentence string.

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    kept_lemmas = []
    for word in nltk.word_tokenize(sent.strip()):
        # `punctuations` is a plain string, so this is a substring test.
        if word in punctuations or word in stopwords:
            continue
        kept_lemmas.append(wordnet_lemmatizer.lemmatize(word))
    return " ".join(kept_lemmas)

### Web scraping

def scrape_web_data(link, timeout=30):
    """Download a web page and return the text of selected tags as one string.

    The page is fetched with a browser-like User-Agent header. The text of
    every ``span``, ``title``, ``time``, ``p`` and ``h4`` element is collected
    in document order and joined with spaces; newlines are then replaced by
    spaces, anything that still looks like an HTML tag is removed, and emoji
    are stripped.

    Args:
        link: URL of the page to scrape.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot block the caller indefinitely.

    Returns:
        The cleaned page text.

    Raises:
        urllib.error.URLError: If the page cannot be fetched (this includes
            HTTP error statuses and timeouts surfaced by urllib).
        ValueError: If ``link`` is not a usable URL.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
    tags = ["span","title","time","p", "h4"]

    source = Request(url = link, headers = headers)
    # The context manager closes the connection even if read() fails.
    with urlopen(source, timeout=timeout) as response:
        html = response.read()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed on the machine.
    soup = BeautifulSoup(html, "html.parser")

    # Nested matches (e.g. a <span> inside a <p>) contribute their text once
    # per matching element, exactly as a walk over every tag would.
    para = " ".join(elem.text for elem in soup.find_all(tags))

    para = para.strip()
    para = para.replace("\n", " ")
    para = re.sub(r'<.*?>', '', para)

    # emoji.get_emoji_regexp() was removed in emoji 2.x; prefer replace_emoji
    # when the installed version has it and fall back for older releases.
    if hasattr(emoji, "replace_emoji"):
        para = emoji.replace_emoji(para, replace="")
    else:
        para = emoji.get_emoji_regexp().sub(r"", para)

    return para

### function definition to predict the summaries
def predict_summaries(text):
    """Generate a summary for one article with the module-level BART model.

    The text is encoded as a batch of one, padded to ``MAX_LEN`` tokens, and
    passed to ``model1.generate`` using two-beam search. The generated
    sequences are decoded without special tokens and joined with spaces.

    Args:
        text: The article text to summarize.

    Returns:
        The decoded summary string.
    """
    encoded = tokenizer.batch_encode_plus([text], max_length= MAX_LEN, pad_to_max_length=True,return_tensors='pt')
    summary_ids = model1.generate(
        input_ids=encoded['input_ids'],
        # The input is padded up to MAX_LEN, so tell the model explicitly
        # which positions are padding instead of relying on it being inferred.
        attention_mask=encoded['attention_mask'],
        # Use the shared constant rather than a second hard-coded 150.
        max_length=SUMMARY_LEN,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    return " ".join(
        tokenizer.decode(ids, skip_special_tokens=True) for ids in summary_ids
    )




Overwriting models_summarizer.py


In [2]:
%%writefile models_QnA.py

from transformers import BertTokenizerFast
from transformers import BertForQuestionAnswering
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers.pipelines import pipeline

# Tokenizer and question-answering model loaded from the local model folder.
tokenizer_QnA = AutoTokenizer.from_pretrained("./Trained_Models/my_model3")
model_QnA = AutoModelForQuestionAnswering.from_pretrained("./Trained_Models/my_model3")

# Built once at import time: the pipeline only wraps the objects above, so
# there is no reason to reconstruct it for every question that is asked.
_qna_pipeline = pipeline('question-answering', model=model_QnA, tokenizer=tokenizer_QnA)


def answer(text, question):
    """Answer a question about a passage with the question-answering pipeline.

    Args:
        text: The context passage to search for the answer.
        question: The question to ask about ``text``.

    Returns:
        The value stored under ``'answer'`` in the pipeline result.
    """
    result = _qna_pipeline({'question': question, 'context': text})
    return result['answer']


Overwriting models_QnA.py


In [2]:
%%writefile app.py

from flask import Flask, render_template, url_for, request, redirect, jsonify, Response
# from flask_sqlalchemy import SQLAlchemy
import flask_excel as excel
from datetime import datetime
import pandas as pd
from werkzeug.utils import secure_filename
from flask_uploads import UploadSet, configure_uploads, DOCUMENTS, IMAGES

import pandas as pd
import torch
import time

from models_summarizer import predict_summaries, scrape_web_data
from models_QnA import answer



# Flask application object shared by every route in this module.
app = Flask(__name__)

# Register the 'datafiles' upload set (document file types) with the app and
# point its destination directory at static/uploads.
app.config['UPLOADED_DATAFILES_DEST'] = 'static/uploads'
docs = UploadSet('datafiles', DOCUMENTS)
configure_uploads(app, docs)

# Questions put to the QnA model for every predicted summary.
_SUBJECT_QUESTION = "Who is the subject?"
_OBJECT_QUESTION = "Who is the object?"

# File that the /result route renders.
_RESULTS_TEMPLATE_PATH = "./templates/results.html"


def _build_results_table(input_texts, log_progress=False):
    """Summarize each text and extract a subject/object from every summary."""
    summaries = []
    for i, text in enumerate(input_texts):
        summaries.append(predict_summaries(text))
        if log_progress:
            print("Done: {}".format(i))

    results = pd.DataFrame({"Predicted_Summaries": summaries})
    results['Subject_Predicted'] = results['Predicted_Summaries'].apply(
        lambda summary: answer(summary, _SUBJECT_QUESTION))
    results['Object_Predicted'] = results['Predicted_Summaries'].apply(
        lambda summary: answer(summary, _OBJECT_QUESTION))
    return results


def _write_results_page(results):
    """Write the results table as HTML to the template served by /result."""
    # NOTE(review): the table is stored as a Jinja template, so template
    # syntax occurring in scraped text would be interpreted when rendered.
    with open(_RESULTS_TEMPLATE_PATH, "w", encoding = "utf8") as results_file:
        results_file.write(results.to_html())


@app.route("/", methods = ['GET', "POST"])
def index():
    """Show the input form (GET) or run the summarization pipeline (POST).

    On POST the ``links`` form field selects the mode:

    * ``"single_link"``: the URL in the ``Enter_Link`` field is scraped.
    * anything else: the uploaded ``file`` is read as an Excel sheet and each
      distinct URL in its ``Link`` column is scraped once.

    Every scraped text is summarized, a subject and an object are extracted
    from each summary, and the resulting table is written to the results
    template. The client is then redirected to the ``/result`` page.

    Returns:
        The rendered form for GET, a redirect to ``/result`` after a
        successful POST, or a plain-text error message if processing failed.
    """
    if request.method != "POST":
        return render_template("index.html")

    option = request.form["links"]

    if option == "single_link":
        try:
            link = request.form["Enter_Link"]
            results = _build_results_table([scrape_web_data(link)])
            _write_results_page(results)
        except Exception:
            # Keep the user-facing message, but record the real cause.
            app.logger.exception("Processing a single link failed")
            return "Please enter the correct link!"
        return redirect(url_for("result"))

    try:
        uploaded_file = request.files['file']
        data = pd.read_excel(uploaded_file)

        # Scrape each distinct link once. A repeated link is skipped rather
        # than replacing the text that was already scraped for it.
        scraped_by_link = {}
        for link in data["Link"]:
            if link not in scraped_by_link:
                scraped_by_link[link] = scrape_web_data(link)

        print("Web Scraping Done. Prediction Start!")
        results = _build_results_table(scraped_by_link.values(), log_progress=True)

        ## Everything will be written to a html file
        _write_results_page(results)
    except Exception:
        app.logger.exception("Processing the uploaded link file failed")
        return "Either the input link is incorrect or the column name is incorrect!"
    return redirect(url_for("result"))


@app.route("/result", methods = ['GET', "POST"])
def result():
    """Render and return the ``results.html`` template."""
    results_page = render_template("results.html")
    return results_page

# Start Flask's built-in server only when this file is executed as a script.
# NOTE(review): debug mode is kept on; results.html is rewritten at runtime and
# presumably relies on debug-mode template reloading - confirm before disabling.
if __name__ == "__main__":
    app.run(debug=True)

Overwriting app.py
