# Problem Statement:
### Design and implement a prototype of an AI-powered summarization bot that can generate concise summaries of startup applications for investors' review.

In [4]:
import dash
from dash import dcc, html, Input, Output, State
from dash.dependencies import Input, Output
from transformers import pipeline
import pdfplumber
import base64
import io
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from docx import Document
nltk.download('stopwords')
nltk.download('punkt')

# Dash application object; the layout and the callback below attach to it.
app = dash.Dash(__name__)

# Build the summarization pipeline once, at module load, with an explicitly
# named model checkpoint (no revision is pinned here).
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Page layout: a title, an upload drop zone ('upload-data') and a read-only
# text area ('summary-output') that the summary callback writes into.
app.layout = html.Div([
    html.H1("AI-Powered Startup Application Summarizer"),
    dcc.Upload(
        id='upload-data',
        children=html.Div([
            'Drag and Drop or ',
            html.A('Select a Startup Application Document')
        ]),
        style={
            'width': '50%',
            'height': '60px',
            'lineHeight': '60px',
            'borderWidth': '1px',
            'borderStyle': 'dashed',
            'borderRadius': '5px',
            'textAlign': 'center',
            'margin': '10px'
        },
        # Accept a single file per upload (multiple uploads are disabled)
        multiple=False
    ),
    dcc.Textarea(id='summary-output', readOnly=True),
])

# Callback to generate and display the summary
@app.callback(
    Output('summary-output', 'value'),
    Input('upload-data', 'contents'),
    State('upload-data', 'filename')
)
def generate_summary(contents, filename):
    """Summarize the uploaded document and return the text to display.

    Args:
        contents: Value of the upload component's ``contents`` property, or
            ``None`` when nothing has been uploaded yet.
        filename: Name of the uploaded file, used to select the parser.

    Returns:
        The generated summary, an explanatory message when the document
        cannot be summarized, or an empty string when there is no upload.
    """
    if contents is None:
        return ''

    document_text = extract_text_from_document(contents, filename)

    # extract_text_from_document signals an unknown extension by returning
    # this literal message; show it instead of feeding it to the model.
    if document_text == "Unsupported document format":
        return document_text

    # Preprocess the text before summarization
    preprocessed_text = preprocess_text(document_text)

    # Nothing usable was extracted (e.g. a scanned PDF without a text layer).
    if not preprocessed_text.strip():
        return "No text could be extracted from the document"

    # truncation=True clips inputs longer than the model's maximum input
    # length instead of letting the pipeline fail on long documents.
    summary = summarizer(
        preprocessed_text,
        max_length=150,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

def extract_text_from_document(contents, filename):
    """Decode an uploaded document and return its plain text.

    Args:
        contents: Upload payload of the form ``"<content type>,<base64 data>"``;
            everything after the first comma is base64-decoded.
        filename: Original file name. Its extension, compared
            case-insensitively, selects the parser. ``None`` is treated as
            an unknown extension.

    Returns:
        The extracted text, with PDF pages or DOCX paragraphs separated by
        newlines, or the literal string ``"Unsupported document format"``
        when the extension is neither ``.pdf`` nor ``.docx``.
    """
    # Split on the first comma only; the remainder is the base64 body.
    content_string = contents.split(',', 1)[-1]
    decoded = base64.b64decode(content_string)

    lowered_name = (filename or '').lower()

    if lowered_name.endswith('.pdf'):
        # Handle PDF documents
        with pdfplumber.open(io.BytesIO(decoded)) as pdf:
            # Guard against pages for which extract_text() yields None
            # (no extractable text); concatenating None would raise.
            page_texts = [page.extract_text() or '' for page in pdf.pages]
        # Keep a separator so words at page boundaries do not fuse together.
        return '\n'.join(page_texts)

    if lowered_name.endswith('.docx'):
        # Handle Word documents
        doc = Document(io.BytesIO(decoded))
        # Keep a separator so adjacent paragraphs do not fuse together.
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)

    # You can add support for other document formats here
    return "Unsupported document format"

def preprocess_text(text):
    """Normalize extracted document text before summarization.

    Steps, in order: lowercase, remove URLs, replace punctuation with
    spaces, then collapse all whitespace (including newlines) to single
    spaces and trim the ends.

    Args:
        text: Raw text extracted from the document.

    Returns:
        The normalized text.
    """
    text = text.lower()  # Convert text to lowercase
    # URLs must be removed before punctuation: once "://" and "." have been
    # replaced by spaces the URL pattern can no longer match anything.
    text = remove_urls(text)  # Remove URLs
    text = remove_punctuation(text)  # Remove punctuation
    # Collapse whitespace last, because the two steps above insert spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def remove_punctuation(text):
    """Return *text* with each ``string.punctuation`` character replaced by a space."""
    # One translation table maps every punctuation character to a space.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    return text.translate(punctuation_to_space)

def remove_urls(text):
    """Return *text* with every ``http://`` / ``https://`` URL match replaced by a space."""
    url_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_regex.sub(' ', text)

# Start the Dash server only when this file is executed as a script.
# NOTE(review): debug=True is presumably meant for local prototyping only;
# confirm before running this anywhere shared.
if __name__ == '__main__':
    app.run_server(debug=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
