In [None]:
# imports (plus the NLTK corpus download required by the data-loading cell)


import logging
import os
import secrets
import threading
import time

import nltk
import numpy as np
import requests
from flask import Flask, request, jsonify, session
from flask_session import Session
from gensim.models import Word2Vec
from nltk.corpus import movie_reviews
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
# download the 'movie_reviews' corpus so that nltk.corpus.movie_reviews
# (imported above) has data to read
nltk.download('movie_reviews')


In [None]:
# read the raw text of every review in the movie_reviews corpus
documents = list(map(movie_reviews.raw, movie_reviews.fileids()))

# tokenize: lowercase each review and split it on whitespace
tokenized_docs = [review.lower().split() for review in documents]

# fit a 100-dimensional Word2Vec model on the tokenized reviews
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# one embedding per review: the mean of the vectors of its in-vocabulary tokens
document_embeddings = np.array([
    np.mean(
        [w2v_model.wv[token] for token in tokens if token in w2v_model.wv],
        axis=0,
    )
    for tokens in tokenized_docs
])

# document "database": positional index -> raw review text
vector_db = dict(enumerate(documents))

In [None]:
# configure logging: INFO and above goes both to flask_app.log and to the stream handler
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    handlers=[
        logging.FileHandler("flask_app.log"),
        logging.StreamHandler(),
    ],
)

# initialize the Flask app
app = Flask(__name__)
# Do not hard-code the signing secret in the notebook: take it from the
# FLASK_SECRET_KEY environment variable, and fall back to a fresh random key
# when it is unset (existing session cookies then stop validating after a
# restart, which is the safe default).
app.config['SECRET_KEY'] = os.environ.get('FLASK_SECRET_KEY') or secrets.token_hex(32)
app.config['SESSION_TYPE'] = 'filesystem'
Session(app)

# initialize the text generation pipeline, the LLM
text_generator = pipeline("text2text-generation", model="google/flan-t5-large")

# function to get the embedding of a query
def get_embedding(text, model=None):
    """Embed a piece of text as the mean of its in-vocabulary word vectors.

    The text is lowercased and split on whitespace, mirroring how the
    documents were tokenized before training.

    Args:
        text: The query string. ``None`` is treated like an empty string.
        model: Optional object exposing a ``.wv`` keyed-vector store
            (supports ``in``, ``[]`` and ``.vector_size``). Defaults to the
            notebook-level ``w2v_model``.

    Returns:
        A NumPy array of shape ``(1, vector_size)``. When none of the words
        are in the vocabulary (or the text is empty) an all-zero vector is
        returned instead of NaN, so downstream similarity code still receives
        a correctly shaped, finite input.
    """
    keyed_vectors = (w2v_model if model is None else model).wv
    tokens = (text or "").lower().split()
    known_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    if not known_vectors:
        # np.mean([]) would yield a scalar NaN that reshapes to (1, 1) and
        # breaks the similarity computation; use a neutral zero vector instead.
        return np.zeros((1, keyed_vectors.vector_size), dtype=np.float32)

    return np.mean(known_vectors, axis=0).reshape(1, -1)

@app.route('/chat', methods=['POST'])
def chat():
    """Answer a user message using retrieved reviews plus the conversation so far.

    Expects a JSON body of the form ``{"message": "<text>"}``.

    Returns:
        ``{"response": <generated text>}`` on success, or
        ``{"error": ...}`` with HTTP status 400 when the body is not JSON or
        does not carry a non-empty string under ``message``.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body,
    # so every malformed request is reported through the same 400 path below.
    payload = request.get_json(silent=True) or {}
    user_message = payload.get('message') if isinstance(payload, dict) else None
    if not isinstance(user_message, str) or not user_message.strip():
        logging.warning("Rejected chat request without a usable 'message' field.")
        return jsonify({'error': "Request body must be JSON with a non-empty string 'message'."}), 400
    logging.info(f"Received message: {user_message}")

    # Build the updated history on a copy and assign it back at the end:
    # in-place mutation of a list stored in the session is not detected
    # automatically, so the change might otherwise never be persisted.
    conversation = list(session.get('conversation', []))
    conversation.append(user_message)

    # convert query to embedding
    query_embedding = get_embedding(user_message)

    # compute cosine similarity between query and document embeddings
    similarities = cosine_similarity(query_embedding, document_embeddings)

    # retrieve the five most similar documents, best match first
    top_k_indices = np.argsort(similarities[0])[-5:][::-1]
    retrieved_docs = [vector_db[idx] for idx in top_k_indices]

    # concatenate retrieved documents and the conversation (including this message)
    input_text = " ".join(retrieved_docs) + " " + " ".join(conversation)

    # generate response
    response = text_generator(input_text)
    generated_text = response[0]['generated_text']
    logging.info(f"Generated response: {generated_text}")

    # store both the user message and the bot response in the session
    conversation.append(generated_text)
    session['conversation'] = conversation

    return jsonify({'response': generated_text})

@app.route('/reset', methods=['POST'])
def reset():
    """Drop the stored conversation from the current session and confirm it."""
    # pop with a default so a session without any history is not an error
    session.pop('conversation', None)

    confirmation = "Conversation reset."
    logging.info(confirmation)

    body = {'message': confirmation}
    return jsonify(body)

# entry point that launches the Flask app
def run_app():
    """Run the Flask app on port 5000 with debug mode disabled."""
    server_options = {"port": 5000, "debug": False}
    app.run(**server_options)



