<a href="https://colab.research.google.com/github/scottzakrzewski/samples/blob/main/Director_of_SEO_iPullRank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from flask import Flask, render_template, request, jsonify
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Flask application object; the route defined further down is registered on it.
app = Flask(__name__)

# Credentials for the Google Custom Search API, read by google_search():
#   CSE_ID  - the Custom Search Engine ID (passed as `cx`)
#   API_KEY - the developer key (passed as `developerKey`)
# Both are placeholders and must be replaced before the app can issue searches.
# NOTE(review): consider loading secrets from the environment instead of
# committing them to source - confirm against deployment practice.
CSE_ID = "YOUR_CSE_ID"
API_KEY = "YOUR_API_KEY"

# Tokenizer and sequence-classification model used by predict_sentiment().
# Both are loaded once, at import time, from the same model identifier.
# NOTE(review): verify that this identifier resolves on the Hugging Face Hub
# and that its label order matches the ["negative", "positive"] list used in
# predict_sentiment() - TODO confirm.
model_name = "bert-base-uncased-finetuned-sst-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Embedding model handed to FAISS in create_vectorstore() for semantic search.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# LLM used by generate_answer(); the API key is a placeholder to be replaced.
# NOTE(review): confirm the model name is still offered by the provider.
llm = OpenAI(model_name="text-davinci-003", api_key="YOUR_OPENAI_API_KEY")

def google_search(search_term):
    """
    Performs a Google Custom Search and returns the result items.

    Args:
        search_term: Query string sent to the Custom Search API as `q`.

    Returns:
        The list found under the response's 'items' key, or an empty list
        when the response carries no such key.
    """
    service = build("customsearch", "v1", developerKey=API_KEY)
    response = service.cse().list(
        q=search_term,
        cx=CSE_ID,
    ).execute()
    # The response may lack 'items' (e.g. a query with no matches); treat that
    # as zero results instead of raising KeyError in the caller's request.
    return response.get('items', [])

def get_domain_authority(domain):
    """
    Fetches Domain Authority (DA) using an external API (replace with your preferred API).

    Args:
        domain: Value sent to the API as the `domain` query parameter.

    Returns:
        The value stored under 'da' in the JSON response, or None when the
        request fails, times out, or the response does not have that shape.
    """
    try:
        api_key = "YOUR_EXTERNAL_API_KEY"  # Replace with your actual API key
        # Let requests build the query string so the values are URL-encoded,
        # and bound the wait so a stalled API cannot hang the web request.
        response = requests.get(
            "https://api.example.com/da",
            params={"domain": domain, "api_key": api_key},
            timeout=10,
        )
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        return data['da']
    except (requests.exceptions.RequestException, KeyError, TypeError, ValueError) as e:
        # TypeError covers a JSON body that is not an object (e.g. a list).
        print(f"Error fetching DA for {domain}: {e}")
        return None

def get_page_title(url):
    """
    Fetches page title from the given URL.

    Args:
        url: Address of the page to download.

    Returns:
        The stripped text of the page's <title> element, or None when the
        request fails, times out, or the page has no <title>.
    """
    try:
        # Without a timeout requests waits indefinitely on an unresponsive
        # host, which would block the Flask worker handling the search.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # soup.find returns None when there is no <title>; the resulting
        # AttributeError is handled below.
        title = soup.find('title').text.strip()
        return title
    except (requests.exceptions.RequestException, AttributeError) as e:
        print(f"Error fetching page title for {url}: {e}")
        return None

def get_page_content(url):
    """
    Fetches and extracts relevant text content from the given URL.

    Args:
        url: Address of the page to download.

    Returns:
        The text of every <p> element joined with single spaces (an empty
        string when the page has no paragraphs), or None when the request
        fails or times out.
    """
    try:
        # Without a timeout requests waits indefinitely on an unresponsive
        # host, which would block the Flask worker handling the search.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Only paragraph text is treated as page content.
        text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return text
    except (requests.exceptions.RequestException, AttributeError) as e:
        print(f"Error fetching page content for {url}: {e}")
        return None

def calculate_content_similarity(search_term, page_content):
    """
    Calculates the similarity between the search term and the page content using TF-IDF.

    Args:
        search_term: The user's query text.
        page_content: Text extracted from a page; may be None or empty when
            the page could not be fetched.

    Returns:
        Cosine similarity of the two TF-IDF vectors as a float, or 0.0 when
        either text is missing/empty or no vocabulary can be built from them.
    """
    # get_page_content() returns None on failure; there is nothing to compare.
    if not search_term or not page_content:
        return 0.0

    corpus = [search_term, page_content]
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(corpus)
    except ValueError:
        # Raised when neither text yields any token (empty vocabulary).
        return 0.0
    similarity_score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    return float(similarity_score)

def predict_sentiment(text):
    """
    Predicts the sentiment of the given text using a pre-trained sentiment analysis model.

    Args:
        text: The text to classify; may be None or blank when no page
            content was available.

    Returns:
        "negative" or "positive", or None when there is no text to classify.
    """
    # Nothing to classify (e.g. the page fetch failed upstream).
    if not text or not text.strip():
        return None

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: skip gradient tracking to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
    labels = ["negative", "positive"]  # Assuming a binary sentiment classification model
    return labels[predicted_label_idx]

def create_vectorstore(documents):
    """
    Creates a FAISS vectorstore for efficient semantic search.

    Args:
        documents: Either a list of document objects accepted by
            FAISS.from_documents, or a list of plain dicts of the form
            {"text": ..., "metadata": {...}}. Dicts without text are skipped.

    Returns:
        A FAISS vectorstore built with the module-level `embeddings`.

    Raises:
        ValueError: If a list of dicts is given but none of them has text.
    """
    if documents and all(isinstance(doc, dict) for doc in documents):
        # Plain dicts have no page_content/metadata attributes, so they cannot
        # go through from_documents; index their text and metadata directly.
        usable = [doc for doc in documents if doc.get("text")]
        if not usable:
            raise ValueError("No documents with text content to index")
        texts = [doc["text"] for doc in usable]
        metadatas = [doc.get("metadata") or {} for doc in usable]
        return FAISS.from_texts(texts, embeddings, metadatas=metadatas)

    return FAISS.from_documents(documents, embeddings)

def get_relevant_documents(vectorstore, query, k=3):
    """
    Retrieves relevant documents from the vectorstore based on the query.

    Args:
        vectorstore: Object exposing `similarity_search(query, k=...)`.
        query: Text to search for.
        k: Number of documents to request; defaults to 3.

    Returns:
        Whatever `vectorstore.similarity_search` returns for the query.
    """
    return vectorstore.similarity_search(query, k=k)

def generate_answer(query, relevant_documents):
    """
    Generates an answer to the query using an LLM and relevant documents.

    Args:
        query: The question to answer.
        relevant_documents: Documents to use as context; presumably the
            LangChain documents returned by get_relevant_documents().

    Returns:
        The answer produced by the module-level `llm`.
    """
    # Imported here so the block is self-contained; langchain is already a
    # dependency of this file.
    from langchain.chains.question_answering import load_qa_chain

    # No vectorstore is in scope inside this function, so build the "stuff"
    # chain directly over the documents that were already retrieved instead
    # of going through a retriever.
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    answer = qa_chain.run(input_documents=relevant_documents, question=query)
    return answer

@app.route('/', methods=['GET', 'POST'])
def index():
    """
    Serves the search form (GET) and the enriched search results (POST).

    On POST the 'search' form field is sent to Google Custom Search; every
    result is annotated with domain authority, page title, page content,
    TF-IDF similarity and sentiment, and an LLM answer is generated from the
    pages whose content could be fetched.

    Returns:
        The rendered 'index.html' for GET or a blank query, otherwise the
        rendered 'results.html' with `results` and `answer` (None when no
        page content was available to answer from).
    """
    if request.method != 'POST':
        return render_template('index.html')

    search_term = request.form['search'].strip()
    if not search_term:
        # A blank query has nothing to search for; show the form again.
        return render_template('index.html')

    results = google_search(search_term)

    # Enhance results with additional data
    for result in results:
        link = result['link']

        # The pattern keeps scheme and host (e.g. 'https://example.com');
        # a link that does not match simply gets no DA value.
        domain_match = re.match(r'^(?:http|https)://[^/]+', link)
        result['da'] = get_domain_authority(domain_match.group(0)) if domain_match else None

        # Keep the title from the search result when the page itself cannot
        # be fetched or has no <title>.
        result['title'] = get_page_title(link) or result.get('title')

        content = get_page_content(link)
        result['content'] = content
        if content:
            result['similarity'] = calculate_content_similarity(search_term, content)
            result['sentiment'] = predict_sentiment(content)
        else:
            # The page could not be fetched or has no paragraph text.
            result['similarity'] = None
            result['sentiment'] = None

    # Create a list of documents for vectorstore; pages without content
    # cannot be embedded and are left out.
    documents = [
        {"text": result['content'], "metadata": {"title": result['title'], "url": result['link']}}
        for result in results
        if result['content']
    ]

    answer = None
    if documents:
        # Create vectorstore
        vectorstore = create_vectorstore(documents)

        # Get relevant documents for the search term
        relevant_docs = get_relevant_documents(vectorstore, search_term)

        # Generate an answer using LLM and relevant documents
        answer = generate_answer(search_term, relevant_docs)

    return render_template('results.html', results=results, answer=answer)

# Start the Flask server only when this file is executed directly.
# NOTE(review): debug=True is meant for local development; Flask's
# documentation warns against enabling it on a publicly reachable server -
# confirm this entry point is never used in production.
if __name__ == '__main__':
    app.run(debug=True)