1) data extraction

In [134]:
import requests
from bs4 import BeautifulSoup

def extract_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

2) data cleaning

In [135]:
import nltk
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('punkt')         # For tokenization
nltk.download('stopwords')     # For stopwords
nltk.download('wordnet')       # For lemmatization
nltk.download('omw-1.4')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_content(text):
    text = text.lower() #  Convert to lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)     # Remove URLs

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Remove non-alphanumeric characters (punctuation, special characters)
    tokens = word_tokenize(text, language='english', preserve_line=True)    # Tokenize the text

    stop_words = set(stopwords.words('english'))        # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    tokens = [lemmatizer.lemmatize(word) for word in tokens]    # Lemmatize tokens
    
    return tokens


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


3) 

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_text(texts):
    vectorizer = TfidfVectorizer() # converts text doc to matrix
    return vectorizer.fit_transform(texts)

In [137]:
from sklearn.cluster import KMeans

def cluster_keywords(features, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(features)
    return kmeans.labels_


In [138]:
from sklearn.feature_extraction.text import TfidfVectorizer

def generate_important_recommendations(user_keywords, competitor_keywords):
    # Join keywords into documents (texts) for TF-IDF processing
    user_text = " ".join(user_keywords)     # list -> 1 string -> 1 doc
    competitor_text = " ".join(competitor_keywords)
    
    # Use TF-IDF Vectorizer to get importance scores
    vectorizer = TfidfVectorizer()      # creating a vectorizer obj
    tfidf_matrix = vectorizer.fit_transform([user_text, competitor_text])       # combines the strings -> 
    print(tfidf_matrix)     # fit leanrs stats that is need for transforming the data,      # transform applies TF-IDF on the data
    
    # Extract TF-IDF scores for the competitor's keywords
    feature_names = vectorizer.get_feature_names_out()          # get the list of unique features learnt by the vectorizer obj durting fit
    competitor_tfidf_scores = tfidf_matrix[1].toarray()[0]  # Competitor's TF-IDF row
    # torray give [[all the TF-IDF vals]]
    
    # Create a dictionary of words and their TF-IDF scores for the competitor
    tfidf_scores = {feature_names[i]: competitor_tfidf_scores[i] for i in range(len(feature_names))}
    
    # Sort words by TF-IDF score in descending order (high to low importance)
    important_keywords = sorted(tfidf_scores, key=tfidf_scores.get, reverse=True)
    
    # Generate recommendations for high-importance keywords missing from user content
    recommendations = []
    for keyword in important_keywords:
        if keyword not in user_keywords:
            recommendations.append(f"Consider adding more content about '{keyword}' to improve relevance.")
    
    return recommendations



In [139]:
def main():
    user_url = "https://en.wikipedia.org/wiki/Health"
    user_content = extract_content(user_url)
    user_keywords = preprocess_content(user_content)

    # Example competitor keywords for testing
    competitor_urls = ["https://www.wikihow.com/Be-Healthy#:~:text=Things%20You%20Should%20Know,based%20on%20your%20physical%20frame).", "https://www.wikihow.com/Category:Health"]
    competitor_contents = [extract_content(url) for url in competitor_urls]
    
    competitor_keywords = []
    for content in competitor_contents:
        competitor_keywords.extend(preprocess_content(content))
    
    # Generate recommendations
    recommendations = generate_important_recommendations(user_keywords, competitor_keywords)
    
    # Display recommendations
    print("Recommendations:")
    for recommendation in recommendations:
        print(recommendation)

if __name__ == "__main__": 
    main()


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4084 stored elements and shape (2, 3528)>
  Coords	Values
  (0, 1517)	0.7969028813707069
  (0, 3465)	0.020438306465645772
  (0, 1804)	0.0029084046765354265
  (0, 809)	0.00872521402960628
  (0, 1977)	0.06131491939693732
  (0, 2057)	0.008175322586258308
  (0, 2118)	0.016350645172516616
  (0, 2859)	0.016350645172516616
  (0, 1557)	0.011633618706141706
  (0, 2148)	0.004087661293129154
  (0, 2299)	0.004087661293129154
  (0, 1206)	0.004087661293129154
  (0, 381)	0.004087661293129154
  (0, 3466)	0.004087661293129154
  (0, 819)	0.014542023382677132
  (0, 1551)	0.004087661293129154
  (0, 1104)	0.004087661293129154
  (0, 2425)	0.004087661293129154
  (0, 656)	0.008175322586258308
  (0, 1299)	0.004087661293129154
  (0, 2780)	0.005816809353070853
  (0, 1058)	0.0029084046765354265
  (0, 346)	0.008175322586258308
  (0, 864)	0.005816809353070853
  (0, 216)	0.00872521402960628
  :	:
  (1, 649)	0.0038587698074552933
  (1, 2054)	0.003858769807