In [1]:
pip install requests beautifulsoup4 pandas openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
   ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
   ------------------- -------------------- 122.9/250.9 kB 3.6 MB/s eta 0:00:01
   ---------------------------------------  245.8/250.9 kB 5.0 MB/s eta 0:00:01
   ---------------------------------------- 250.9/250.9 kB 2.2 MB/s eta 0:00:00
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# This is to create an excel file.
"""
import pandas as pd

data = {
    "Website_URLs": [
        "https://www.example.com",
        "https://www.wikipedia.org",
        "https://www.python.org"
    ]
}

excel_file_path = "website_urls.xlsx"
df = pd.DataFrame(data)
df.to_excel(excel_file_path, index=False)
print(f"Excel file with URLs created at: {excel_file_path}")
"""

'\nimport pandas as pd\n\ndata = {\n    "Website_URLs": [\n        "https://www.example.com",\n        "https://www.wikipedia.org",\n        "https://www.python.org"\n    ]\n}\n\nexcel_file_path = "website_urls.xlsx"\ndf = pd.DataFrame(data)\ndf.to_excel(excel_file_path, index=False)\nprint(f"Excel file with URLs created at: {excel_file_path}")\n'

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to fetch and clean data
def fetch_and_clean_data(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.content, "html.parser")
        cleaned_text = soup.get_text(separator=" ", strip=True)
        return cleaned_text
    except Exception as e:
        return f"Error fetching URL {url}: {e}"

# Read URLs from the Excel file
input_file = "website_urls.xlsx"
df = pd.read_excel(input_file)

# Process each URL
cleaned_data = []
for url in df["Website_URLs"]:
    cleaned_data.append(fetch_and_clean_data(url))

# Save the cleaned data to a new Excel file
df["Cleaned_Text"] = cleaned_data
output_file = "cleaned_website_data.xlsx"
df.to_excel(output_file, index=False)
print(f"Cleaned data saved at: {output_file}")


Cleaned data saved at: cleaned_website_data.xlsx


In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Load the Excel file
file_path = "cleaned_website_data.xlsx"  # Replace with your actual file path
df = pd.read_excel(file_path)

In [5]:
# Define the preprocessing function
def preprocess_text(raw_text):
    try:
        # Step 1: Remove HTML tags and scripts
        soup = BeautifulSoup(raw_text, "html.parser")
        cleaned_text = soup.get_text(separator=" ", strip=True)

        # Step 2: Normalize whitespace
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

        # Step 3: Remove special characters
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned_text)

        # Step 4: Convert to lowercase
        cleaned_text = cleaned_text.lower()

        # Step 5 (Optional): Tokenization (if needed, uncomment below)
        # tokens = cleaned_text.split()

        # Step 6 (Optional): Remove stopwords (if needed, uncomment below)
        # from nltk.corpus import stopwords
        # stop_words = set(stopwords.words('english'))
        # tokens = [word for word in tokens if word not in stop_words]
        # cleaned_text = " ".join(tokens)

        return cleaned_text

    except Exception as e:
        return f"Error processing text: {e}"

In [10]:
# Apply preprocessing to the "Cleaned_Text" column
processed_df = df[['Cleaned_Text']].copy()
processed_df['Processed_Text'] = processed_df['Cleaned_Text'].apply(preprocess_text)

# Save the processed data to a separate Excel file
output_file = "processed_website_data.xlsx"
processed_df.to_excel(output_file, index=False)
print(f"Processed data saved to: {output_file}")


Processed data saved to: processed_website_data.xlsx


  soup = BeautifulSoup(raw_text, "html.parser")


In [6]:
### 1. Generate Embeddings

In [7]:
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load processed data
file_path = "processed_website_data.xlsx"
df = pd.read_excel(file_path)

In [8]:
# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
df['Embeddings'] = df['Processed_Text'].apply(lambda text: model.encode(text).tolist())

# Save embeddings
df.to_pickle("processed_data_with_embeddings.pkl")



In [11]:
import faiss
import numpy as np

In [12]:
# Load embeddings
df = pd.read_pickle("processed_data_with_embeddings.pkl")
embeddings = np.array(df['Embeddings'].tolist(), dtype='float32')

In [13]:
# Create and save FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance metric
index.add(embeddings)
faiss.write_index(index, "semantic_search_index.faiss")

In [14]:
# Generate query embedding
query = "Latest cricket news"
query_embedding = model.encode(query).reshape(1, -1)

In [16]:
# Search the index
D, I = index.search(query_embedding, k=5)  # Retrieve top 5 results
results = df.iloc[I[0]]
print(results[['Processed_Text']])

                                       Processed_Text
21  todays cricket match  cricket update  cricket ...
0   live cricket score schedule latest news stats ...
15  sporting news india  nba  cricket  football  t...
10  bbc sport  scores fixtures news  live sport bb...
8   the athletic uk  sports news commentary result...


#### Load a Pre-Trained NER Model

In [18]:
from transformers import pipeline

# Load pre-trained NER pipeline
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Example text for testing
sample_text = "Sachin Tendulkar played for India and scored 100 centuries in international cricket."

# Perform NER
entities = ner_pipeline(sample_text)
print(entities)




config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[{'entity': 'I-PER', 'score': 0.999526, 'index': 1, 'word': 'Sa', 'start': 0, 'end': 2}, {'entity': 'I-PER', 'score': 0.99944085, 'index': 2, 'word': '##chin', 'start': 2, 'end': 6}, {'entity': 'I-PER', 'score': 0.9989986, 'index': 3, 'word': 'Ten', 'start': 7, 'end': 10}, {'entity': 'I-PER', 'score': 0.9954986, 'index': 4, 'word': '##du', 'start': 10, 'end': 12}, {'entity': 'I-PER', 'score': 0.97261775, 'index': 5, 'word': '##lk', 'start': 12, 'end': 14}, {'entity': 'I-PER', 'score': 0.99505556, 'index': 6, 'word': '##ar', 'start': 14, 'end': 16}, {'entity': 'I-LOC', 'score': 0.99975616, 'index': 9, 'word': 'India', 'start': 28, 'end': 33}]


In [19]:
import pandas as pd

# Load the processed data
file_path = "processed_website_data.xlsx"  # Replace with your file path
df = pd.read_excel(file_path)

# Apply NER to each row
df['Entities'] = df['Processed_Text'].apply(lambda text: ner_pipeline(text))

# Save the results to a new file
df.to_excel("ner_results.xlsx", index=False)
print("NER results saved to 'ner_results.xlsx'")


NER results saved to 'ner_results.xlsx'


#### Code to Check Output After User Input

In [20]:
import pandas as pd

# Step 1: Load the NER results file
file_path = "ner_results.xlsx"  # Replace with your actual file path
df = pd.read_excel(file_path)


In [21]:
# Step 2: Function to handle user queries
def process_query(query):
    # Search for rows containing the query
    results = df[df['Processed_Text'].str.contains(query, case=False, na=False)]
    
    if results.empty:
        print("\nNo results found for your query.")
    else:
        # Display matched text and entities
        print("\nMatched Results:")
        for idx, row in results.iterrows():
            print(f"Text: {row['Processed_Text']}")
            print("Entities:")
            for entity in eval(row['Entities']):  # Convert string to list
                print(f"  - Entity: {entity['word']}, Type: {entity['entity']}, Confidence: {entity['score']:.2f}")
            print("-" * 50)

In [None]:
# Step 3: Interactive loop for user input
while True:
    user_query = input("\nEnter your query (or type 'exit' to quit): ").strip()
    if user_query.lower() == "exit":
        print("Exiting...")
        break
    process_query(user_query)