In [33]:
import pandas as pd
import re

In [47]:
import warnings
warnings.filterwarnings('ignore')

# Data loading and basic preprocessing
* removing special characters, extra spaces, and converting to lowercase

In [60]:
def load_raw_data(file_path: str) -> pd.DataFrame:
    """Load the raw dataset."""
    df = pd.read_excel(file_path)
    return df

def clean_text(text: str) -> str:
    """Clean text by removing special characters, stopwords, and lemmatizing."""
    if isinstance(text, str):
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
        text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
        words = [lemmatizer.lemmatize(word.lower()) for word in text.split() if word.lower() not in stop_words]
        return ' '.join(words)
    return text

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the dataset: clean specific columns and handle missing values."""
    for col in ['Sanskrit Anuvad', 'Hindi Anuvad', 'English Translation', 'Explanation']:
        if col in df.columns:
            df[col] = df[col].fillna("").apply(clean_text)  # Replace NaN with an empty string
    return df

def save_processed_data(df: pd.DataFrame, file_path: str):
    """Save the processed dataset to a file."""
    df.to_csv(file_path, index=False)

if __name__ == "__main__":
    # File paths
    input_file = "data/raw/verses_original.csv"
    output_file = "data/processed/verses_processed.csv"

    # Data processing steps
    raw_data = load_raw_data('data/raw/temp_dataset.xlsx')
    processed_data = preprocess_data(raw_data)
    save_processed_data(processed_data, output_file)
    # print(f"Processed data saved to {output_file}")


NameError: name 'stop_words' is not defined

In [None]:
processed_data.head()

# generating embeddings for content-nased filetering using TF-IDF

In [67]:

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle

def load_processed_data(file_path: str) -> pd.DataFrame:
    """Load the processed dataset."""
    return pd.read_csv(file_path)

def generate_tfidf_embeddings(df: pd.DataFrame, column: str) -> np.ndarray:
    """Generate TF-IDF embeddings for a specific column."""
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_df=0.8,   # Ignore terms that appear in more than 80% of documents
        min_df=2,     # Ignore terms that appear in fewer than 2 documents
        ngram_range=(1, 2)  # Use unigrams and bigrams
    )
    embeddings = vectorizer.fit_transform(df[column].fillna(""))
    return embeddings, vectorizer

def generate_embeddings(text_list):
    return model.encode(text_list, convert_to_tensor=True)

def save_embeddings(embeddings, model, embeddings_file: str, model_file: str):
    """Save the embeddings and TF-IDF model."""
    np.save(embeddings_file, embeddings.toarray())
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)

if __name__ == "__main__":
    # File paths
    input_file = "data/processed/verses_processed.csv"
    embeddings_file = "models/content_based/verse_embeddings.npy"
    model_file = "models/content_based/tfidf_model.pkl"

    # Load processed data
    processed_data = load_processed_data(input_file)

    # Generate embeddings
    embeddings, tfidf_model = generate_embeddings(processed_data, 'Explanation')

    # Save embeddings and model
    save_embeddings(embeddings, tfidf_model, embeddings_file, model_file)
    print(f"Embeddings saved to {embeddings_file}")
    print(f"TF-IDF model saved to {model_file}")


TypeError: generate_embeddings() takes 1 positional argument but 2 were given

# content based

In [65]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity

def load_embeddings(embeddings_file: str, model_file: str):
    """Load pre-saved embeddings and the TF-IDF model."""
    embeddings = np.load(embeddings_file)
    with open(model_file, 'rb') as f:
        model = pickle.load(f)
    return embeddings, model

def find_similar_verses(user_input: str, tfidf_model, embeddings, data_file: str, top_n: int = 5):
    """Find top-N similar verses to the user's input."""
    # Load the processed dataset
    data = pd.read_csv(data_file)
    
    # Combine multiple fields for similarity (e.g., Explanation + Keywords Tags)
    combined_texts = data['Explanation'] + " " + data['Keywords Tags']
    embeddings = tfidf_model.fit_transform(combined_texts.fillna(""))

    # Transform user input
    user_vector = tfidf_model.transform([user_input])
    
    # Calculate cosine similarity
    similarities = cosine_similarity(user_vector, embeddings).flatten()

    # Get top-N similar verses
    top_indices = similarities.argsort()[-top_n:][::-1]
    recommendations = data.iloc[top_indices]
    recommendations['similarity_score'] = similarities[top_indices]

    return recommendations[['Title', 'Chapter', 'Verse', 'Explanation', 'similarity_score']]

if __name__ == "__main__":
    # File paths
    embeddings_file = "models/content_based/verse_embeddings.npy"
    model_file = "models/content_based/tfidf_model.pkl"
    data_file = "data/processed/verses_processed.csv"
    
    # User input
    # user_input = "I am facing moral dilemmas in my decisions at work."
    user_input = "I am not feeling happy because of personal relationship issues"
    
    # Load embeddings and TF-IDF model
    embeddings, tfidf_model = load_embeddings(embeddings_file, model_file)
    
    # Get recommendations
    recommendations = find_similar_verses(user_input, tfidf_model, embeddings, data_file)
    print(recommendations)


                    Title    Chapter       Verse  \
18  Arjuna's Vishada Yoga  Chapter 1  Verse 1.19   
8   Arjuna's Vishada Yoga  Chapter 1   Verse 1.9   
1   Arjuna's Vishada Yoga  Chapter 1   Verse 1.2   
2   Arjuna's Vishada Yoga  Chapter 1   Verse 1.3   
3   Arjuna's Vishada Yoga  Chapter 1   Verse 1.4   

                                          Explanation  similarity_score  
18                                                NaN               0.0  
8   duryodhana emphasizes that many other brave wa...               0.0  
1   sanjay describes how duryodhana upon seeing th...               0.0  
2   duryodhana points out to dronacharya the great...               0.0  
3   duryodhana highlights the presence of great wa...               0.0  


In [66]:
recommendations.head()

Unnamed: 0,Title,Chapter,Verse,Explanation,similarity_score
18,Arjuna's Vishada Yoga,Chapter 1,Verse 1.19,,0.0
8,Arjuna's Vishada Yoga,Chapter 1,Verse 1.9,duryodhana emphasizes that many other brave wa...,0.0
1,Arjuna's Vishada Yoga,Chapter 1,Verse 1.2,sanjay describes how duryodhana upon seeing th...,0.0
2,Arjuna's Vishada Yoga,Chapter 1,Verse 1.3,duryodhana points out to dronacharya the great...,0.0
3,Arjuna's Vishada Yoga,Chapter 1,Verse 1.4,duryodhana highlights the presence of great wa...,0.0
