<a href="https://colab.research.google.com/github/sagar-maheshwari01/python.assignment/blob/main/SVM_and_NAVIE_BAYES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

def _print_document_term_matrix(vectorizer, documents, matrix_title) -> None:
    """
    Fit ``vectorizer`` on ``documents`` and print the resulting matrix.

    Shared by the Bag-of-Words and TF-IDF demonstrations so both sections
    go through the same fit -> DataFrame -> print steps.

    Args:
        vectorizer: An unfitted vectorizer object providing ``fit_transform``
            and ``get_feature_names_out`` (here a ``CountVectorizer`` or a
            ``TfidfVectorizer``).
        documents: The text documents to vectorize, one string per document.
        matrix_title: Heading printed (followed by a colon) directly above
            the matrix.
    """
    # fit_transform learns the vocabulary from the documents and then
    # transforms them into a document-term matrix in a single step.
    matrix = vectorizer.fit_transform(documents)

    # The unique words of the learned vocabulary; one matrix column each.
    vocabulary = vectorizer.get_feature_names_out()

    # Wrap the matrix in a DataFrame purely for readability: rows are
    # labelled "Document 1..N" and columns carry the vocabulary words.
    row_labels = [f"Document {number}" for number in range(1, len(documents) + 1)]
    frame = pd.DataFrame(matrix.toarray(), index=row_labels, columns=vocabulary)

    print("Vocabulary (unique words):", vocabulary)
    print(f"\n{matrix_title}:")
    print(frame)


def run_word_vectorization() -> None:
    """
    Demonstrate converting text documents into numerical word vectors.

    A small built-in corpus is printed and then vectorized twice: once with
    Bag-of-Words (Count Vectorization) and once with TF-IDF. For each method
    the learned vocabulary and the document-term matrix are printed.
    Takes no arguments and returns nothing; all results go to standard
    output. This code is suitable for execution in Google Colab.
    """
    print("Starting word vectorization demonstration...")

    # Sample text documents forming a small corpus.
    documents = [
        "The quick brown fox jumps over the lazy dog.",
        "Never jump over a lazy dog again.",
        "The dog is brown and quick.",
        "A quick brown fox.",
        "The fox is quick.",
    ]
    print("\n--- Original Documents ---")
    for number, doc in enumerate(documents, start=1):
        print(f"Document {number}: {doc}")

    # --- 1. Bag-of-Words (Count Vectorization) ---
    # Counts the occurrences of each word in each document. Rows of the
    # matrix are documents, columns are unique words, cells are word counts.
    # stop_words='english' removes common English words (like 'the', 'is')
    # that often don't carry much meaning; lowercase=True converts all text
    # to lowercase before tokenizing.
    print("\n--- Bag-of-Words (Count Vectorization) ---")
    _print_document_term_matrix(
        CountVectorizer(stop_words='english', lowercase=True),
        documents,
        "Bag-of-Words Matrix",
    )

    # --- 2. TF-IDF (Term Frequency-Inverse Document Frequency) ---
    # TF-IDF reflects how important a word is to a document in a corpus.
    # It is the product of two terms:
    #   - Term Frequency (TF): how frequently a word appears in a document.
    #   - Inverse Document Frequency (IDF): how rare or common a word is
    #     across all documents.
    # Words that are common in many documents get a lower IDF score, which
    # reduces their overall TF-IDF weight.
    # The same stop-word removal and lowercasing are applied as above.
    print("\n--- TF-IDF (Term Frequency-Inverse Document Frequency) ---")
    _print_document_term_matrix(
        TfidfVectorizer(stop_words='english', lowercase=True),
        documents,
        "TF-IDF Matrix",
    )

    print("\nWord vectorization demonstration completed.")

# Entry point: when this cell (or file) is executed directly, e.g. in
# Google Colab, the demonstration runs immediately via the call below.
if __name__ == "__main__":
    run_word_vectorization()