<a href="https://colab.research.google.com/github/sohv/NLP-Lab/blob/main/Lab_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hidden Markov Models

## Cross-domain classification using HMM

In [1]:
import numpy as np
import nltk
from nltk.tag import hmm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.datasets import fetch_20newsgroups

def load_data():
    """Fetch the source-domain (tech) training set and target-domain (health) test set.

    Both fetches use 20 Newsgroups with headers, footers and quoted replies
    removed. The tech documents come from the ``train`` subset and the health
    documents from the ``test`` subset.

    Returns:
        A 4-tuple ``(tech_texts, tech_targets, health_texts, health_targets)``.

    NOTE(review): each fetch is filtered to its own category list, so the
    target indices are presumably numbered independently per call (four tech
    classes vs. a single health class). If so, the two label sets do not
    share a label space and an accuracy computed across them is hard to
    interpret -- confirm against the scikit-learn documentation.
    """
    stripped_parts = ('headers', 'footers', 'quotes')

    source_domain = fetch_20newsgroups(
        subset='train',
        categories=['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware'],
        remove=stripped_parts,
    )
    target_domain = fetch_20newsgroups(
        subset='test',
        categories=['sci.med'],
        remove=stripped_parts,
    )

    return source_domain.data, source_domain.target, target_domain.data, target_domain.target

def preprocess_data(train_texts, test_texts):
    """Convert raw documents to TF-IDF features using a train-fitted vectorizer.

    The vectorizer drops English stop words and keeps at most 5000 features.
    It is fitted on ``train_texts`` only; ``test_texts`` are projected onto
    that same vocabulary.

    Args:
        train_texts: Documents used to fit the vectorizer.
        test_texts: Documents transformed with the fitted vectorizer.

    Returns:
        ``(train_matrix, test_matrix, vectorizer)``.
    """
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)
    return train_matrix, test_matrix, tfidf

def train_hmm(X_train, y_train):
    """Fit a classifier on label-encoded targets.

    Despite the function name, the model built here is a ``MultinomialNB``;
    no hidden Markov model is constructed in this function.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels; they are passed through a ``LabelEncoder``.

    Returns:
        ``(model, label_encoder)`` -- the fitted classifier and the encoder
        needed to map its predictions back to the original labels.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(y_train)

    classifier = MultinomialNB()
    classifier.fit(X_train, encoded_targets)

    return classifier, encoder

def evaluate_model(model, X_test, y_test, le):
    """Print the accuracy of ``model`` on the test set.

    Predictions are decoded with ``le.inverse_transform`` before being
    compared with ``y_test``. Nothing is returned; the score is only printed.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: Reference labels for the test documents.
        le: Label encoder used to decode the model's predictions.
    """
    decoded_predictions = le.inverse_transform(model.predict(X_test))
    accuracy = accuracy_score(y_test, decoded_predictions)
    print(f"Cross-Domain Classification Accuracy: {accuracy:.2f}")

def main():
    """Run the cross-domain experiment: load, vectorize, train, then score."""
    source_docs, source_targets, target_docs, target_targets = load_data()

    # The fitted vectorizer is returned by preprocess_data but not needed here.
    train_matrix, test_matrix, _ = preprocess_data(source_docs, target_docs)

    classifier, encoder = train_hmm(train_matrix, source_targets)
    evaluate_model(classifier, test_matrix, target_targets, encoder)

# Entry point when the cell/script is executed directly.
if __name__ == "__main__":
    main()

Cross-Domain Classification Accuracy: 0.40


## Hybrid HMM-Naive Bayes for text classification

In [2]:
import numpy as np
import nltk
from nltk.tag import hmm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups

def load_data():
    """Fetch the two-class 20 Newsgroups corpus used by the hybrid experiment.

    Loads every document (``subset='all'``) from ``comp.graphics`` and
    ``sci.med`` with headers, footers and quoted replies removed.

    Returns:
        ``(texts, targets)`` as provided by the dataset bunch.
    """
    corpus = fetch_20newsgroups(
        subset='all',
        categories=['comp.graphics', 'sci.med'],
        remove=('headers', 'footers', 'quotes'),
    )
    return corpus.data, corpus.target

def preprocess_data(texts):
    """Fit a TF-IDF vectorizer on ``texts`` and return the resulting features.

    The vectorizer removes English stop words and keeps at most 5000 features.

    Args:
        texts: Documents to fit on and transform.

    Returns:
        ``(feature_matrix, vectorizer)``.
    """
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    feature_matrix = tfidf.fit_transform(texts)
    return feature_matrix, tfidf

def train_hmm(X_train, y_train):
    """Train the first-stage classifier on label-encoded targets.

    Note that the estimator created here is a ``MultinomialNB``, not a hidden
    Markov model, even though the function and variable names say "hmm".

    Args:
        X_train: Training feature matrix.
        y_train: Training labels, encoded with a fresh ``LabelEncoder``.

    Returns:
        ``(fitted_model, label_encoder)``.
    """
    label_encoder = LabelEncoder()
    encoded_targets = label_encoder.fit_transform(y_train)

    first_stage = MultinomialNB()
    first_stage.fit(X_train, encoded_targets)

    return first_stage, label_encoder

def extract_hmm_features(hmm_model, X):
    """Return the model's class-probability output for ``X``.

    This is a thin wrapper around ``hmm_model.predict_proba``; its result is
    used as the feature representation for the second-stage classifier.

    Args:
        hmm_model: Any fitted object exposing ``predict_proba``.
        X: Inputs forwarded unchanged to ``predict_proba``.

    Returns:
        Whatever ``hmm_model.predict_proba(X)`` returns.
    """
    class_probabilities = hmm_model.predict_proba(X)
    return class_probabilities

def train_naive_bayes(X_train, y_train):
    """Fit and return a ``MultinomialNB`` classifier.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels, used as given (no encoding is applied here).

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_model(model, X_test, y_test, le, model_name):
    """Print the test accuracy of ``model`` under the label ``model_name``.

    Predictions are decoded with ``le.inverse_transform`` before being
    compared with ``y_test``. Nothing is returned; the score is only printed.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: Reference labels for the test rows.
        le: Label encoder used to decode the model's predictions.
        model_name: Text printed in front of the accuracy figure.
    """
    decoded_predictions = le.inverse_transform(model.predict(X_test))
    accuracy = accuracy_score(y_test, decoded_predictions)
    print(f"{model_name} Accuracy: {accuracy:.2f}")

def main():
    """Run the standalone and stacked ("hybrid") classification experiment.

    Steps:
        1. Load the two-class corpus and split the raw documents 70/30.
        2. Fit TF-IDF on the training documents only, then transform the
           held-out documents with that fitted vectorizer.
        3. Train the first-stage model and report its accuracy.
        4. Use the first-stage class probabilities as features for a second
           Naive Bayes model and report its accuracy.

    Because the vectorizer no longer sees the test documents while fitting,
    the printed accuracies may differ slightly from figures obtained with a
    vectorizer fitted on the full corpus.
    """
    texts, labels = load_data()

    # Split BEFORE vectorizing: fitting TF-IDF on the full corpus would let
    # vocabulary and IDF statistics from the test documents leak into training.
    train_texts, test_texts, y_train, y_test = train_test_split(
        texts, labels, test_size=0.3, random_state=42
    )
    X_train, vectorizer = preprocess_data(train_texts)
    X_test = vectorizer.transform(test_texts)

    hmm_model, le = train_hmm(X_train, y_train)
    evaluate_model(hmm_model, X_test, y_test, le, "Standalone HMM")

    hmm_features_train = extract_hmm_features(hmm_model, X_train)
    hmm_features_test = extract_hmm_features(hmm_model, X_test)

    # evaluate_model decodes predictions with le.inverse_transform, so the
    # second-stage model must be trained on encoded labels as well; training
    # on raw labels only works when the encoding happens to be the identity.
    nb_model = train_naive_bayes(hmm_features_train, le.transform(y_train))
    evaluate_model(nb_model, hmm_features_test, y_test, le, "Hybrid HMM-Naïve Bayes")

# Entry point when the cell/script is executed directly.
if __name__ == "__main__":
    main()

Standalone HMM Accuracy: 0.96
Hybrid HMM-Naïve Bayes Accuracy: 0.96
