<a href="https://colab.research.google.com/github/sohv/NLP-Lab/blob/main/Lab_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hidden Markov Models

## Text-classification using HMM

In [3]:
import numpy as np  
from hmmlearn import hmm  
from sklearn.feature_extraction.text import CountVectorizer  

texts = ["I love programming", "Python is great", "I hate bugs", "Debugging is hard"]  
labels = [1, 1, 0, 0]  

vectorizer = CountVectorizer(binary=True)  
X = vectorizer.fit_transform(texts).toarray()  

lengths = [len(seq) for seq in X]

X_reshaped = np.concatenate(X).reshape(-1, 1)

model = hmm.GaussianHMM(n_components=2, covariance_type="diag", n_iter=100)
model.fit(X_reshaped, [sum(lengths)]) 

# predict sentiment
preds = model.predict(X_reshaped).reshape(len(texts), -1)[:, 0] 
print(preds)

[0 0 1 0]


## Cross-domain classification using HMM

In [1]:
import numpy as np
import nltk
from nltk.tag import hmm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.datasets import fetch_20newsgroups

def load_data():
    categories_tech = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
    categories_health = ['sci.med']

    tech_data = fetch_20newsgroups(subset='train', categories=categories_tech, remove=('headers', 'footers', 'quotes'))
    health_data = fetch_20newsgroups(subset='test', categories=categories_health, remove=('headers', 'footers', 'quotes'))

    return tech_data.data, tech_data.target, health_data.data, health_data.target

def preprocess_data(train_texts, test_texts):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

def train_hmm(X_train, y_train):
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    model = MultinomialNB()
    model.fit(X_train, y_train_encoded)
    return model, le

def evaluate_model(model, X_test, y_test, le):
    y_pred = model.predict(X_test)
    y_pred_labels = le.inverse_transform(y_pred)
    accuracy = accuracy_score(y_test, y_pred_labels)
    print(f"Cross-Domain Classification Accuracy: {accuracy:.2f}")

def main():
    train_texts, train_labels, test_texts, test_labels = load_data()
    X_train, X_test, vectorizer = preprocess_data(train_texts, test_texts)
    model, le = train_hmm(X_train, train_labels)
    evaluate_model(model, X_test, test_labels, le)

if __name__ == "__main__":
    main()

Cross-Domain Classification Accuracy: 0.40


In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from hmmlearn.hmm import GaussianHMM

# load tech and health documents
tech = fetch_20newsgroups(subset='train', categories=['comp.graphics'], remove=('headers', 'footers', 'quotes'))
health = fetch_20newsgroups(subset='test', categories=['sci.med'], remove=('headers', 'footers', 'quotes'))

# TF-IDF
vec = TfidfVectorizer(max_features=50)
X_all = vec.fit_transform(tech.data + health.data).toarray()
X_tech = X_all[:len(tech.data)]
X_health = X_all[len(tech.data):]

def repeat_seq(X, times=10):
    return np.vstack([np.tile(x, (times, 1)) for x in X]), [times] * len(X)

X_train, lengths = repeat_seq(X_tech, times=5)

model = GaussianHMM(n_components=5, covariance_type='diag', n_iter=100).fit(X_train, lengths)

scores = []
for x in X_health:
    x_rep = np.tile(x, (5, 1))
    try:
        scores.append(model.score(x_rep))
    except:
        scores.append(None)

valid_scores = [s for s in scores if s is not None]
print(f"HMM successfully scored {len(valid_scores)}/{len(scores)} health docs.")
#print("Sample scores:", valid_scores[:5])

HMM successfully scored 396/396 health docs.


## Hybrid HMM-Naive Bayes for text classification

In [4]:
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from hmmlearn import hmm

nltk.download('punkt', quiet=True)
categories = ['comp.graphics', 'sci.med']
data = fetch_20newsgroups(subset='train', categories=categories, remove=('headers','footers','quotes'))
texts = data.data
labels = data.target  

# Tokenize
tokens = [nltk.word_tokenize(doc.lower())[:30] for doc in texts]  # limit to 30 words
tokens = [t for t in tokens if len(t) >= 5]  # filter short ones
labels = [labels[i] for i in range(len(tokens))]  # align labels

le = LabelEncoder().fit(sum(tokens, []))
seqs = [le.transform([w for w in t if w in le.classes_]) for t in tokens]
lens = [len(s) for s in seqs]
X = np.concatenate(seqs).reshape(-1, 1)

model = hmm.GaussianHMM(n_components=5, n_iter=100).fit(X, lengths=lens)
posteriors = []
start = 0
for l in lens:
    probs = model.predict_proba(X[start:start+l])
    posteriors.append(probs.mean(axis=0)) 
    start += l

X_train, X_test, y_train, y_test = train_test_split(posteriors, labels, test_size=0.3, random_state=42)

nb = GaussianNB().fit(X_train, y_train)
hybrid_preds = nb.predict(X_test)

state_class_map = {}
for state in range(model.n_components):
    state_class_map[state] = max(set(y_train), key=lambda c: sum(
        1 for i, x in enumerate(X_train) if np.argmax(x) == state and y_train[i] == c
    ))

hmm_preds = [state_class_map[np.argmax(x)] for x in X_test]

print("Hybrid HMM+NaiveBayes accuracy:", accuracy_score(y_test, hybrid_preds))
print("Standalone HMM accuracy:", accuracy_score(y_test, hmm_preds))

Hybrid HMM+NaiveBayes accuracy: 0.5481049562682215
Standalone HMM accuracy: 0.5335276967930029
