<a href="https://colab.research.google.com/github/stillrahim/jupyter-exploration/blob/main/L04_IbrahimBah_ITAI_2373.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# L04_IbrahimBah_ITAI_2373.ipynb
# Text Representation Lab

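# =====================
# Setup
# =====================
# NOTE: On Colab, installing gensim can replace the preinstalled NumPy with an
# older release (the captured output of this cell shows NumPy 1.26.4 and a
# "numpy.dtype size changed" ValueError). If that happens, restart the runtime
# after the install finishes and run this cell again.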
!pip install scikit-learn gensim --quiet

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec

# Example dataset (you can replace with your own)
texts = [
    "The dog barked at the mailman",
    "The cat meowed loudly",
    "The mailman delivered the package",
    "Dogs and cats are great pets",
    "I love my pet dog",
    "Cats are very independent animals",
]
# Class ids: 0 = dog, 1 = cat, 2 = mailman (for demo purposes).
# labels[i] is the class of texts[i]. The mailman-only sentence carries class 2;
# the sentence mentioning both dogs and cats is ambiguous and is filed under 0.
labels = [0, 1, 2, 0, 0, 1]

# Train-test split
# A fixed random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.3, random_state=42
)

# =====================
# Bag of Words
# =====================
# Features are raw token counts; the classifier is a logistic regression.
print("=== Bag of Words ===")
bow_vectorizer = CountVectorizer()
# The vocabulary is learned from the training split, then reused unchanged
# to encode the test split.
X_train_bow, X_test_bow = (
    bow_vectorizer.fit_transform(X_train),
    bow_vectorizer.transform(X_test),
)

# fit() returns the estimator itself, so construction and training chain.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)
y_pred_bow = clf_bow.predict(X_test_bow)

print("Accuracy (BOW):", accuracy_score(y_test, y_pred_bow))
print(classification_report(y_test, y_pred_bow))

# =====================
# TF-IDF
# =====================
# Same pipeline as the count-based model, with TF-IDF weighted features.
print("\n=== TF-IDF ===")
tfidf_vectorizer = TfidfVectorizer()
# Fit on the training split only; the test split is encoded with the
# already-fitted vectorizer.
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

# fit() returns the estimator itself, so construction and training chain.
clf_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print(classification_report(y_test, y_pred_tfidf))

# =====================
# N-grams
# =====================
# ngram_range=(1, 2) makes the vectorizer count single tokens and adjacent
# token pairs.
print("\n=== N-grams (Bigrams) ===")
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
# Fit on the training split only; the test split is encoded with the
# already-fitted vectorizer.
X_train_ngram, X_test_ngram = (
    ngram_vectorizer.fit_transform(X_train),
    ngram_vectorizer.transform(X_test),
)

# fit() returns the estimator itself, so construction and training chain.
clf_ngram = LogisticRegression(max_iter=1000).fit(X_train_ngram, y_train)
y_pred_ngram = clf_ngram.predict(X_test_ngram)

print("Accuracy (N-grams):", accuracy_score(y_test, y_pred_ngram))
print(classification_report(y_test, y_pred_ngram))

# =====================
# Word Embeddings (Word2Vec)
# =====================
print("\n=== Word2Vec ===")

# Tokenize for Word2Vec
# Only the training split is used to learn the embeddings, matching the
# vectorizers earlier in this cell, which are fitted on X_train alone.
# Training on the full `texts` list would let the held-out test sentences
# shape the features they are later evaluated on.
sentences = [t.lower().split() for t in X_train]

# A fixed seed together with a single worker thread removes the run-to-run
# variation caused by random initialisation and thread scheduling.
# NOTE(review): gensim documents that exact reproducibility across interpreter
# launches additionally requires setting PYTHONHASHSEED.
w2v_model = Word2Vec(
    sentences,
    vector_size=50,
    window=5,
    min_count=1,  # keep every token; the corpus is tiny
    workers=1,
    seed=42,
)

def get_avg_vector(text, model):
    """Return the mean embedding of the known tokens in *text*.

    The text is lower-cased and split on whitespace. Tokens that are not
    present in ``model.wv`` are ignored.

    Args:
        text: Sentence to embed.
        model: Object exposing ``wv`` (token -> vector lookup supporting
            ``in``) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of the known tokens, or a zero
        vector of length ``model.vector_size`` when no token is known.
    """
    embeddings = model.wv
    known = []
    for token in text.lower().split():
        if token in embeddings:
            known.append(embeddings[token])

    # Nothing to average: fall back to an all-zero vector of the right size.
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Turn each sentence into one fixed-length vector (average of its word
# vectors) and stack them into a 2-D feature matrix per split.
X_train_w2v, X_test_w2v = (
    np.array([get_avg_vector(doc, w2v_model) for doc in split])
    for split in (X_train, X_test)
)

# fit() returns the estimator itself, so construction and training chain.
clf_w2v = LogisticRegression(max_iter=1000).fit(X_train_w2v, y_train)
y_pred_w2v = clf_w2v.predict(X_test_w2v)

print("Accuracy (Word2Vec):", accuracy_score(y_test, y_pred_w2v))
print(classification_report(y_test, y_pred_w2v))

# =====================
# Reflection (Markdown cells in Colab)
# =====================


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.2/38.2 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompa

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject