To classify text as humorous (1) or non-humorous (0) using various word embedding techniques and compare how these embeddings affect model performance.

In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...


True

In [7]:
# Load and Explore Dataset
df = pd.read_csv("humor_dataset.csv")
print(df.head())
print(df['label'].value_counts())

                                                text  label
0  Why do Java developers wear glasses? Because t...      1
1     I told my computer I needed a break… it froze.      1
2  Debugging is like being the detective in a cri...      1
3  Why did the neural network go to therapy? Too ...      1
4       My data went on a date… now it’s an outlier.      1
label
0    31
1    30
Name: count, dtype: int64


In [8]:
# Data Cleaning & Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"\d+", "", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(clean_text)

In [11]:
# Word Embedding Techniques

# (a) One Hot Encoding
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(df['clean_text'])

# (b) Bag of Words (BoW)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['clean_text'])

# (c) Bag of N-grams (BoN)
vectorizer = CountVectorizer(ngram_range=(1,2))
X = vectorizer.fit_transform(df['clean_text'])

# (d) TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])

In [16]:
# (e) Word2Vec (CBOW & Skip-Gram)\

from gensim.models import Word2Vec
sentences = [row.split() for row in df['clean_text']]
w2v_cbow = Word2Vec(sentences, vector_size=100, sg=0, min_count=1)
w2v_skip = Word2Vec(sentences, vector_size=100, sg=1, min_count=1)

In [17]:
# Convert text to sentence vectors (mean of word vectors):
def get_sentence_vector(model, sentence):
    words = [w for w in sentence.split() if w in model.wv]
    if len(words)==0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words], axis=0)

X_cbow = np.array([get_sentence_vector(w2v_cbow, t) for t in df['clean_text']])
X_skip = np.array([get_sentence_vector(w2v_skip, t) for t in df['clean_text']])


In [18]:
# (f) fastText
from gensim.models import FastText
ft_model = FastText(sentences, vector_size=100, window=3, min_count=1)
X_fast = np.array([get_sentence_vector(ft_model, t) for t in df['clean_text']])

In [21]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Prepare tagged documents 
tagged_data = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(df['clean_text'])]

# -------- Distributed Memory (DM) --------
model_dm = Doc2Vec(
    vector_size=100,
    window=5,
    min_count=1,
    dm=1,
    epochs=40,
    workers=4
)
model_dm.build_vocab(tagged_data)
model_dm.train(tagged_data, total_examples=model_dm.corpus_count, epochs=model_dm.epochs)

# -------- Distributed Bag of Words (DBOW) --------
model_dbow = Doc2Vec(
    vector_size=100,
    window=5,
    min_count=1,
    dm=0,
    epochs=40,
    workers=4
)
model_dbow.build_vocab(tagged_data)
model_dbow.train(tagged_data, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

# -------- Convert each sentence to vector --------
X_dm = np.array([model_dm.dv[str(i)] for i in range(len(tagged_data))])
X_dbow = np.array([model_dbow.dv[str(i)] for i in range(len(tagged_data))])

print("Doc2Vec (DM) Shape:", X_dm.shape)
print("Doc2Vec (DBOW) Shape:", X_dbow.shape)


Doc2Vec (DM) Shape: (61, 100)
Doc2Vec (DBOW) Shape: (61, 100)


In [22]:
# Model Training & Evaluation
def evaluate_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)


In [23]:
y = df['label']

results = {
    "One Hot": evaluate_model(CountVectorizer(binary=True).fit_transform(df['clean_text']), y),
    "BoW": evaluate_model(CountVectorizer().fit_transform(df['clean_text']), y),
    "BoN": evaluate_model(CountVectorizer(ngram_range=(1,2)).fit_transform(df['clean_text']), y),
    "TF-IDF": evaluate_model(TfidfVectorizer().fit_transform(df['clean_text']), y),
    "Word2Vec CBOW": evaluate_model(X_cbow, y),
    "Word2Vec SkipGram": evaluate_model(X_skip, y),
    "fastText": evaluate_model(X_fast, y),
    "Doc2Vec DM": evaluate_model(X_dm, y),
    "Doc2Vec DBOW": evaluate_model(X_dbow, y)
}

print(pd.DataFrame(results.items(), columns=["Embedding", "Accuracy"]))


           Embedding  Accuracy
0            One Hot  0.769231
1                BoW  0.769231
2                BoN  0.769231
3             TF-IDF  0.692308
4      Word2Vec CBOW  0.384615
5  Word2Vec SkipGram  0.384615
6           fastText  0.384615
7         Doc2Vec DM  0.384615
8       Doc2Vec DBOW  0.384615
