In [13]:
import pandas as pd
import numpy as np
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec, FastText
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

nltk.download('stopwords')
nltk.download('wordnet')

data = pd.read_csv('/content/OnionOrNot.csv')
data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,label
0,Entire Facebook Staff Laughs As Man Tightens P...,1
1,Muslim Woman Denied Soda Can for Fear She Coul...,0
2,Bold Move: Hulu Has Announced That They’re Gon...,1
3,Despondent Jeff Bezos Realizes He’ll Have To W...,1
4,"For men looking for great single women, online...",1
...,...,...
23995,Teen Pregnancy Rate Prompting More High School...,1
23996,Connecticut TV station under fire after using ...,0
23997,Jurisprudence Fetishist Gets Off On Technicality,1
23998,Employees From Other Department Announce Plan ...,1


In [14]:
data.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [15]:
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,15000
1,9000


In [26]:
vectorizers = {
    "Word2Vec": "Word2Vec",
    "FastText": "FastText"
}

In [27]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Linear SVC": LinearSVC(),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "SVC (RBF kernel)": SVC(kernel='rbf'),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
}

In [28]:
'''
Text Vectorization:
Texts are converted into numerical representations using two methods—Word2Vec and FastText.
For each document, tokenization is performed by splitting the text into individual words.
The respective embedding model is then trained on these tokens, and the average of the word vectors is computed
for each text, resulting in a fixed-length numerical representation.
'''
def generate_word2vec_embeddings(data, vector_size=100, window=5, min_count=1):
    tokenized_texts = [text.split() for text in data['text']]
    w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count)
    embeddings = []
    for tokens in tokenized_texts:
        word_vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            embeddings.append(np.zeros(vector_size))
    return np.array(embeddings)

def generate_fasttext_embeddings(data, vector_size=100, window=5, min_count=1):
    tokenized_texts = [text.split() for text in data['text']]
    ft_model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count)

    embeddings = []
    for tokens in tokenized_texts:
        word_vectors = [ft_model.wv[word] for word in tokens if word in ft_model.wv]
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            embeddings.append(np.zeros(vector_size))
    return np.array(embeddings)



In [30]:
'''
The generated embeddings, along with their corresponding labels, are used to split the dataset into training and test sets.
A stratified split is applied to ensure that the class proportions are maintained in both sets,
leading to a more reliable evaluation of model performance.
'''
for vec_name, vectorizer in vectorizers.items():
    print(f"\nResults for Vectorizer: {vec_name}")
    print("=" * 50)
    if vec_name == "Word2Vec":
        X_encoded = generate_word2vec_embeddings(data)
    elif vec_name == "FastText":
        X_encoded = generate_fasttext_embeddings(data)

    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y, test_size=0.2, random_state=42, stratify=y
    )

    for model_name, model in models.items():
        print(f"Model: {model_name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print("=" * 50)
'''
For each combination of vectorization method (Word2Vec and FastText) and classification model (Logistic Regression, Random Forest, XGBoost...),
the following steps are performed:

- The model is trained on the training data.
- The trained model predicts the labels for the test set.
- Key metrics—accuracy, precision, recall, and F1-score—are computed to evaluate the performance of each approach.
'''


Results for Vectorizer: Word2Vec
Model: Logistic Regression
Accuracy: 0.9092
Precision: 0.8994
Recall: 0.9151
F1-Score: 0.9052
Model: Random Forest
Accuracy: 0.9100
Precision: 0.9005
Recall: 0.9132
F1-Score: 0.9056
Model: Linear SVC
Accuracy: 0.9060
Precision: 0.8962
Recall: 0.9121
F1-Score: 0.9020
Model: KNN (k=5)
Accuracy: 0.9033
Precision: 0.8940
Recall: 0.9039
F1-Score: 0.8982
Model: SVC (RBF kernel)
Accuracy: 0.9060
Precision: 0.8962
Recall: 0.9112
F1-Score: 0.9018
Model: Gradient Boosting
Accuracy: 0.9090
Precision: 0.8992
Recall: 0.9146
F1-Score: 0.9049
Model: XGBoost
Accuracy: 0.9102
Precision: 0.9009
Recall: 0.9126
F1-Score: 0.9057

Results for Vectorizer: FastText
Model: Logistic Regression
Accuracy: 0.9060
Precision: 0.8962
Recall: 0.9121
F1-Score: 0.9020
Model: Random Forest
Accuracy: 0.9077
Precision: 0.8983
Recall: 0.9099
F1-Score: 0.9031
Model: Linear SVC
Accuracy: 0.9054
Precision: 0.8956
Recall: 0.9130
F1-Score: 0.9016
Model: KNN (k=5)
Accuracy: 0.9002
Precision: 0.89

'\nFor each combination of vectorization method (Word2Vec and FastText) and classification model (Logistic Regression, Random Forest, XGBoost...), \nthe following steps are performed:\n\n- The model is trained on the training data.\n- The trained model predicts the labels for the test set.\n- Key metrics—accuracy, precision, recall, and F1-score—are computed to evaluate the performance of each approach.\n'