This task tests your ability to solve a real-world problem using the concepts learned in Text Classification module.
Create a benchmark analysis with different algorithms and feature extractors.

Dataset: Fetch 20 Newsgroups (same as in class work)​

Algorithms: Multinomial Naïve Bayes, Logistic Regression, Support Vector Machines, Decision Trees​

Feature Extractors: CountVectorizer, Word2Vec, Doc2Vec and so on​

​

Benchmark all the possible above configurations and choose the best algorithm and feature extractor amongst all configurations​ and put it in a .txt or .doc file in a tabular format.

In [19]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim import models
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from sklearn import preprocessing

In [20]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [21]:
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=1)


In [22]:
results = []

In [26]:
algorithms = [
   # ("Multinomial Naïve Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Support Vector Machines", SVC()),
    ("Decision Trees", DecisionTreeClassifier())
]

In [27]:
feature_extractors = [
   # ("CountVectorizer", CountVectorizer()),
    ("Word2Vec",Word2Vec()),
    ("Doc2Vec",Doc2Vec())
#
 ]

In [28]:
for algorithm_name, algorithm in algorithms:
    for feature_extractor_name, feature_extractor in feature_extractors:
        # Prepare the data using the feature extractor
        if feature_extractor_name == "CountVectorizer":
            X_train_transformed = feature_extractor.fit_transform(X_train)
            X_test_transformed = feature_extractor.transform(X_test)
        elif feature_extractor_name == "Word2Vec":
            tokenized_X_train = [simple_preprocess(text) for text in X_train]
            tokenized_X_test = [simple_preprocess(text) for text in X_test]
            model = Word2Vec(sentences=tokenized_X_train, vector_size=100, window=5, min_count=1, sg=0)
            X_train_transformed = np.array([np.mean([model.wv[word] for word in words if word in model.wv] or [np.zeros(100)], axis=0) for words in tokenized_X_train])
            X_test_transformed = np.array([np.mean([model.wv[word] for word in words if word in model.wv] or [np.zeros(100)], axis=0) for words in tokenized_X_test])
        elif feature_extractor_name == "Doc2Vec":
            tokenized_X_train = [simple_preprocess(text) for text in X_train]
            tokenized_X_test = [simple_preprocess(text) for text in X_test]
            tagged_data = [TaggedDocument(words=words, tags=[str(i)]) for i, words in enumerate(tokenized_X_train)]
            model = Doc2Vec(vector_size=100, window=2, min_count=1, workers=4, epochs=30)
            model.build_vocab(tagged_data)
            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
            X_train_transformed = np.array([model.dv[i] for i in range(len(tokenized_X_train))])
            X_test_transformed = np.array([model.infer_vector(words) for words in tokenized_X_test])
            X_test_transformed = preprocessing.normalize(X_test_transformed)

        # Fit the model and make predictions
        algorithm.fit(X_train_transformed, y_train)
        y_pred = algorithm.predict(X_test_transformed)

        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        results.append((algorithm_name, feature_extractor_name, accuracy, precision, recall, f1))


  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
for algorithm_name, feature_extractor_name, accuracy, precision, recall, f1 in results:
    print(f"Algorithm: {algorithm_name}, Feature Extractor: {feature_extractor_name}")
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")
    print("----------------------------------------------------")

Algorithm: Multinomial Naïve Bayes, Feature Extractor: CountVectorizer
Accuracy: 0.62, Precision: 0.68, Recall: 0.60, F1-Score: 0.59
----------------------------------------------------
Algorithm: Logistic Regression, Feature Extractor: CountVectorizer
Accuracy: 0.68, Precision: 0.68, Recall: 0.67, F1-Score: 0.68
----------------------------------------------------
Algorithm: Support Vector Machines, Feature Extractor: CountVectorizer
Accuracy: 0.12, Precision: 0.50, Recall: 0.12, F1-Score: 0.10
----------------------------------------------------
Algorithm: Decision Trees, Feature Extractor: CountVectorizer
Accuracy: 0.48, Precision: 0.47, Recall: 0.46, F1-Score: 0.47
----------------------------------------------------
Algorithm: Logistic Regression, Feature Extractor: Word2Vec
Accuracy: 0.46, Precision: 0.45, Recall: 0.45, F1-Score: 0.44
----------------------------------------------------
Algorithm: Logistic Regression, Feature Extractor: Doc2Vec
Accuracy: 0.51, Precision: 0.62, Re