## Text Classification

In [16]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec, Doc2Vec
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from gensim.models.doc2vec import TaggedDocument
from pprint import pprint
from time import time
import logging
import pandas as pd
import numpy as np

## Fetch 20newsgroups

In [17]:
newsgroups = fetch_20newsgroups(subset='train')
X= newsgroups.data
y =newsgroups.target

## Load Categories

In [18]:
categories = [
    'comp.graphics',
    'sci.space',
    'misc.forsale',   
]
print("Loading 20 newsgroups dataset for categories:")
print(categories)


Loading 20 newsgroups dataset for categories:
['comp.graphics', 'sci.space', 'misc.forsale']


## Split into train and test

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Countvectorizer

In [22]:
countvectorizer = CountVectorizer()
X_train_counts = countvectorizer.fit_transform(X_train)
X_test_counts = countvectorizer.transform(X_test)
print(X_train_counts.shape,X_test_counts.shape)

(9051, 117598) (2263, 117598)


## Word2vec

In [23]:
word2vec = Word2Vec(sentences=[doc.split() for doc in X_train], vector_size=100, window=5, min_count=1, sg=0)
def document_to_w2v(document, model, vector_size):
    words = document.split()
    vector = np.zeros(vector_size)
    n_words = 0
    for word in words:
        if word in model.wv:
            word_vector = model.wv[word]
            word_vector = np.maximum(word_vector, 0)
            vector = np.add(vector, word_vector)
            n_words += 1
    if n_words > 0:
        return np.divide(vector, n_words)
    else:
        return vector
    
X_train_w2v = [document_to_w2v(doc, word2vec, 100) for doc in X_train]
X_test_w2v = [document_to_w2v(doc, word2vec, 100) for doc in X_test]

## Doc2vec

In [8]:
doc2vec_corpus = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(X_train)]
doc2vec = Doc2Vec(doc2vec_corpus, vector_size=100, window=5, min_count=1, epochs=10, dm=0)

def document_to_d2v(document, model, vector_size):
    words = document.split()
    vector = model.infer_vector(words)
    vector = np.maximum(vector, 0)
    return vector

X_train_d2v = [document_to_d2v(doc, doc2vec, 100) for doc in X_train]
X_test_d2v = [document_to_d2v(doc, doc2vec, 100) for doc in X_test]

## Define algorithms with hyperparameter grids

In [9]:
algorithms = {
    "Multinomial Naive Bayes": {
        "model": MultinomialNB(),
        "param_grid": {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]}
    },
    "Logistic Regression": {
        "model": LogisticRegression(),
        "param_grid": {'C': [0.001, 0.01, 0.1, 1.0, 10.0]}
    },
    "Support Vector Machine": {
        "model": SVC(),
        "param_grid": {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(),
        "param_grid": {'max_depth': [None, 5, 10, 20], 'min_samples_split': [2, 5, 10]}
    
    }
}

## Benchmark analysis 

In [10]:
output = []

In [11]:
for algorithm_name, algo_config in algorithms.items():
    for feature_extractor_name, X_train_features, X_test_features in [
        ("CountVectorizer", X_train_counts, X_test_counts),
        ("Word2Vec", X_train_w2v, X_test_w2v),
        ("Doc2Vec", X_train_d2v, X_test_d2v)
    ]:
        grid_search = GridSearchCV(algo_config["model"], algo_config["param_grid"], cv=3)
        grid_search.fit(X_train_features, y_train)
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        y_pred = best_model.predict(X_test_features)
        accuracy = accuracy_score(y_test, y_pred)
        output.append([algorithm_name, feature_extractor_name, best_params, accuracy])

data = ["Algorithm", "Feature Extractor", "Best Parameters", "Accuracy"]
print(data)
print(output)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

['Algorithm', 'Feature Extractor', 'Best Parameters', 'Accuracy']
[['Multinomial Naive Bayes', 'CountVectorizer', {'alpha': 0.01}, 0.8842244807777286], ['Multinomial Naive Bayes', 'Word2Vec', {'alpha': 0.01}, 0.1780821917808219], ['Multinomial Naive Bayes', 'Doc2Vec', {'alpha': 0.01}, 0.5877154220061864], ['Logistic Regression', 'CountVectorizer', {'C': 0.1}, 0.8727353071144498], ['Logistic Regression', 'Word2Vec', {'C': 10.0}, 0.3588157313300928], ['Logistic Regression', 'Doc2Vec', {'C': 1.0}, 0.6747680070702607], ['Support Vector Machine', 'CountVectorizer', {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}, 0.827220503756076], ['Support Vector Machine', 'Word2Vec', {'C': 10.0, 'gamma': 'scale', 'kernel': 'linear'}, 0.4334953601414052], ['Support Vector Machine', 'Doc2Vec', {'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}, 0.691117984975696], ['Decision Tree', 'CountVectorizer', {'max_depth': None, 'min_samples_split': 5}, 0.6517896597437031], ['Decision Tree', 'Word2Vec', {'max_depth':

In [24]:
with open("SimranNModi_Task0_Text_Classification.txt", "w") as file:
    file.write("Algorithm\tFeature Extractor\tBest Parameters\t\t\t\tAccuracy\n")
    for row in output:
        file.write("\t".join(map(str, row)) + "\n")

print("Results written to SimranNModi_Task0_Text_Classification.txt")


Results written to SimranNModi_Task0_Text_Classification.txt
