In [1]:
#import libraries
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec, Doc2Vec
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.datasets import fetch_20newsgroups
from gensim.models.doc2vec import TaggedDocument
import numpy as np

In [2]:
# Fetch the dataset with the specified categories
categories = ['alt.atheism', 'talk.religion.misc']
data = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

In [3]:
# Access only the first 100 documents and print their size.
subset_data_train = data.data[:100]
subset_data_train_labels = data.target[:100]

subset_data_test = test.data[:100]
subset_data_test_labels = test.target[:100]

print("Training data shape:", len(subset_data_train))
print("Training labels shape:", len(subset_data_train_labels))

Training data shape: 100
Training labels shape: 100


In [4]:
#Multinomial Naive Bayes

In [5]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=[doc.split() for doc in subset_data_train], vector_size=100, window=5, min_count=5, workers=4)

In [6]:
# Define the pipeline for Word2Vec with absolute transformation
def word2vec_transformer(X, word2vec_model):
    return np.array([np.abs(np.mean([word2vec_model.wv[word] for word in doc.split() if word in word2vec_model.wv] or [np.zeros(word2vec_model.vector_size)], axis=0)) for doc in X])

In [7]:
# Define the pipeline for Word2Vec
word2vec_pipeline = Pipeline([
    ('word2vec', FunctionTransformer(word2vec_transformer, kw_args={'word2vec_model': word2vec_model})),
    ('clf', MultinomialNB()),
])

In [8]:
# Naive Bayes parameters for Word2Vec
word2vec_parameters = {
    'clf__alpha': (0.1, 0.01, 0.001),
}


In [9]:
# Perform grid search for Word2Vec
word2vec_grid_search = GridSearchCV(word2vec_pipeline, word2vec_parameters, cv=2, n_jobs=-1)
word2vec_grid_search.fit(subset_data_train, subset_data_train_labels)

In [10]:
# Print best parameters and accuracy for Word2Vec
print("\nBest Parameters for Word2Vec:", word2vec_grid_search.best_params_)
print("Best Accuracy for Word2Vec:", word2vec_grid_search.best_score_)


Best Parameters for Word2Vec: {'clf__alpha': 0.1}
Best Accuracy for Word2Vec: 0.56


In [11]:
# Train Doc2Vec model
doc2vec_model = Doc2Vec([TaggedDocument(doc.split(), [i]) for i, doc in enumerate(subset_data_train)], vector_size=100, window=5, min_count=5, workers=4)

In [12]:
# Define the pipeline for Doc2Vec with absolute transformation
def doc2vec_transformer(X, doc2vec_model):
    return np.array([np.abs(doc2vec_model.infer_vector(doc.split())) for doc in X])

In [13]:
# Define the pipeline for Doc2Vec
doc2vec_pipeline = Pipeline([
    ('doc2vec', FunctionTransformer(doc2vec_transformer, kw_args={'doc2vec_model': doc2vec_model})),
    ('clf', MultinomialNB()),
])

In [14]:
# Naive Bayes parameters for Doc2Vec
doc2vec_parameters = {
    'clf__alpha': (0.1, 0.01, 0.001),
}

In [15]:
# Perform grid search for Doc2Vec
doc2vec_grid_search = GridSearchCV(doc2vec_pipeline, doc2vec_parameters, cv=2, n_jobs=-1)
doc2vec_grid_search.fit(subset_data_train, subset_data_train_labels)

In [16]:
# Print best parameters and accuracy for Doc2Vec
print("\nBest Parameters for Doc2Vec:", doc2vec_grid_search.best_params_)
print("Best Accuracy for Doc2Vec:", doc2vec_grid_search.best_score_)


Best Parameters for Doc2Vec: {'clf__alpha': 0.1}
Best Accuracy for Doc2Vec: 0.62


In [17]:
# Define the pipeline for CountVectorizer
count_vectorizer_pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [18]:
# Naive Bayes parameters for CountVectorizer
count_vectorizer_parameters = {
    'count_vectorizer__max_df': (0.5, 0.75, 1.0),
    'count_vectorizer__ngram_range': ((1, 1), (1, 2)),
    'count_vectorizer__max_features': (None, 1000, 5000),
    'clf__alpha': (0.1, 0.01, 0.001),
}


In [19]:
# Perform grid search for CountVectorizer
count_vectorizer_grid_search = GridSearchCV(count_vectorizer_pipeline, count_vectorizer_parameters, cv=2, n_jobs=-1)
count_vectorizer_grid_search.fit(subset_data_train, subset_data_train_labels)

In [20]:
# Print best parameters and accuracy for CountVectorizer
print("\nBest Parameters for CountVectorizer:", count_vectorizer_grid_search.best_params_)
print("Best Accuracy for CountVectorizer:", count_vectorizer_grid_search.best_score_)


Best Parameters for CountVectorizer: {'clf__alpha': 0.001, 'count_vectorizer__max_df': 0.5, 'count_vectorizer__max_features': 5000, 'count_vectorizer__ngram_range': (1, 2)}
Best Accuracy for CountVectorizer: 0.8500000000000001


In [21]:
#Logistic Regression

In [22]:
# Define the pipeline for CountVectorizer
count_vectorizer_pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
])

In [23]:
# Define the pipeline for Word2Vec
word2vec_pipeline = Pipeline([
    ('word2vec', FunctionTransformer(word2vec_transformer, kw_args={'word2vec_model': word2vec_model})),
    ('clf', LogisticRegression(max_iter=1000)),
])

In [24]:
# Define the pipeline for Doc2Vec
doc2vec_pipeline = Pipeline([
    ('doc2vec', FunctionTransformer(doc2vec_transformer, kw_args={'doc2vec_model': doc2vec_model})),
    ('clf', LogisticRegression(max_iter=1000)),
])

In [25]:
# LogisticRegression parameters for CountVectorizer
count_vectorizer_parameters = {
    'count_vectorizer__max_df': (0.5, 0.75, 1.0),
    'count_vectorizer__ngram_range': ((1, 1), (1, 2)),
    'count_vectorizer__max_features': (None, 1000, 5000),
    'clf__C': (0.1, 1, 10),
}

In [26]:
# LogisticRegression parameters for Word2Vec
word2vec_parameters = {
    'clf__C': (0.1, 1, 10),
}


In [27]:
# LogisticRegression parameters for Doc2Vec
doc2vec_parameters = {
    'clf__C': (0.1, 1, 10),
}

In [28]:
# Perform grid search for CountVectorizer
count_vectorizer_grid_search = GridSearchCV(count_vectorizer_pipeline, count_vectorizer_parameters, cv=2, n_jobs=-1)
count_vectorizer_grid_search.fit(subset_data_train, subset_data_train_labels)


In [29]:
# Perform grid search for Word2Vec
word2vec_grid_search = GridSearchCV(word2vec_pipeline, word2vec_parameters, cv=2, n_jobs=-1)
word2vec_grid_search.fit(subset_data_train, subset_data_train_labels)

In [30]:
# Perform grid search for Doc2Vec
doc2vec_grid_search = GridSearchCV(doc2vec_pipeline, doc2vec_parameters, cv=2, n_jobs=-1)
doc2vec_grid_search.fit(subset_data_train, subset_data_train_labels)

In [31]:
# Print best parameters and accuracy for CountVectorizer
print("\nBest Parameters for CountVectorizer:", count_vectorizer_grid_search.best_params_)
print("Best Accuracy for CountVectorizer:", count_vectorizer_grid_search.best_score_)


Best Parameters for CountVectorizer: {'clf__C': 0.1, 'count_vectorizer__max_df': 0.5, 'count_vectorizer__max_features': 1000, 'count_vectorizer__ngram_range': (1, 1)}
Best Accuracy for CountVectorizer: 0.71


In [32]:
# Print best parameters and accuracy for Word2Vec
print("\nBest Parameters for Word2Vec:", word2vec_grid_search.best_params_)
print("Best Accuracy for Word2Vec:", word2vec_grid_search.best_score_)


Best Parameters for Word2Vec: {'clf__C': 1}
Best Accuracy for Word2Vec: 0.5700000000000001


In [33]:
# Print best parameters and accuracy for Doc2Vec
print("\nBest Parameters for Doc2Vec:", doc2vec_grid_search.best_params_)
print("Best Accuracy for Doc2Vec:", doc2vec_grid_search.best_score_)


Best Parameters for Doc2Vec: {'clf__C': 1}
Best Accuracy for Doc2Vec: 0.6200000000000001


In [34]:
#support vector machine

In [35]:
# Define the pipeline for CountVectorizer
count_vectorizer_pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('clf', SVC()),
])

In [36]:
# Define the pipeline for Word2Vec
word2vec_pipeline = Pipeline([
    ('word2vec', FunctionTransformer(word2vec_transformer, kw_args={'word2vec_model': word2vec_model})),
    ('clf', SVC()),
])

In [37]:
# Define the pipeline for Doc2Vec
doc2vec_pipeline = Pipeline([
    ('doc2vec', FunctionTransformer(doc2vec_transformer, kw_args={'doc2vec_model': doc2vec_model})),
    ('clf', SVC()),
])

In [38]:
# SVM parameters for CountVectorizer
count_vectorizer_parameters = {
    'count_vectorizer__max_df': (0.5, 0.75, 1.0),
    'count_vectorizer__ngram_range': ((1, 1), (1, 2)),
    'count_vectorizer__max_features': (None, 1000, 5000),
    'clf__C': (0.1, 1, 10),
}

In [39]:
# SVM parameters for Word2Vec
word2vec_parameters = {
    'clf__C': (0.1, 1, 10),
}

In [40]:
# SVM parameters for Doc2Vec
doc2vec_parameters = {
    'clf__C': (0.1, 1, 10),
}

In [41]:
# Perform grid search for CountVectorizer
count_vectorizer_grid_search = GridSearchCV(count_vectorizer_pipeline, count_vectorizer_parameters, cv=2, n_jobs=-1)
count_vectorizer_grid_search.fit(subset_data_train, subset_data_train_labels)


In [42]:
# Perform grid search for Word2Vec
word2vec_grid_search = GridSearchCV(word2vec_pipeline, word2vec_parameters, cv=2, n_jobs=-1)
word2vec_grid_search.fit(subset_data_train, subset_data_train_labels)


In [43]:
# Perform grid search for Doc2Vec
doc2vec_grid_search = GridSearchCV(doc2vec_pipeline, doc2vec_parameters, cv=2, n_jobs=-1)
doc2vec_grid_search.fit(subset_data_train, subset_data_train_labels)

In [44]:
# Print best parameters and accuracy for CountVectorizer
print("\nBest Parameters for CountVectorizer:", count_vectorizer_grid_search.best_params_)
print("Best Accuracy for CountVectorizer:", count_vectorizer_grid_search.best_score_)


Best Parameters for CountVectorizer: {'clf__C': 10, 'count_vectorizer__max_df': 1.0, 'count_vectorizer__max_features': None, 'count_vectorizer__ngram_range': (1, 2)}
Best Accuracy for CountVectorizer: 0.69


In [45]:
# Print best parameters and accuracy for Word2Vec
print("\nBest Parameters for Word2Vec:", word2vec_grid_search.best_params_)
print("Best Accuracy for Word2Vec:", word2vec_grid_search.best_score_)



Best Parameters for Word2Vec: {'clf__C': 0.1}
Best Accuracy for Word2Vec: 0.56


In [46]:
# Print best parameters and accuracy for Doc2Vec
print("\nBest Parameters for Doc2Vec:", doc2vec_grid_search.best_params_)
print("Best Accuracy for Doc2Vec:", doc2vec_grid_search.best_score_)


Best Parameters for Doc2Vec: {'clf__C': 10}
Best Accuracy for Doc2Vec: 0.5800000000000001


In [47]:
#Decision tree

In [48]:
# Define the pipeline for CountVectorizer
count_vectorizer_pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('clf', DecisionTreeClassifier()),
])

In [49]:
# Define the pipeline for Word2Vec
word2vec_pipeline = Pipeline([
    ('word2vec', FunctionTransformer(word2vec_transformer, kw_args={'word2vec_model': word2vec_model})),
    ('clf', DecisionTreeClassifier()),
])

In [50]:
# Define the pipeline for Doc2Vec
doc2vec_pipeline = Pipeline([
    ('doc2vec', FunctionTransformer(doc2vec_transformer, kw_args={'doc2vec_model': doc2vec_model})),
    ('clf', DecisionTreeClassifier()),
])

In [51]:
# Decision Tree parameters for CountVectorizer
count_vectorizer_parameters = {
    'count_vectorizer__max_df': (0.5, 0.75, 1.0),
    'count_vectorizer__ngram_range': ((1, 1), (1, 2)),
    'count_vectorizer__max_features': (None, 1000, 5000),
    'clf__max_depth': (None, 10, 20),
}

In [52]:
# Decision Tree parameters for Word2Vec
word2vec_parameters = {
    'clf__max_depth': (None, 10, 20),
}

In [53]:
# Decision Tree parameters for Doc2Vec
doc2vec_parameters = {
    'clf__max_depth': (None, 10, 20),
}

In [54]:
# Perform grid search for CountVectorizer
count_vectorizer_grid_search = GridSearchCV(count_vectorizer_pipeline, count_vectorizer_parameters, cv=2, n_jobs=-1)
count_vectorizer_grid_search.fit(subset_data_train, subset_data_train_labels)

In [55]:
# Perform grid search for Word2Vec
word2vec_grid_search = GridSearchCV(word2vec_pipeline, word2vec_parameters, cv=2, n_jobs=-1)
word2vec_grid_search.fit(subset_data_train, subset_data_train_labels)

In [56]:
# Perform grid search for Doc2Vec
doc2vec_grid_search = GridSearchCV(doc2vec_pipeline, doc2vec_parameters, cv=2, n_jobs=-1)
doc2vec_grid_search.fit(subset_data_train, subset_data_train_labels)

In [57]:
# Print best parameters and accuracy for CountVectorizer
print("\nBest Parameters for CountVectorizer:", count_vectorizer_grid_search.best_params_)
print("Best Accuracy for CountVectorizer:", count_vectorizer_grid_search.best_score_)


Best Parameters for CountVectorizer: {'clf__max_depth': None, 'count_vectorizer__max_df': 1.0, 'count_vectorizer__max_features': None, 'count_vectorizer__ngram_range': (1, 2)}
Best Accuracy for CountVectorizer: 0.6699999999999999


In [58]:
# Print best parameters and accuracy for Word2Vec
print("\nBest Parameters for Word2Vec:", word2vec_grid_search.best_params_)
print("Best Accuracy for Word2Vec:", word2vec_grid_search.best_score_)


Best Parameters for Word2Vec: {'clf__max_depth': 20}
Best Accuracy for Word2Vec: 0.52


In [59]:
# Print best parameters and accuracy for Doc2Vec
print("\nBest Parameters for Doc2Vec:", doc2vec_grid_search.best_params_)
print("Best Accuracy for Doc2Vec:", doc2vec_grid_search.best_score_)



Best Parameters for Doc2Vec: {'clf__max_depth': 20}
Best Accuracy for Doc2Vec: 0.59


In [60]:
# Results together and find out the best model
results = {}

# Multinomial Naive Bayes
results[('Multinomial Naive Bayes', 'CountVectorizer')] = count_vectorizer_grid_search.best_score_
results[('Multinomial Naive Bayes', 'Word2Vec')] = word2vec_grid_search.best_score_
results[('Multinomial Naive Bayes', 'Doc2Vec')] = doc2vec_grid_search.best_score_

# Logistic Regression
results[('Logistic Regression', 'CountVectorizer')] = count_vectorizer_grid_search.best_score_
results[('Logistic Regression', 'Word2Vec')] = word2vec_grid_search.best_score_
results[('Logistic Regression', 'Doc2Vec')] = doc2vec_grid_search.best_score_

# Support Vector Machines
results[('Support Vector Machines', 'CountVectorizer')] = count_vectorizer_grid_search.best_score_
results[('Support Vector Machines', 'Word2Vec')] = word2vec_grid_search.best_score_
results[('Support Vector Machines', 'Doc2Vec')] = doc2vec_grid_search.best_score_

# Decision Trees
results[('Decision Trees', 'CountVectorizer')] = count_vectorizer_grid_search.best_score_
results[('Decision Trees', 'Word2Vec')] = word2vec_grid_search.best_score_
results[('Decision Trees', 'Doc2Vec')] = doc2vec_grid_search.best_score_

# Print results
for key, value in results.items():
    print(f"{key}: {value}")
best_accuracy = 0
best_algorithm = ""
best_feature_extractor = ""

for key, value in results.items():
    if value > best_accuracy:
        best_accuracy = value
        best_algorithm, best_feature_extractor = key

print(f"Best Algorithm: {best_algorithm}")
print(f"Best Feature Extractor: {best_feature_extractor}")
print(f"Best Accuracy: {best_accuracy}")


('Multinomial Naive Bayes', 'CountVectorizer'): 0.6699999999999999
('Multinomial Naive Bayes', 'Word2Vec'): 0.52
('Multinomial Naive Bayes', 'Doc2Vec'): 0.59
('Logistic Regression', 'CountVectorizer'): 0.6699999999999999
('Logistic Regression', 'Word2Vec'): 0.52
('Logistic Regression', 'Doc2Vec'): 0.59
('Support Vector Machines', 'CountVectorizer'): 0.6699999999999999
('Support Vector Machines', 'Word2Vec'): 0.52
('Support Vector Machines', 'Doc2Vec'): 0.59
('Decision Trees', 'CountVectorizer'): 0.6699999999999999
('Decision Trees', 'Word2Vec'): 0.52
('Decision Trees', 'Doc2Vec'): 0.59
Best Algorithm: Multinomial Naive Bayes
Best Feature Extractor: CountVectorizer
Best Accuracy: 0.6699999999999999


In [61]:
# Save the data in a text file
result_file_path = r'C:\Users\16395\OneDrive\Desktop\exam\file\Pujan_Task0_Text_Classification.txt'

# Save results to the specified file path
with open(result_file_path, 'w+') as file:
    for key, value in results.items():
        file.write(f"{key}: {value}\n")

    # Determine the best algorithm, best feature extractor, and their corresponding best accuracy
    best_accuracy = 0
    best_algorithm = ""
    best_feature_extractor = ""

    for key, value in results.items():
        if value > best_accuracy:
            best_accuracy = value
            best_algorithm, best_feature_extractor = key

    # Write the information about the best algorithm, best feature extractor, and their corresponding best accuracy to the file
    file.write(f"\nBest Algorithm: {best_algorithm}\n")
    file.write(f"Best Feature Extractor: {best_feature_extractor}\n")
    file.write(f"Best Accuracy: {best_accuracy}\n")

print("Results saved successfully.")


Results saved successfully.
