# Text Classification
## This notebook outlines the use of NLP feature extraction (CountVectorizer, TfidfVectorizer, Word2Vec, Doc2Vec) for classifying text documents

### Import all the necessary libraries

In [1]:
!pip install gensim



In [2]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Word2Vec, Doc2Vec

### Choose a few categories from the full set of 20

In [3]:
# The two newsgroups used for the binary classification task below.
categories = ['alt.atheism', 'talk.religion.misc']

In [4]:
# Announce which categories are about to be loaded (header line, then the list itself).
print("Loading 20 newsgroups dataset for categories:", categories, sep="\n")

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


### Fetch documents for these 2 categories

In [5]:
# Load the training split restricted to the selected categories,
# then report how many documents and categories were loaded.
data = fetch_20newsgroups(categories=categories, subset='train')
print(f"{len(data.filenames)} documents\n{len(data.target_names)} categories\n")

857 documents
2 categories



### Define a pipeline combining a text feature extractor with a simple classifier

In [6]:
# Text-classification pipeline: raw text -> token counts -> TF-IDF weights -> linear SGD classifier.
# random_state is fixed because SGDClassifier shuffles the training data between epochs;
# without a seed the grid-search score and the selected parameters change from run to run.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(tol=1e-3, random_state=1)),
])

### Specify parameter grid
- 'vect__max_df': (0.5, 0.75, 1.0)
- 'vect__max_features': (None, 5000, 10000, 50000)
- 'vect__ngram_range': ((1, 1), (1, 2))
- 'tfidf__use_idf': (True, False)
- 'tfidf__norm': ('l1', 'l2')
- 'clf__max_iter': (20,)
- 'clf__alpha': (0.00001, 0.000001)
- 'clf__penalty': ('l2', 'elasticnet')
- 'clf__max_iter': (10, 50, 80)

In [7]:
# Hyper-parameter grid for the search; keys use the "<pipeline step>__<parameter>" form.
# Options that are currently disabled:
#   vect__max_features: (None, 5000, 10000, 50000)
#   tfidf__use_idf:     (True, False)
#   tfidf__norm:        ('l1', 'l2')
#   clf__max_iter:      (10, 50, 80)
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams only, or unigrams plus bigrams
    clf__max_iter=(20,),
    clf__alpha=(0.00001, 0.000001),
    clf__penalty=('l2', 'elasticnet'),
)

### Find the best parameters for both the feature extraction and the classifier

### Build a GridSearch with the pipeline and parameter grid

In [8]:
# Exhaustive search over `parameters` with 5-fold cross-validation,
# using all available cores (n_jobs=-1) and progress messages (verbose=1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

### Start the grid search

In [9]:
# Run the cross-validated search on the loaded documents and their labels.
grid_search.fit(data.data, data.target)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


### Best Score

In [10]:
# Score of the best parameter combination, shown with three decimals.
print(f"Best score: {grid_search.best_score_:0.3f}")

Best score: 0.954


### Best Parameter

In [11]:
# List, in alphabetical order, the winning value of every parameter that was searched.
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print(f"\t{param_name}: {best_parameters[param_name]!r}")

Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)


### Choose the best model

In [12]:
# Hold out 20% of the documents for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=1,
)

In [13]:
class Word2VecVectorizer:
    """Pipeline-style transformer that embeds documents by averaging word vectors.

    Each document is split on whitespace. The vectors of the tokens found in
    ``model.wv`` are averaged, and negative components of the mean are clipped
    to zero (presumably so the features can also feed estimators that reject
    negative inputs, such as MultinomialNB). Documents with no known token map
    to an all-zero vector.

    Parameters
    ----------
    model : object
        An already trained model exposing ``wv`` (supports ``word in wv`` and
        ``wv[word]``) and ``vector_size``.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Do nothing (the embedding model is already trained) and return ``self``."""
        return self

    def transform(self, X):
        """Return one row per document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_documents, model.vector_size)``. An empty input
            yields shape ``(0, model.vector_size)`` so the result is always 2-D.
        """
        wv = self.model.wv
        vectors = []
        for doc in X:
            word_vectors = [wv[word] for word in doc.split() if word in wv]
            if word_vectors:
                # Mean embedding with negative components clipped to zero.
                vector = np.maximum(0, np.mean(word_vectors, axis=0))
            else:
                # No in-vocabulary token: fall back to the zero vector.
                vector = np.zeros(self.model.vector_size)
            vectors.append(vector)
        if not vectors:
            # np.array([]) would be 1-D with shape (0,); keep the 2-D contract.
            return np.empty((0, self.model.vector_size))
        return np.array(vectors)

In [14]:
class Doc2VecVectorizer:
    """Pipeline-style transformer that embeds documents with a trained Doc2Vec model.

    Each document is split on whitespace and passed to ``model.infer_vector``;
    negative components of the inferred vector are clipped to zero (presumably
    so the features can also feed estimators that reject negative inputs, such
    as MultinomialNB).

    Parameters
    ----------
    model : object
        An already trained model exposing ``infer_vector(tokens)`` and
        ``vector_size``.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Do nothing (the Doc2Vec model is already trained) and return ``self``."""
        return self

    def transform(self, X):
        """Return one row per document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        numpy.ndarray
            Array with one clipped inferred vector per document. An empty input
            yields shape ``(0, model.vector_size)`` so the result is always 2-D.
        """
        vectors = [
            np.maximum(0, self.model.infer_vector(doc.split()))
            for doc in X
        ]
        if not vectors:
            # np.array([]) would be 1-D with shape (0,); keep the 2-D contract.
            return np.empty((0, self.model.vector_size))
        return np.array(vectors)

In [15]:
# Train word embeddings on the training split only (whitespace tokenisation,
# matching what Word2VecVectorizer does at transform time).
# seed + workers=1 make training repeatable: with several worker threads the
# update order depends on OS scheduling, so results differ between runs.
# (Across interpreter launches PYTHONHASHSEED must be fixed as well.)
word2vec_model = Word2Vec(
    sentences=[doc.split() for doc in X_train],
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

In [16]:
# Train document embeddings on the training split only; each document is
# tagged with its position in X_train.
# seed + workers=1 make training repeatable: with several worker threads the
# update order depends on OS scheduling, so results differ between runs.
# (Across interpreter launches PYTHONHASHSEED must be fixed as well.)
doc2vec_model = Doc2Vec(
    [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(X_train)],
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

In [17]:
# Every classifier is paired with every feature extractor (4 x 4 = 16 pipelines).
# Factories (not instances) are stored so each pipeline gets its own fresh,
# unfitted vectorizer and classifier objects.
vectorizer_factories = {
    'CountVectorizer': lambda: CountVectorizer(max_df=0.5, ngram_range=(1, 2)),
    'TfidfVectorizer': lambda: TfidfVectorizer(max_df=0.5, ngram_range=(1, 2)),
    'Word2Vec': lambda: Word2VecVectorizer(word2vec_model),
    'Doc2Vec': lambda: Doc2VecVectorizer(doc2vec_model),
}

classifier_factories = {
    'MultinomialNB': lambda: MultinomialNB(alpha=1e-05),
    'LogisticRegression': lambda: LogisticRegression(max_iter=100, penalty='l2'),
    'SVM': lambda: SVC(kernel='rbf'),
    # Seeded so that the tree (and therefore its report) is the same on every run.
    'DecisionTree': lambda: DecisionTreeClassifier(random_state=1),
}

# List of ("<classifier>_<vectorizer>", Pipeline) tuples, ordered classifier
# first, then vectorizer: MultinomialNB_CountVectorizer, MultinomialNB_TfidfVectorizer, ...
pipelines = [
    (
        f"{classifier_name}_{vectorizer_name}",
        Pipeline([('vect', make_vectorizer()), ('clf', make_classifier())]),
    )
    for classifier_name, make_classifier in classifier_factories.items()
    for vectorizer_name, make_vectorizer in vectorizer_factories.items()
]

In [18]:
# Fit every candidate on the training split and keep its classification
# report on the held-out split, keyed by the pipeline's name.
# The loop variable is deliberately NOT called `pipeline`: that name holds the
# grid-search pipeline defined earlier, and overwriting it would make a re-run
# of the grid-search cell silently use the last candidate instead.
vectorizer_results = {}
for vectorizer_name, candidate_pipeline in pipelines:
    print(f"Evaluating: {vectorizer_name}")
    candidate_pipeline.fit(X_train, y_train)
    y_pred = candidate_pipeline.predict(X_test)
    # zero_division=0 reports 0 (without a warning) for classes that were never predicted.
    vectorizer_results[vectorizer_name] = classification_report(y_test, y_pred, zero_division=0)

Evaluating: MultinomialNB_CountVectorizer
Evaluating: MultinomialNB_TfidfVectorizer
Evaluating: MultinomialNB_Word2Vec
Evaluating: MultinomialNB_Doc2Vec
Evaluating: LogisticRegression_CountVectorizer
Evaluating: LogisticRegression_TfidfVectorizer
Evaluating: LogisticRegression_Word2Vec
Evaluating: LogisticRegression_Doc2Vec
Evaluating: SVM_CountVectorizer
Evaluating: SVM_TfidfVectorizer
Evaluating: SVM_Word2Vec
Evaluating: SVM_Doc2Vec
Evaluating: DecisionTree_CountVectorizer
Evaluating: DecisionTree_TfidfVectorizer
Evaluating: DecisionTree_Word2Vec
Evaluating: DecisionTree_Doc2Vec


In [19]:
# Save every report to a text file: the pipeline name, its report, then a blank line.
with open("Riya_Task0_Text_Classification.txt", "w") as file:
    for vectorizer_name, result in vectorizer_results.items():
        print(f"{vectorizer_name}:\n{result}\n", file=file)

### Use the model to classify a piece of text

In [20]:
# Switch to a different pair of newsgroups and load their training split.
categories = ['comp.sys.mac.hardware', 'sci.electronics']
data_train = fetch_20newsgroups(categories=categories, subset='train')

In [21]:
# Take the first document of the newly fetched dataset as the text to classify.
sample_data = data_train.data[0]

In [22]:
# Final model: unigram + bigram counts fed into an L2-regularised logistic regression.
best_model = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.5)),
    ('clf', LogisticRegression(penalty='l2', max_iter=100)),
])

In [23]:
# Fit on `data_train`, the dataset whose `target_names` are used below to decode
# the prediction. Fitting on `data` (alt.atheism / talk.religion.misc) would make
# the predicted class index refer to a different label set than the one it is
# translated with.
best_model.fit(data_train.data, data_train.target)

In [24]:
# Classify the single sample; predict() returns one label per input document.
(predicted_category,) = best_model.predict([sample_data])
# NOTE(review): the index is decoded with data_train.target_names, which is only
# meaningful if best_model was fitted on data_train's labels. Confirm.
predicted_category_name = data_train.target_names[predicted_category]

print(f"Predicted category: {predicted_category_name}")

Predicted category: sci.electronics
