# Text Classification using N-Grams

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the complete 20 Newsgroups corpus (subset='all') with headers,
# footers and quoted replies removed from every post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# X holds the post texts, y the corresponding class labels.
X, y = newsgroups.data, newsgroups.target

In [None]:
# Encode the class labels as consecutive integers.
# NOTE(review): `newsgroups.target` appears to be integer-coded already, so
# this encoding is presumably an identity mapping — confirm before relying
# on `le` to translate labels.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
# Hold out 30% of the documents for evaluation; the fixed seed makes the
# split reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4564,
)

### N-Gram Range (1, 2) — unigrams and bigrams

In [None]:
# Settings for this experiment: cap the vocabulary at 5000 features and use
# n-grams of length 1 through 2.
max_features = 5000
ngram_range = (1, 2)

In [None]:
# Learn the n-gram vocabulary from the training documents only, then encode
# the test documents with that same vocabulary.
vectorizer = CountVectorizer(
    ngram_range=ngram_range,
    max_features=max_features,
)
xtrain_ngrams = vectorizer.fit_transform(xtrain)
xtest_ngrams = vectorizer.transform(xtest)

In [None]:
# Train a multinomial Naive Bayes classifier on the n-gram count features
# of the training documents.
clf = MultinomialNB()
clf.fit(xtrain_ngrams, ytrain)

In [None]:
# Predict a class label for each held-out test document.
ypred = clf.predict(xtest_ngrams)

In [None]:
# Fraction of test documents whose predicted label equals the true label.
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print("Accuracy:", accuracy)

Accuracy: 0.5732224973470109


In [None]:
# Per-class precision, recall and F1 on the test set, with rows labelled by
# newsgroup name.
print(
    classification_report(
        ytest,
        ypred,
        target_names=newsgroups.target_names,
    )
)

                          precision    recall  f1-score   support

             alt.atheism       0.37      0.60      0.45       217
           comp.graphics       0.48      0.59      0.53       293
 comp.os.ms-windows.misc       0.90      0.03      0.06       290
comp.sys.ibm.pc.hardware       0.42      0.64      0.51       292
   comp.sys.mac.hardware       0.51      0.58      0.54       290
          comp.windows.x       0.66      0.62      0.64       304
            misc.forsale       0.74      0.75      0.74       317
               rec.autos       0.56      0.63      0.59       292
         rec.motorcycles       0.53      0.64      0.58       308
      rec.sport.baseball       0.61      0.73      0.66       304
        rec.sport.hockey       0.87      0.54      0.67       308
               sci.crypt       0.80      0.67      0.73       290
         sci.electronics       0.54      0.55      0.55       296
                 sci.med       0.76      0.56      0.65       295
         

### N-Gram Range (1, 10) — unigrams through 10-grams

In [None]:
# Settings for this experiment: cap the vocabulary at 5000 features and use
# n-grams of length 1 through 10.
max_features = 5000
ngram_range = (1, 10)

In [None]:
# Re-fit the vectorizer with the current n-gram settings: the vocabulary is
# learned from the training documents and reused for the test documents.
vectorizer = CountVectorizer(
    ngram_range=ngram_range,
    max_features=max_features,
)
xtrain_ngrams = vectorizer.fit_transform(xtrain)
xtest_ngrams = vectorizer.transform(xtest)

In [None]:
# Train a fresh multinomial Naive Bayes classifier on the re-computed
# n-gram count features.
clf = MultinomialNB()
clf.fit(xtrain_ngrams, ytrain)

In [None]:
# Predict a class label for each held-out test document.
ypred = clf.predict(xtest_ngrams)

In [None]:
# Fraction of test documents whose predicted label equals the true label.
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print("Accuracy:", accuracy)

Accuracy: 0.5649097983728334


In [None]:
# Per-class precision, recall and F1 on the test set, with rows labelled by
# newsgroup name.
print(
    classification_report(
        ytest,
        ypred,
        target_names=newsgroups.target_names,
    )
)

                          precision    recall  f1-score   support

             alt.atheism       0.35      0.60      0.44       217
           comp.graphics       0.46      0.60      0.52       293
 comp.os.ms-windows.misc       1.00      0.02      0.05       290
comp.sys.ibm.pc.hardware       0.42      0.64      0.51       292
   comp.sys.mac.hardware       0.50      0.56      0.53       290
          comp.windows.x       0.67      0.59      0.62       304
            misc.forsale       0.72      0.75      0.74       317
               rec.autos       0.55      0.63      0.59       292
         rec.motorcycles       0.54      0.64      0.58       308
      rec.sport.baseball       0.59      0.72      0.65       304
        rec.sport.hockey       0.85      0.50      0.63       308
               sci.crypt       0.79      0.67      0.73       290
         sci.electronics       0.55      0.55      0.55       296
                 sci.med       0.76      0.55      0.64       295
         

### N-Gram Range (5, 10) — 5-grams through 10-grams only

In [None]:
# Settings for this experiment: cap the vocabulary at 5000 features and use
# only n-grams of length 5 through 10 (no shorter n-grams).
# NOTE(review): the recorded accuracy for this range is about 0.098, far
# below the other two runs; presumably long n-grams rarely recur across
# documents, leaving most feature vectors nearly empty — confirm.
max_features = 5000
ngram_range = (5, 10)

In [None]:
# Re-fit the vectorizer with the current n-gram settings: the vocabulary is
# learned from the training documents and reused for the test documents.
vectorizer = CountVectorizer(
    ngram_range=ngram_range,
    max_features=max_features,
)
xtrain_ngrams = vectorizer.fit_transform(xtrain)
xtest_ngrams = vectorizer.transform(xtest)

In [None]:
# Train a fresh multinomial Naive Bayes classifier on the re-computed
# n-gram count features.
clf = MultinomialNB()
clf.fit(xtrain_ngrams, ytrain)

In [None]:
# Predict a class label for each held-out test document.
ypred = clf.predict(xtest_ngrams)

In [None]:
# Fraction of test documents whose predicted label equals the true label.
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print("Accuracy:", accuracy)

Accuracy: 0.09816059426954368


In [None]:
# Per-class precision, recall and F1 on the test set, with rows labelled by
# newsgroup name.
print(
    classification_report(
        ytest,
        ypred,
        target_names=newsgroups.target_names,
    )
)

                          precision    recall  f1-score   support

             alt.atheism       0.55      0.05      0.09       217
           comp.graphics       0.18      0.03      0.05       293
 comp.os.ms-windows.misc       1.00      0.02      0.04       290
comp.sys.ibm.pc.hardware       0.14      0.03      0.05       292
   comp.sys.mac.hardware       0.12      0.02      0.03       290
          comp.windows.x       0.36      0.05      0.08       304
            misc.forsale       0.47      0.09      0.15       317
               rec.autos       0.04      0.01      0.01       292
         rec.motorcycles       0.76      0.07      0.13       308
      rec.sport.baseball       0.57      0.06      0.10       304
        rec.sport.hockey       0.63      0.04      0.07       308
               sci.crypt       0.88      0.05      0.10       290
         sci.electronics       0.35      0.05      0.09       296
                 sci.med       0.83      0.10      0.18       295
         