# Importing required Libraries

In [13]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Preparing training data from dataset

In [14]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

# Preparing training and testing data using TF-IDF transformation

In [15]:
tfidf_Vect = TfidfVectorizer()
X_train_tfidf = tfidf_Vect.fit_transform(twenty_train.data)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
X_test_tfidf = tfidf_Vect.transform(twenty_test.data)

# Naive Bayes model training and fitting data

In [16]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Predicting using test data and calculating accuracy_score using NaiveBayes

In [17]:
predicted =clf.predict(X_test_tfidf)
score = metrics.accuracy_score(twenty_test.target, predicted)
print("accuracy score with multinomialNB",score)

accuracy score with multinomialNB 0.7738980350504514


# SVM model training

In [18]:
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train_tfidf, twenty_train.target)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

# Predicting and Calculating accuracy_score using SVM

In [19]:
predicted1 = classifier.predict(X_test_tfidf)
score1 = metrics.accuracy_score(twenty_test.target, predicted1)
print("accuracy score after applyingSVM",score1)

accuracy score after applyingSVM 0.8347052575677111


# change the tfidfvectorizer to use bigram 

In [21]:
bigram_tfidf_Vect = TfidfVectorizer(ngram_range=(2,2))

# Preparing training and testing using Bigram TFIDF

In [22]:
bigram_X_train_tfidf = bigram_tfidf_Vect.fit_transform(twenty_train.data)
bigram_X_test_tfidf = bigram_tfidf_Vect.transform(twenty_test.data)

# Applying NaiveBayes with Bigram TFIDF

In [23]:
bigram_clf = MultinomialNB()
bigram_clf.fit(bigram_X_train_tfidf, twenty_train.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Calculating accuracy using Bigram

In [24]:
predicted =bigram_clf.predict(bigram_X_test_tfidf)
score = metrics.accuracy_score(twenty_test.target, predicted)
print("accuracy score by applying bigram",score)

accuracy score by applying bigram 0.7327403080191184


# TFIDF with stopwords

In [25]:
from nltk.corpus import stopwords
stop_tfidf_Vect = TfidfVectorizer(stopwords.words('english'))

# Training and testing data

In [26]:
stop_X_train_tfidf = stop_tfidf_Vect.fit_transform(twenty_train.data)
stop_X_test_tfidf = stop_tfidf_Vect.transform(twenty_test.data)

# Applying NaiveBayes upon using TFIDF with stopwords

In [27]:
stop_clf = MultinomialNB()
stop_clf.fit(stop_X_train_tfidf, twenty_train.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Accuracy score with TFIDF stopwords

In [28]:
predicted =stop_clf.predict(stop_X_test_tfidf)
score = metrics.accuracy_score(twenty_test.target, predicted)
print("accuracy score by applying stopword",score)

accuracy score by applying stopword 0.7738980350504514
