<h3> This file contains code for various multiclass classification models with Non negative Matrix Factorization and Latent Semantic Indexing<br>
Code By:<br>
Konark J S Kumar - 204759469<br>
Shreyas Lakhe - 105026650
</h3>

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.utils import shuffle
from sklearn import svm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score, roc_curve, precision_score, recall_score
import scikitplot as skplt
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

In [4]:
classes = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale', 'soc.religion.christian']

In [5]:
train_x = fetch_20newsgroups(subset='train', categories=classes, shuffle=True, random_state=42)
test_x = fetch_20newsgroups(subset='test', categories=classes, shuffle=True, random_state=42)
train_y = train_x.target
test_y = test_x.target

In [6]:
def preprocess_data(data):
        letters_only = re.sub("[^a-zA-Z]", " ", data)
        words = letters_only.split()
        ps = PorterStemmer()
        words = [ps.stem(w).lower() for w in words if not w.lower() in stopwords.words('english')]
        return(" ".join( words ))

In [7]:
preproc_train_data = []

for data in train_x.data:
	preproc_train_data.append(preprocess_data(data))

In [8]:
preproc_test_data = []

for data in test_x.data:
	preproc_test_data.append(preprocess_data(data))

In [9]:
vectorizer = CountVectorizer(min_df = 2)
tfidf_transformer = TfidfTransformer()

In [10]:
vec_train_x = vectorizer.fit_transform(preproc_train_data)
tfidf_train_x = tfidf_transformer.fit_transform(vec_train_x)

In [11]:
vec_test_x = vectorizer.transform(preproc_test_data)
tfidf_test_x = tfidf_transformer.transform(vec_test_x)

<h2>Classification after Non-Negative Matrix Factorization (NMF) reduction</h2>

In [12]:
nmf_model = NMF(n_components=50, init='random', random_state=0)

In [13]:
train_x = nmf_model.fit_transform(tfidf_train_x)
test_x = nmf_model.transform(tfidf_test_x)

<h3>Naive Bayes Classifier (One vs One)</h3>

In [14]:
classifier_NB = MultinomialNB().fit(train_x, train_y)

In [15]:
y_predict_nb = classifier_NB.predict(test_x)
y_predict_nb_prob = classifier_NB.predict_proba(test_x)

In [16]:
#skplt.metrics.plot_roc_curve(test_y, y_predict_nb_prob)
#plt.show()

In [17]:
confusion_matrix(test_y, y_predict_nb)

array([[316,  28,  44,   4],
       [108, 235,  38,   4],
       [ 56,  14, 315,   5],
       [  1,   0,   2, 395]])

In [18]:
print("Accuracy : ", accuracy_score(test_y, y_predict_nb))
#print("Precision : ",precision_score(test_y, y_predict_nb))
#print("Recall : ",recall_score(test_y, y_predict_nb))

print(classification_report(test_y, y_predict_nb, target_names=classes))

('Accuracy : ', 0.80575079872204469)
                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.66      0.81      0.72       392
   comp.sys.mac.hardware       0.85      0.61      0.71       385
            misc.forsale       0.79      0.81      0.80       390
  soc.religion.christian       0.97      0.99      0.98       398

             avg / total       0.82      0.81      0.80      1565



<h3>Naive Bayes (One vs Rest) </h3>

In [19]:
classifier_onevsrest = OneVsRestClassifier(classifier_NB)

classifier_onevsrest.fit(train_x, train_y)

OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          n_jobs=1)

In [20]:
y_predict_onevsrest = classifier_onevsrest.predict(test_x)
y_predict_onevsrest_prob = classifier_onevsrest.predict_proba(test_x)

In [21]:
confusion_matrix(test_y, y_predict_onevsrest)

array([[310,  36,  41,   5],
       [ 99, 243,  36,   7],
       [ 51,  17, 312,  10],
       [  0,   0,   1, 397]])

In [22]:
print("Accuracy : ", accuracy_score(test_y, y_predict_onevsrest))

print(classification_report(test_y, y_predict_onevsrest, target_names=classes))

('Accuracy : ', 0.80638977635782749)
                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.67      0.79      0.73       392
   comp.sys.mac.hardware       0.82      0.63      0.71       385
            misc.forsale       0.80      0.80      0.80       390
  soc.religion.christian       0.95      1.00      0.97       398

             avg / total       0.81      0.81      0.80      1565



<h3>SVM Multiclass Classifier (OneVsOne)</h3>

In [23]:
classifier_onevsone = svm.SVC(C=1000., probability=True, kernel='linear')
classifier_onevsone.fit(train_x, train_y)

SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
y_predict_onevsone = classifier_onevsone.predict(test_x)
y_predict_onevsone_prob = classifier_onevsone.predict_proba(test_x)

In [25]:
confusion_matrix(test_y, y_predict_onevsone)

array([[309,  60,  21,   2],
       [ 68, 287,  28,   2],
       [ 23,  23, 342,   2],
       [  6,   3,   2, 387]])

In [26]:
print("Accuracy : ", accuracy_score(test_y, y_predict_onevsone))

print(classification_report(test_y, y_predict_onevsone, target_names=classes))

('Accuracy : ', 0.84664536741214058)
                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.76      0.79      0.77       392
   comp.sys.mac.hardware       0.77      0.75      0.76       385
            misc.forsale       0.87      0.88      0.87       390
  soc.religion.christian       0.98      0.97      0.98       398

             avg / total       0.85      0.85      0.85      1565



<h3>SVM MultiClass Classifier (OneVsRest) </h3>

In [27]:
classifier_onevsall = OneVsRestClassifier(svm.SVC(C=1000., probability=True, kernel='linear'))
classifier_onevsall.fit(train_x, train_y)

OneVsRestClassifier(estimator=SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

In [28]:
y_predict_onevsall = classifier_onevsall.predict(test_x)

In [29]:
confusion_matrix(test_y, y_predict_onevsall)

array([[312,  50,  26,   4],
       [ 60, 295,  28,   2],
       [ 23,  21, 343,   3],
       [  2,   1,   1, 394]])

In [30]:
print("Accuracy : ", accuracy_score(test_y, y_predict_onevsall))
print(classification_report(test_y, y_predict_onevsall, target_names=classes))

('Accuracy : ', 0.85878594249201279)
                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.79      0.80      0.79       392
   comp.sys.mac.hardware       0.80      0.77      0.78       385
            misc.forsale       0.86      0.88      0.87       390
  soc.religion.christian       0.98      0.99      0.98       398

             avg / total       0.86      0.86      0.86      1565



<h2>Classification after Latent Semantic Indexing(LSI) reduction</h2>

In [31]:
svd_model = TruncatedSVD(n_components=50, random_state=0)

In [32]:
train_x_lsa = svd_model.fit_transform(tfidf_train_x)
test_x_lsa = svd_model.transform(tfidf_test_x)

<h3>SVM Multiclass Classifier (OneVsOne)</h3>

In [34]:
classifier_onevsone_lsa = svm.SVC(C=1000., probability=True, kernel='linear')
classifier_onevsone_lsa.fit(train_x_lsa, train_y)

SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [35]:
y_predict_onevsone_lsa = classifier_onevsone_lsa.predict(test_x_lsa)
y_predict_onevsone_prob_lsa = classifier_onevsone_lsa.predict_proba(test_x_lsa)

In [36]:
confusion_matrix(test_y, y_predict_onevsone_lsa)

array([[325,  45,  22,   0],
       [ 38, 318,  28,   1],
       [ 22,  17, 350,   1],
       [  5,   0,   3, 390]])

In [37]:
print("Accuracy : ", accuracy_score(test_y, y_predict_onevsone_lsa))

print(classification_report(test_y, y_predict_onevsone_lsa, target_names=classes))

('Accuracy : ', 0.88370607028753989)
                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.83      0.83      0.83       392
   comp.sys.mac.hardware       0.84      0.83      0.83       385
            misc.forsale       0.87      0.90      0.88       390
  soc.religion.christian       0.99      0.98      0.99       398

             avg / total       0.88      0.88      0.88      1565



<h3>SVM MultiClass Classifier (OneVsRest) </h3>

In [38]:
classifier_onevsall_lsa = OneVsRestClassifier(svm.SVC(C=1000., probability=True, kernel='linear'))
classifier_onevsall_lsa.fit(train_x_lsa, train_y)

OneVsRestClassifier(estimator=SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

In [39]:
y_predict_onevsall_lsa = classifier_onevsall_lsa.predict(test_x_lsa)

In [40]:
confusion_matrix(test_y, y_predict_onevsall_lsa)

array([[320,  53,  17,   2],
       [ 35, 323,  27,   0],
       [ 22,  17, 348,   3],
       [  4,   0,   1, 393]])

In [41]:
print("Accuracy : ", accuracy_score(test_y, y_predict_onevsall_lsa))
print(classification_report(test_y, y_predict_onevsall_lsa, target_names=classes))

('Accuracy : ', 0.88434504792332269)
                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.84      0.82      0.83       392
   comp.sys.mac.hardware       0.82      0.84      0.83       385
            misc.forsale       0.89      0.89      0.89       390
  soc.religion.christian       0.99      0.99      0.99       398

             avg / total       0.88      0.88      0.88      1565

