In [6]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [8]:
x_train = newsgroups_train.data
x_test = newsgroups_test.data


y_train = newsgroups_train.target
y_test = newsgroups_test.target


print ("List of all 20 categories:")
print (newsgroups_train.target_names)

List of all 20 categories:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [12]:
print ("\n")
print ("Sample Email:")
print (x_train[0])
print('y_train:-',y_train[0])



Sample Email:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





y_train:- 7


In [13]:
print ("Sample Target Category:")
print (y_train[0])
print (newsgroups_train.target_names[y_train[0]])

Sample Target Category:
7
rec.autos


# import nltk and preprocess libraries

In [39]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer

nltk.download('punkt') ## error without this cmd
nltk.download('stopwords') 
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [40]:
def preprocessing(text):
  text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
  tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
  tokens = [word.lower() for word in tokens]
  stopwds = stopwords.words('english')
  tokens = [token for token in tokens if token not in stopwds]
  tokens = [word for word in tokens if len(word)>=3]
  stemmer = PorterStemmer()
  tokens = [stemmer.stem(word) for word in tokens]

  stemmer = PorterStemmer()
  tokens = [stemmer.stem(word) for word in tokens]

  tagged_corpus = pos_tag(tokens)
  Noun_tags = ['NN','NNP','NNPS','NNS']
  Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

  lemmatizer = WordNetLemmatizer()

  def prat_lemmatize(token,tag):
    if tag in Noun_tags:
      return lemmatizer.lemmatize(token,'n')
    elif tag in Verb_tags:
      return lemmatizer.lemmatize(token,'v')
    else:
      return lemmatizer.lemmatize(token,'n')

  pre_proc_text = " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])

  return pre_proc_text

# Applying pre-processing on train and test data:

In [41]:
x_train_preprocessed = []
for i in x_train:
  x_train_preprocessed.append(preprocessing(i))

In [42]:
x_train_preprocessed[0]

'lerxst wam umd edu thing subject car nntp post host rac3 wam umd edu organ univ maryland colleg park line wonder anyon could enlighten car saw day door sport car look late 60 earli 70 call bricklin door realli small addit front bumper separ rest bodi know anyon tellm model name engin spec year product car make histori whatev info funki look car plea mail thank bring neighborhood lerxst'

In [43]:
x_test_preprocessed = []
for i in x_test:
  x_test_preprocessed.append(preprocessing(i))

In [44]:
x_test_preprocessed[0]

'v064mb9k ubvmsd buffalo edu neil gandler subject need info bonnevil organ univ buffalo line news softwar vax vm vnew nntp post host ubvmsd buffalo edu littl confu model bonnevil heard lse sse ssei could someon tell differ far featur perform also curiou know book valu prefer model much le book valu usual get word much demand time year heard mid spring earli summer best time buy neil gandler'

# building TFIDF vectorizer

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),
stop_words='english', max_features= 10000,strip_accents='unicode',
norm='l2')

In [46]:
x_train_2 = vectorizer.fit_transform(x_train_preprocessed).todense()

x_test_2 = vectorizer.transform(x_test_preprocessed).todense()

# Deep Learning modules

In [47]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adadelta,Adam,RMSprop
from keras.utils import np_utils

In [48]:
np.random.seed(1337)
nb_classes = 20
batch_size = 64
nb_epochs = 20

In [49]:
Y_train = np_utils.to_categorical(y_train, nb_classes)

#Deep Layer Model building in Keras

In [50]:
model = Sequential()
model.add(Dense(1000,input_shape= (10000,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(500))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
print (model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              10001000  
_________________________________________________________________
activation (Activation)      (None, 1000)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               500500    
_________________________________________________________________
activation_1 (Activation)    (None, 500)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2

# Start training 

In [51]:
model.fit(x_train_2, Y_train, batch_size=batch_size,
epochs=nb_epochs,verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fda37faaf90>

#Model Prediction


In [52]:
y_train_predclass = model.predict_classes(x_train_2,batch_size=batch_size)
y_test_predclass = model.predict_classes(x_test_2,batch_size=batch_size)
from sklearn.metrics import accuracy_score,classification_report



In [54]:
print ("\n\nDeep Neural Network - Train accuracy:"),
(round(accuracy_score( y_train, y_train_predclass),3))



Deep Neural Network - Train accuracy:


0.999

In [55]:
print ("\nDeep Neural Network - Test accuracy:")
(round(accuracy_score(y_test,y_test_predclass),3))


Deep Neural Network - Test accuracy:


0.808

In [56]:
print ("\nDeep Neural Network - Train Classification Report")
print (classification_report(y_train,y_train_predclass))


Deep Neural Network - Train Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       480
           1       1.00      1.00      1.00       584
           2       1.00      1.00      1.00       591
           3       1.00      1.00      1.00       590
           4       1.00      1.00      1.00       578
           5       1.00      1.00      1.00       593
           6       1.00      0.99      1.00       585
           7       1.00      1.00      1.00       594
           8       1.00      1.00      1.00       598
           9       1.00      1.00      1.00       597
          10       1.00      1.00      1.00       600
          11       1.00      1.00      1.00       595
          12       1.00      1.00      1.00       591
          13       1.00      1.00      1.00       594
          14       1.00      1.00      1.00       593
          15       1.00      1.00      1.00       599
          16       1.00      1

In [57]:
print ("\nDeep Neural Network - Test Classification Report")
print (classification_report(y_test,y_test_predclass))


Deep Neural Network - Test Classification Report
              precision    recall  f1-score   support

           0       0.78      0.74      0.76       319
           1       0.71      0.71      0.71       389
           2       0.67      0.69      0.68       394
           3       0.65      0.70      0.68       392
           4       0.72      0.81      0.77       385
           5       0.80      0.74      0.77       395
           6       0.82      0.79      0.81       390
           7       0.90      0.85      0.87       396
           8       0.89      0.93      0.91       398
           9       0.88      0.91      0.89       397
          10       0.94      0.97      0.96       399
          11       0.92      0.91      0.91       396
          12       0.70      0.73      0.71       393
          13       0.89      0.79      0.83       396
          14       0.88      0.93      0.90       394
          15       0.88      0.85      0.86       398
          16       0.77      0.