In [49]:
from sklearn.datasets import fetch_20newsgroups
train_master=fetch_20newsgroups(subset='train', shuffle=1)
test_master=fetch_20newsgroups(subset='test',shuffle=1)

In [131]:
fetch_20newsgroups

<function sklearn.datasets.twenty_newsgroups.fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)>

In [3]:
from string import punctuation

def clean_n_tokenize(doc):
    tokens=doc.split()
    table=str.maketrans('','',punctuation)
    tokens=[w.translate(table) for w in tokens]
    tokens=[w for w in tokens if w.isalpha()]
    tokens=[w for w in tokens if len(w)>2]
    return tokens

In [27]:
words=set()
vocab={}

for doc in train_master.data:
    tokens=clean_n_tokenize(doc)
    for t in tokens:
        if t in vocab:
            vocab[t]+=1
        else:
            vocab[t]=1

In [28]:
for w in vocab:
    if vocab[w]>10:
        words.add(w)

In [29]:
len(vocab)

113318

In [30]:

len(words)

16546

In [43]:
# Random testing.
tokenizer=Tokenizer()
tokenizer.fit_on_texts("This is a test line, for the tokenizer operation")
encoded_docs = tokenizer.texts_to_sequences("This is a test line, for the tokenizer operation")

In [45]:
tokens = "This is a test line, for the tokenizer operation".split()
tokens

['This', 'is', 'a', 'test', 'line,', 'for', 'the', 'tokenizer', 'operation']

In [47]:
#removing words from the trimmed corpus, i.e words whose overall frequency in the whole vocab is less than 10.
#also puts the line in array format, numpy is faster to work with.
train_trimmed=[]
for doc in train_master.data:
    selected_tokens=[]
    tokens=doc.split()
    for t in tokens:
        if t in words:
            selected_tokens.append(t)
    line=' '.join(selected_tokens)
    train_trimmed.append(line)
    
test_trimmed=[]
for doc in test_master.data:
    selected_tokens=[]
    tokens=doc.split()
    for t in tokens:
        if t in words:
            selected_tokens.append(t)
    line=' '.join(selected_tokens)
    test_trimmed.append(line)

In [69]:
#method to embed the text.

import numpy as np

def use_embedding(path, encoding):
    file=open(path,'r',encoding=encoding)
    lines=file.readlines()[1:]
    file.close()
    embedding=dict()
    for l in lines:
        words=l.split()
        embedding[words[0]]=np.asarray(words[1:],dtype='float32')
    return embedding

In [66]:
#enconde the words, then pad'em.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras

tokenizer=Tokenizer()
tokenizer.fit_on_texts(train_trimmed)
train_encoded=tokenizer.texts_to_sequences(train_trimmed)
test_encoeded=tokenizer.texts_to_sequences(test_trimmed)

max_len=max([len(s.split()) for s in train_trimmed])

x_train=pad_sequences(train_encoded,padding='post', maxlen=max_len)
x_test=pad_sequences(test_encoeded,padding='post', maxlen=max_len)

y_train=keras.utils.to_categorical(train_master.target,num_classes=20)
y_test=keras.utils.to_categorical(test_master.target,num_classes=20)

In [88]:
vocab_size=len(tokenizer.word_index)+1
raw_embiddings=use_embedding('./glove.6B.100d.txt', 'utf8')

wt_mat=np.zeros((vocab_size,100))
for word,i in tokenizer.word_index.items():
    if word in raw_embiddings:
        wt_mat[i]=raw_embiddings[word]

In [100]:
from keras import layers
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

embedding_layer=keras.layers.Embedding(vocab_size, 100, weights=[wt_mat], input_length=max_len, trainable=1)

model=Sequential()
model.add(Embedding(vocab_size,100, input_length=max_len))
model.add(Conv1D(filters=16, kernel_size=16, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(20, activation='softmax'))
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 6966, 100)         1267100   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 6951, 16)          25616     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 3475, 16)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 55600)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 55600)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1112020   
Total params: 2,404,736
Trainable params: 2,404,736
Non-trainable params: 0
____________________________________________

In [101]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=3, verbose=2, validation_data = (x_test,y_test))

Epoch 1/3
354/354 - 1700s - loss: 2.4703 - accuracy: 0.2034 - val_loss: 1.8505 - val_accuracy: 0.4016
Epoch 2/3
354/354 - 1139s - loss: 1.1537 - accuracy: 0.6358 - val_loss: 1.4070 - val_accuracy: 0.5786
Epoch 3/3
354/354 - 1165s - loss: 0.5213 - accuracy: 0.8485 - val_loss: 1.5275 - val_accuracy: 0.6174


<tensorflow.python.keras.callbacks.History at 0x120a241d0>

In [102]:
loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9427
Testing Accuracy:  0.6174


In [125]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

  return f(*args, **kwds)


In [126]:
model_one=Pipeline([('tfidf_v',TfidfVectorizer()),('svm',SGDClassifier(loss='hinge',alpha=1e-3))])
model_one=model_one.fit(train_trimmed,train_master.target)

model_one_predict=model_one.predict(test_master.data)
np.mean(model_one_predict==test_master.target)

0.7813329792883696

In [128]:
from sklearn.model_selection import GridSearchCV
param_model_one={'tfidf_v__ngram_range':[(1, 1),(1,2)],
                  'tfidf_v__use_idf':(True,False),
                  'svm__alpha':(1e-2,1e-3)}
gs_model_one=GridSearchCV(model_one,param_model_one,n_jobs=-1)

gs_model_one=gs_model_one.fit(train_trimmed,train_master.target)



In [129]:
print(gs_model_one.best_score_)
print(gs_model_one.best_params_)

0.8645041541453067
{'svm__alpha': 0.01, 'tfidf_v__ngram_range': (1, 2), 'tfidf_v__use_idf': True}


In [133]:
text=['We are trying to learn NLP','NLP is fun and easy']

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(text)
vectorizer.vocabulary_

{'We': 1,
 'are': 3,
 'trying': 9,
 'to': 8,
 'learn': 7,
 'NLP': 0,
 'is': 6,
 'fun': 5,
 'and': 2,
 'easy': 4}

In [134]:
vectorizer.transform(text).toarray()

array([[1, 1, 0, 1, 0, 0, 0, 1, 1, 1],
       [1, 0, 1, 0, 1, 1, 1, 0, 0, 0]])