<a href="https://colab.research.google.com/github/singhmousam/DocumentClassificationUsingOCR/blob/master/bbcTextClassificationIncremental.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path
from keras.models import load_model
import os

Using TensorFlow backend.


In [0]:
data = pd.read_csv('bbc-text.csv')

In [0]:
set(data['category'])

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [0]:
# seperating data on basis of response variable
#set1 = data[(data['category'] == 'entertainment') | (data['category'] == 'sport') | (data['category'] == 'tech')]
#set2 = data[(data['category'] == 'politics') | (data['category'] == 'business')]

In [0]:
# dividing whole data into sets
set1 = data[0:500]
set2 = data[500:1000]
set3 = data[1000:]

## Training on Set1

In [0]:
text_1 = set1['text']
tags_1 = set1['category']

In [0]:
set(tags_1)

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [0]:
num_labels = 5
vocab_size = 50000
batch_size = 100
 
# define Tokenizer with Vocab Size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(text_1)
 
x_train_1 = tokenizer.texts_to_matrix(text_1, mode='tfidf')
 
encoder = LabelBinarizer()
encoder.fit(tags_1)

y_train_1 = encoder.transform(tags_1)

In [0]:
encoder.classes_

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype='<U13')

In [0]:
# model definition
model1 = Sequential()
model1.add(Dense(512, input_shape=(vocab_size,)))
model1.add(Activation('relu'))
model1.add(Dropout(0.3))
model1.add(Dense(512))
model1.add(Activation('relu'))
model1.add(Dropout(0.3))
model1.add(Dense(num_labels))
model1.add(Activation('softmax'))
model1.summary()
 
model1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = model1.fit(x_train_1, y_train_1,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               25600512  
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)  

In [0]:
model1.layers[:-2]

[<keras.layers.core.Dense at 0x7f056dce37f0>,
 <keras.layers.core.Activation at 0x7f056dce3390>,
 <keras.layers.core.Dropout at 0x7f05640d56a0>,
 <keras.layers.core.Dense at 0x7f05640d5438>,
 <keras.layers.core.Activation at 0x7f05640d5b38>,
 <keras.layers.core.Dropout at 0x7f0564081e10>]

In [0]:
# freezing layers
for layer in model1.layers[:-2]:
    layer.trainable = False

In [0]:
model1.save('bbcmodel1')

In [0]:
text_labels = encoder.classes_

## Testing on Set1

In [0]:
count = 0
for i in range(500):
    prediction = model1.predict(np.array([x_train_1[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_1.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_1.iloc[i])
        #print("Predicted label: " + predicted_label)
count

3

## Testing on Set2

In [0]:
text_2 = set2['text']
tags_2 = set2['category']

In [0]:
x_train_2 = tokenizer.texts_to_matrix(text_2, mode='tfidf')
encoder.fit(tags_2)
y_train_2 = encoder.transform(tags_2)

In [0]:
text_labels = encoder.classes_
text_labels

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype='<U13')

In [0]:
# Set2
count = 0
for i in range(500):
    prediction = model1.predict(np.array([x_train_2[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_2.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_2.iloc[i])
        #print("Predicted label: " + predicted_label)
count

28

## Model trained on set2

In [0]:
# model definition
modelset2 = Sequential()
modelset2.add(Dense(512, input_shape=(vocab_size,)))
modelset2.add(Activation('relu'))
modelset2.add(Dropout(0.3))
modelset2.add(Dense(512))
modelset2.add(Activation('relu'))
modelset2.add(Dropout(0.3))
modelset2.add(Dense(num_labels)) # num_labels
modelset2.add(Activation('softmax'))
modelset2.summary()
 
modelset2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = modelset2.fit(x_train_2, y_train_2,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 512)               25600512  
_________________________________________________________________
activation_7 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_8 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 5)                 2565      
__________

## Re-Training on set2

In [0]:
model2 = load_model('bbcmodel1')



In [0]:
model2.fit(x_train_2, y_train_2,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

Train on 450 samples, validate on 50 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7eff570595c0>

In [0]:
model2.save('bbcmodel2')

### Predicting model2 on set2 and set1

In [0]:
# Set2
count = 0
for i in range(500):
    prediction = model2.predict(np.array([x_train_2[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_2.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_2.iloc[i])
        #print("Predicted label: " + predicted_label)
count

12

In [0]:
# Set1
count = 0
for i in range(500):
    prediction = model2.predict(np.array([x_train_1[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_1.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_1.iloc[i])
        #print("Predicted label: " + predicted_label)
count

2

### Predicting modelset2 on SET 1 and SET 2

In [0]:
# Set2
count = 0
for i in range(500):
    prediction = modelset2.predict(np.array([x_train_2[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_2.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_2.iloc[i])
        #print("Predicted label: " + predicted_label)
count

2

In [0]:
# Set1
count = 0
for i in range(500):
    prediction = modelset2.predict(np.array([x_train_1[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_1.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_1.iloc[i])
        #print("Predicted label: " + predicted_label)
count

17

## Testing on set3 using both models

In [0]:
text_3 = set3['text']
tags_3 = set3['category']

In [0]:
x_train_3 = tokenizer.texts_to_matrix(text_3, mode='tfidf')
y_train_3 = encoder.transform(tags_3)

In [0]:
count = 0
for i in range(1000):
    prediction = model1.predict(np.array([x_train_3[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_3.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_3.iloc[i])
        #print("Predicted label: " + predicted_label)
count

52

In [0]:
count = 0
for i in range(1000):
    prediction = model2.predict(np.array([x_train_3[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_3.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_3.iloc[i])
        #print("Predicted label: " + predicted_label)
count

38

In [0]:
count = 0
for i in range(1000):
    prediction = modelset2.predict(np.array([x_train_3[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_3.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_3.iloc[i])
        #print("Predicted label: " + predicted_label)
count

40

## Retraining the model on set3 of data

In [0]:
model3 = load_model('bbcmodel2')

In [0]:
model3.fit(x_train_3, y_train_3,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

Train on 1102 samples, validate on 123 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7eff55fe00b8>

## prediction of model3 on all sets 

In [0]:
# set3
count = 0
for i in range(1000):
    prediction = model3.predict(np.array([x_train_3[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_3.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_3.iloc[i])
        #print("Predicted label: " + predicted_label)
count

33

In [0]:
# SET2
count = 0
for i in range(499):
    prediction = model3.predict(np.array([x_train_2[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_2.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_3.iloc[i])
        #print("Predicted label: " + predicted_label)
count

17

In [0]:
# SET1
count = 0
for i in range(499):
    prediction = model3.predict(np.array([x_train_1[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_1.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_3.iloc[i])
        #print("Predicted label: " + predicted_label)
count

0

# Preparing data

In [0]:
set1 = data[(data['category'] == 'entertainment') | (data['category'] == 'sport') | (data['category'] == 'tech') | (data['category'] == 'business')]
set2 = data[(data['category'] == 'politics') | (data['category'] == 'business') | (data['category'] == 'tech') | (data['category'] == 'sport')]

In [0]:
text_1 = set1['text']
tags_1 = set1['category']

In [0]:
num_labels = 5
vocab_size = 50000
batch_size = 100
 
# define Tokenizer with Vocab Size
#tokenizer.fit_on_texts(text_1)
 
x_train_1 = tokenizer.texts_to_matrix(text_1, mode='tfidf')
 
#encoder = LabelBinarizer()
#encoder.fit(tags_1)

y_train_1 = encoder.transform(tags_1)

In [0]:
# model definition
model1 = Sequential()
model1.add(Dense(512, input_shape=(vocab_size,)))
model1.add(Activation('relu'))
model1.add(Dropout(0.3))
model1.add(Dense(512))
model1.add(Activation('relu'))
model1.add(Dropout(0.3))
model1.add(Dense(num_labels))
model1.add(Activation('softmax'))
model1.summary()
 
model1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = model1.fit(x_train_1, y_train_1,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

In [0]:
text_labels = encoder.classes_
text_labels

In [0]:
# prediction on set1
count = 0
for i in range(500):
    prediction = modelset2.predict(np.array([x_train_1[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_1.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_1.iloc[i])
        #print("Predicted label: " + predicted_label)
count

In [0]:
text_2 = set2['text']
tags_2 = set2['category']

In [0]:
x_train_2 = tokenizer.texts_to_matrix(text_2, mode='tfidf')
#encoder = LabelBinarizer()
#encoder.fit(tags_2)
y_train_2 = encoder.transform(tags_2)

In [0]:
text_labels = encoder.classes_

In [0]:
# Set2
count = 0
for i in range(500):
    prediction = modelset2.predict(np.array([x_train_2[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_2.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_2.iloc[i])
        #print("Predicted label: " + predicted_label)
count

In [0]:
model1.fit(x_train_2, y_train_2,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

In [0]:
# model definition
modelset2 = Sequential()
modelset2.add(Dense(512, input_shape=(vocab_size,)))
modelset2.add(Activation('relu'))
modelset2.add(Dropout(0.3))
modelset2.add(Dense(512))
modelset2.add(Activation('relu'))
modelset2.add(Dropout(0.3))
modelset2.add(Dense(num_labels)) # num_labels
modelset2.add(Activation('softmax'))
modelset2.summary()
 
modelset2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = modelset2.fit(x_train_2, y_train_2,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

# Trial sklearn library

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [0]:
y_train_1[0]

array([0, 0, 0, 0, 1])

In [0]:
x_train_1.shape

(500, 50000)

In [0]:
clfobj = MultinomialNB()
ovr = OneVsRestClassifier(clfobj)

In [0]:
clf = ovr.fit(x_train_1,y_train_1)

In [0]:
hex(id(clf))

'0x7f0567d5d6d8'

In [0]:
clf3 = clf.fit(x_train_2,y_train_2)

In [0]:
hex(id(clf3))

'0x7f0567d5d6d8'

In [0]:
# Set2
count = 0
for i in range(500):
    prediction = clf.predict(np.array([x_train_1[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_1.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_2.iloc[i])
        #print("Predicted label: " + predicted_label)
count

26

In [0]:
# Set2
count = 0
for i in range(500):
    prediction = clf.predict(np.array([x_train_2[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    if tags_2.iloc[i] != predicted_label:
        count+=1
        #print('Actual label:' + tags_2.iloc[i])
        #print("Predicted label: " + predicted_label)
count

2

In [0]:
clf2 = ovr.fit(x_train_2,y_train_2)