In [2]:
import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from keras.layers import MaxPool1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
import pandas as pd

In [None]:
# TEXT_DATA_DIR = r'../input/20-newsgroup-original/20_newsgroup/20_newsgroup/'
#the path for Glove embeddings
GLOVE_DIR = r'./../tmp/glove/'
# make the max word length to be constant
MAX_WORDS = 10000
MAX_SEQUENCE_LENGTH = 1000
# the percentage of train test split to be applied
VALIDATION_SPLIT = 0.50
# the dimension of vectors to be used
EMBEDDING_DIM = 100
# filter sizes of the different conv layers 
filter_sizes = [3,4,5]
num_filters = 512
embedding_dim = 100
# dropout probability
drop = 0.5
batch_size = 30
epochs = 50

In [120]:
df_cnn=pd.read_csv('./../data/preprocessed_nov_23_df_cnn_topic_combined.csv')
texts_cnn=df_cnn["clean_text"].tolist()
labels_cnn=df_cnn["topicEncoded"].to_list() 

In [121]:
print(len(texts_cnn))
print(len(labels_cnn))

1517
1517


In [122]:
tokenizer  = Tokenizer(num_words = MAX_WORDS)
tokenizer.fit_on_texts(texts_cnn)
sequences =  tokenizer.texts_to_sequences(texts_cnn)

word_index = tokenizer.word_index
print("unique words : {}".format(len(word_index)))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels_cnn = to_categorical(np.asarray(labels_cnn))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels_cnn.shape)
print(labels_cnn)

unique words : 6326
Shape of data tensor: (1517, 1000)
Shape of label tensor: (1517, 5)
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]]


In [123]:
indices = np.arange(data.shape[0])
print(indices)
np.random.shuffle(indices)
data = data[indices]
labels = labels_cnn[indices]

[   0    1    2 ... 1514 1515 1516]


In [124]:
from sklearn.model_selection import train_test_split
X_train, X_rem, y_train, y_rem = train_test_split(data,labels, train_size=0.7)

In [125]:

nb_validation_samples = int(VALIDATION_SPLIT * X_rem.shape[0])
x_test = X_rem[:-nb_validation_samples]
y_test = y_rem[:-nb_validation_samples]
x_val = X_rem[-nb_validation_samples:]
y_val = y_rem[-nb_validation_samples:]

In [126]:
print(X_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
print(x_val.shape)
print(y_val.shape)

(1061, 1000)
(1061, 5)
(228, 1000)
(228, 5)
(228, 1000)
(228, 5)


In [None]:
import os
import zipfile
with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
    zip_ref.extractall('.//tmp/glove')

In [127]:
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [128]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [129]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [130]:
inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding = embedding_layer(inputs)

print(embedding.shape)
reshape = Reshape((MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,1))(embedding)
print(reshape.shape)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=5, activation='softmax')(dropout)

# this creates a model that includes
model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


(None, 1000, 100)
(None, 1000, 100, 1)
Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 1000)]       0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, 1000, 100)    632700      ['input_7[0][0]']                
                                                                                                  
 reshape_6 (Reshape)            (None, 1000, 100, 1  0           ['embedding_5[0][0]']            
                                )                                                                 
                                                                                                  
 conv2d_18 (Conv2D)             (None, 998, 1, 512)  

  super().__init__(name, **kwargs)


In [131]:
print("Traning Model...")
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(x_val, y_val))

Traning Model...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x197ac8fbc40>

# Training and Testing on Left

In [134]:
test_loss, test_acc = model.evaluate(x_test,  y_test, verbose=2)
print('Test Accuracy for training and testing on the cnn dataset: %f' % (test_acc*100))

8/8 - 1s - loss: 0.3551 - accuracy: 0.8860 - 1s/epoch - 126ms/step
Test Accuracy for training and testing on the cnn dataset: 88.596493


# Training on left and testing it on right

In [138]:
df_fox=pd.read_csv('./real_groundth_truth/preprocessed_nov_23_df_fox_topic_combined.csv')
texts_fox=df_fox["clean_text"].tolist()
labels_fox=df_fox["topicEncoded"].to_list()

df_reuters=pd.read_csv('./real_groundth_truth/preprocessed_nov_23_df_reuters_topic_combined.csv')
texts_reuters=df_reuters["clean_text"].tolist()
labels_reuters=df_reuters["topicEncoded"].to_list()

In [139]:
tokenizer  = Tokenizer(num_words = MAX_WORDS)
tokenizer.fit_on_texts(texts_fox)
sequences =  tokenizer.texts_to_sequences(texts_fox)

word_index = tokenizer.word_index
print("unique words : {}".format(len(word_index)))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels_fox = to_categorical(np.asarray(labels_fox))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels_fox.shape)
print(labels_fox)

unique words : 5376
Shape of data tensor: (1584, 1000)
Shape of label tensor: (1584, 5)
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]]


In [140]:
x_test=data
y_test=labels_fox

In [141]:
test_loss, test_acc = model.evaluate(x_test,  y_test, verbose=2)

50/50 - 8s - loss: 1.4853 - accuracy: 0.5808 - 8s/epoch - 150ms/step


# Training and testing on right data

In [142]:
df_fox=pd.read_csv('./real_groundth_truth/preprocessed_nov_23_df_fox_topic_combined.csv')
texts_fox=df_fox["clean_text"].tolist()
labels_fox=df_fox["topicEncoded"].to_list()

In [143]:
tokenizer  = Tokenizer(num_words = MAX_WORDS)
tokenizer.fit_on_texts(texts_fox)
sequences =  tokenizer.texts_to_sequences(texts_fox)

word_index = tokenizer.word_index
print("unique words : {}".format(len(word_index)))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels_fox = to_categorical(np.asarray(labels_fox))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels_fox.shape)
print(labels_fox)

unique words : 5376
Shape of data tensor: (1584, 1000)
Shape of label tensor: (1584, 5)
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]]


In [144]:
indices = np.arange(data.shape[0])
print(indices)
np.random.shuffle(indices)
data = data[indices]
labels = labels_fox[indices]

[   0    1    2 ... 1581 1582 1583]


In [145]:
from sklearn.model_selection import train_test_split
X_train, X_rem, y_train, y_rem = train_test_split(data,labels, train_size=0.7)

In [146]:
nb_validation_samples = int(VALIDATION_SPLIT * X_rem.shape[0])
x_test = X_rem[:-nb_validation_samples]
y_test = y_rem[:-nb_validation_samples]
x_val = X_rem[-nb_validation_samples:]
y_val = y_rem[-nb_validation_samples:]

In [147]:
print(X_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
print(x_val.shape)
print(y_val.shape)

(1108, 1000)
(1108, 5)
(238, 1000)
(238, 5)
(238, 1000)
(238, 5)


In [148]:
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [149]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [None]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [150]:
inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding = embedding_layer(inputs)

print(embedding.shape)
reshape = Reshape((MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,1))(embedding)
print(reshape.shape)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=5, activation='softmax')(dropout)

# this creates a model that includes
model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


(None, 1000, 100)
(None, 1000, 100, 1)
Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 1000)]       0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, 1000, 100)    632700      ['input_8[0][0]']                
                                                                                                  
 reshape_7 (Reshape)            (None, 1000, 100, 1  0           ['embedding_5[1][0]']            
                                )                                                                 
                                                                                                  
 conv2d_21 (Conv2D)             (None, 998, 1, 512)  

  super().__init__(name, **kwargs)


In [151]:
print("Traning Model...")
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(x_val, y_val))

Traning Model...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x197b7fd2a70>

In [152]:
test_loss, test_acc = model.evaluate(x_test,  y_test, verbose=2)
print('Test Accuracy for training and testing on the cnn dataset: %f' % (test_acc*100))

8/8 - 1s - loss: 0.8536 - accuracy: 0.7605 - 1s/epoch - 184ms/step
Test Accuracy for training and testing on the cnn dataset: 76.050419


# Training on Right Data and Testing it on left data

In [153]:
df_cnn=pd.read_csv('./real_groundth_truth/preprocessed_nov_23_df_cnn_topic_combined.csv')
texts_cnn=df_cnn["clean_text"].tolist()
labels_cnn=df_cnn["topicEncoded"].to_list()

In [154]:
tokenizer  = Tokenizer(num_words = MAX_WORDS)
tokenizer.fit_on_texts(texts_cnn)
sequences =  tokenizer.texts_to_sequences(texts_cnn)

word_index = tokenizer.word_index
print("unique words : {}".format(len(word_index)))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels_cnn = to_categorical(np.asarray(labels_cnn))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels_cnn.shape)
print(labels_cnn)

unique words : 6326
Shape of data tensor: (1517, 1000)
Shape of label tensor: (1517, 5)
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]]


In [155]:
indices = np.arange(data.shape[0])
print(indices)
np.random.shuffle(indices)
data = data[indices]
labels = labels_cnn[indices]

[   0    1    2 ... 1514 1515 1516]


In [156]:
x_test=data
y_test=labels

In [158]:
test_loss, test_acc = model.evaluate(x_test,  y_test, verbose=2)
print('Test Accuracy for training on the fox dataset and testing on the cnn dataset: %f' % (test_acc*100))

48/48 - 8s - loss: 1.9979 - accuracy: 0.5346 - 8s/epoch - 167ms/step
Test Accuracy for training on the fox dataset and testing on the cnn dataset: 53.460777
