In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Flatten, Conv2D, MaxPool2D, Conv1D, MaxPool1D, GlobalAveragePooling1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
import re
%matplotlib inline

Using TensorFlow backend.


In [5]:
data1 = pd.read_csv("political.csv")

In [6]:
data2 = pd.read_csv("genuine.csv")

In [7]:
data1Sample = data1.sample(frac = 0.1)
data2Sample = data2.sample(frac = 0.1)

In [8]:
dataFull = data1Sample.append(data2Sample, ignore_index=True, sort=False)

In [9]:
dataFull.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 3 columns):
content    14850 non-null object
label      14850 non-null int64
count      14850 non-null int64
dtypes: int64(2), object(1)
memory usage: 348.1+ KB


In [10]:
X = dataFull.content.astype(str)
Y = dataFull.label
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

In [11]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)
x_train = []
for each in X_train:
    x_train.append(each)
x_test = []
for each in X_test:
    x_test.append(each)

In [12]:
encoded_matrix = []
for i in range(0,len(x_train)):
    x_train[i] = x_train[i].lower()
    x_train[i] = re.sub("[^a-z]", "", x_train[i])
    sentence = x_train[0]
    encoding = np.zeros((150,26))
    for i in range(0,len(sentence)):
        encoding[i][ord(sentence[i])-97] = 1
    encoded_matrix.append(encoding)

In [13]:
matrix = np.stack(encoded_matrix)
matrix = matrix.reshape(matrix.shape[0], 150,26, 1)

In [14]:
np.shape(matrix)

(12622, 150, 26, 1)

In [15]:
def CharCNN():
    inputs = Input(name='inputs',shape=[150,26,1])
    layer = Conv2D(32, kernel_size=(5, 5), strides=(1, 1), activation='relu', input_shape=[150,26]) (inputs)
    layer = MaxPool2D(pool_size=(2, 2), strides=(2, 2)) (layer)
    layer = Conv2D(64, kernel_size=(5, 5), strides=(1, 1), activation='relu', input_shape=[150,26]) (inputs)
    layer = MaxPool2D(pool_size=(2, 2), strides=(1, 1)) (layer)
    layer = Flatten() (layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [16]:
model = CharCNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150, 26, 1)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 146, 22, 64)       1664      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 145, 21, 64)       0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 194880)            0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 194881    
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 196,545
Trainable params: 196,545
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.fit(matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 10097 samples, validate on 2525 samples
Epoch 1/10
Epoch 2/10


<keras.callbacks.History at 0x7f5ac2204fd0>

In [36]:
encoded_matrix[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [21]:
encoded_matrix = []
for i in range(0,len(x_test)):
    x_test[i] = x_test[i].lower()
    x_test[i] = re.sub("[^a-z]", "", x_test[i])
    sentence = x_test[0]
    encoding = np.zeros((150,26))
    for i in range(0,len(sentence)):
        encoding[i][ord(sentence[i])-97] = 1
    encoded_matrix.append(encoding)

In [22]:
test_matrix = np.stack(encoded_matrix)
test_matrix = test_matrix.reshape(test_matrix.shape[0], 150,26, 1)

In [24]:
accr = model.evaluate(test_matrix,Y_test)



In [25]:
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.693
  Accuracy: 0.387
