In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data_path = "/root/data/"

df_fake = pd.read_csv(data_path + "Fake.csv")
df_true = pd.read_csv(data_path + "True.csv")
df_fake['label'] = 1  # Fake news label
df_true['label'] = 0   # Real news label
df_fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [2]:
df_true.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [3]:
df = pd.concat([df_true,df_fake])
df = df.drop(['title','subject','date'],axis=1)
df.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,0
1,WASHINGTON (Reuters) - Transgender people will...,0
2,WASHINGTON (Reuters) - The special counsel inv...,0
3,WASHINGTON (Reuters) - Trump campaign adviser ...,0
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,0


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Preprocess text
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    text = text.lower()                              # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)          # Remove special characters and numbers
    tokens = word_tokenize(text)                     # Tokenization
    stop_words = set(stopwords.words('english'))     # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # lemmatizer = WordNetLemmatizer()                 # Lemmatization  ex: running => run (not nesscessary, but i want to test)
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

glove_path = "/root/data/glove/"

df['clean_text'] = df['text'].apply(preprocess_text)
df.head()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,label,clean_text
0,WASHINGTON (Reuters) - The head of a conservat...,0,"[washington, reuters, head, conservative, repu..."
1,WASHINGTON (Reuters) - Transgender people will...,0,"[washington, reuters, transgender, people, all..."
2,WASHINGTON (Reuters) - The special counsel inv...,0,"[washington, reuters, special, counsel, invest..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,0,"[washington, reuters, trump, campaign, adviser..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,0,"[seattlewashington, reuters, president, donald..."


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


# Tokenization and padding
MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 100

# tokenizer turns texts to tokens
# e.g ['Some ThING to eat !', 'some thing to drink .']
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['clean_text'])
# after fit_on_texts, it creates a word_index:[('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]), showing the freq of the words
sequences = tokenizer.texts_to_sequences(df['clean_text'])
# turn text into a sequence of integers based on word_index dict 
# [[1, 2, 3, 4], [1, 2, 3, 5]]

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

# Makes sure that the sequences are of the same length
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = df['label'].values
labels = to_categorical(np.asarray(labels),num_classes = 2)

print(data.shape)
print(labels.shape)

# Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
X_test, x_val, y_test, y_val = train_test_split( data, labels, test_size=0.50, random_state=42)

print('Size of train, validation, test:', len(y_train), len(y_val), len(y_test))

print('real & fake news in train,valt,test:')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))
print(y_test.sum(axis=0))

2024-05-21 07:18:15.135628: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-21 07:18:15.360731: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found 212255 unique tokens.
(44898, 1000)
(44898, 2)
Size of train, validation, test: 35918 22449 22449
real & fake news in train,valt,test:
[17087. 18831.]
[10665. 11784.]
[10752. 11697.]


In [6]:
from tensorflow.keras.layers import Embedding

# loading GloVe
def load_glove_vectors(glove_file_path):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_path = "/root/data/glove/"
embeddings_index = load_glove_vectors(glove_path + "glove.6B.100d.txt")

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            # weights=[embedding_matrix],
                            # input_length=MAX_SEQUENCE_LENGTH,
                            # input_shape=1000,
                            trainable=False)

In [7]:
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from keras.layers import concatenate
from keras.models import Model
from keras.optimizers import Adadelta
# Define the model
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

convs = []
filter_sizes = [3, 4, 5]

for fsz in filter_sizes:
    l_conv = Conv1D(filters=128, kernel_size=fsz, activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(pool_size=5)(l_conv)
    convs.append(l_pool)

l_merge = concatenate(convs, axis=1)
l_cov1 = Conv1D(filters=128, kernel_size=5, activation='relu')(l_merge)
l_pool1 = MaxPooling1D(pool_size=5)(l_cov1)
l_cov2 = Conv1D(filters=128, kernel_size=5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(pool_size=30)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='sigmoid')(l_dense)

model2 = Model(sequence_input, preds)
model2.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

model2.summary()


In [8]:

# history2 = model2.fit(X_train, y_train, epochs=50, batch_size=128, validation_split=0.1)
history2 = model2.fit(X_train, y_train, validation_data=(x_val, y_val), epochs=3, batch_size=50)

# Evaluate the model
loss, accuracy = model2.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Save the model
model2.save('model.h5')



Epoch 1/3
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 132ms/step - accuracy: 0.9131 - loss: 0.1657 - val_accuracy: 0.9988 - val_loss: 0.0056
Epoch 2/3
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 134ms/step - accuracy: 0.9994 - loss: 0.0028 - val_accuracy: 0.9989 - val_loss: 0.0090
Epoch 3/3
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 134ms/step - accuracy: 0.9994 - loss: 0.0025 - val_accuracy: 0.9988 - val_loss: 0.0100
[1m702/702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - accuracy: 0.9996 - loss: 7.1368e-04




Test Loss: 0.0004367835936136544
Test Accuracy: 0.9997327327728271


In [9]:
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from keras.layers import concatenate
from keras.models import Model
from keras.optimizers import Adadelta
# Define the model
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

convs = []
filter_sizes = [3, 4, 5]

for fsz in filter_sizes:
    l_conv = Conv1D(filters=128, kernel_size=fsz, activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(pool_size=5)(l_conv)
    convs.append(l_pool)

l_merge = concatenate(convs, axis=1)
l_cov1 = Conv1D(filters=128, kernel_size=5, activation='relu')(l_merge)
l_pool1 = MaxPooling1D(pool_size=5)(l_cov1)
l_cov2 = Conv1D(filters=128, kernel_size=5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(pool_size=30)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='sigmoid')(l_dense)

model2 = Model(sequence_input, preds)
model2.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

model2.summary()


In [10]:

# history2 = model2.fit(X_train, y_train, epochs=50, batch_size=128, validation_split=0.1)
history2 = model2.fit(X_train, y_train, validation_data=(x_val, y_val), epochs=3, batch_size=50)

# Evaluate the model
loss, accuracy = model2.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Save the model
model2.save('model.h5')



Epoch 1/3
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 133ms/step - accuracy: 0.9800 - loss: 0.0514 - val_accuracy: 0.9994 - val_loss: 0.0041
Epoch 2/3
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 229ms/step - accuracy: 0.9992 - loss: 0.0038 - val_accuracy: 0.9994 - val_loss: 0.0061
Epoch 3/3
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 110ms/step - accuracy: 1.0000 - loss: 8.0259e-05 - val_accuracy: 0.9987 - val_loss: 0.0124
[1m702/702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.9994 - loss: 0.0026




Test Loss: 0.0022324284072965384
Test Accuracy: 0.9993763566017151


In [9]:
print(df['doc_vector'][0].to_list())

[array([-0.077617  ,  0.18136153,  0.26305065, -0.23296304,  0.13417992,
       -0.10997217, -0.26759976,  0.08891775, -0.04428137, -0.00249818,
       -0.0246676 , -0.05060842, -0.04418219, -0.07292591, -0.05181812,
       -0.2623384 ,  0.10799746,  0.07042778, -0.3686943 ,  0.04454444,
        0.19263002, -0.00511991,  0.08577151, -0.00627958, -0.24894513,
       -0.17845991, -0.18504605, -0.3723434 , -0.1552869 , -0.14986892,
        0.20181009,  0.40377313, -0.03158143, -0.01449121, -0.09444003,
        0.251681  , -0.01294147, -0.00443321, -0.09014698,  0.1217403 ,
       -0.43103194, -0.38282764,  0.20436859,  0.00753224, -0.15110278,
       -0.2562862 ,  0.02853767, -0.28838778, -0.16440338, -0.5995348 ,
        0.07711659, -0.19094424,  0.07214498,  0.7335864 , -0.04020729,
       -1.7825102 ,  0.2175383 , -0.30491725,  1.3524702 ,  0.39681745,
       -0.09299772,  0.18816169, -0.0870368 , -0.08841023,  0.5517362 ,
        0.03153479,  0.09992132,  0.39916977,  0.30359983, -0.1

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



knn_classifier = KNeighborsClassifier(n_neighbors=10)  
knn_classifier.fit(X_train, y_train)

y_pred = knn_classifier.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print('----------------')
print('Confusion matrix')
print(confusion_matrix(y_pred,y_test))
print('----------------')
print('Classification report')
print(classification_report(y_pred,y_test))


Accuracy: 0.9215708465368945
----------------
Confusion matrix
[[4135  531]
 [ 162 4008]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.96      0.89      0.92      4666
           1       0.88      0.96      0.92      4170

    accuracy                           0.92      8836
   macro avg       0.92      0.92      0.92      8836
weighted avg       0.92      0.92      0.92      8836



In [16]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

height = 100
width = 1
channels = 1

X_train = np.array(X_train)
X_test = np.array(X_test)

X_train = np.squeeze(X_train)
X_test = np.squeeze(X_test)


print(X_train.shape)
input_shape = X_train.shape[1:]
print(input_shape)

(35343, 100)
(100,)


In [14]:
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(100, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


history = model.fit(X_train, y_train, epochs=50, batch_size=128, validation_split=0.1)
loss, accuracy = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Epoch 1/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8517 - loss: 0.3444 - val_accuracy: 0.9219 - val_loss: 0.1994
Epoch 2/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9230 - loss: 0.1940 - val_accuracy: 0.9361 - val_loss: 0.1740
Epoch 3/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9272 - loss: 0.1835 - val_accuracy: 0.9225 - val_loss: 0.1946
Epoch 4/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9337 - loss: 0.1717 - val_accuracy: 0.9412 - val_loss: 0.1582
Epoch 5/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9392 - loss: 0.1572 - val_accuracy: 0.9423 - val_loss: 0.1550
Epoch 6/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9400 - loss: 0.1556 - val_accuracy: 0.9460 - val_loss: 0.1454
Epoch 7/50
[1m249/249[0m 

KeyboardInterrupt: 

In [82]:
y_pred_binary = np.round(y_pred).flatten()
print('----------------')
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred_binary))
print('----------------')
print('Classification report')
print(classification_report(y_test, y_pred_binary))

----------------
Confusion matrix
[[4181  116]
 [ 207 4332]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      4297
           1       0.97      0.95      0.96      4539

    accuracy                           0.96      8836
   macro avg       0.96      0.96      0.96      8836
weighted avg       0.96      0.96      0.96      8836



In [83]:
df_2nd = pd.read_csv(data_path +"2nd/" +"train.csv")
df_2nd = df_2nd.drop(['title','author','id'],axis=1)
df_2nd.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [84]:
df_2nd['text'] = df_2nd['text'].astype(str)
df_2nd['clean_text'] = df_2nd['text'].apply(preprocess_text)
df_2nd['doc_vector'] = df_2nd['clean_text'].apply(tokens_to_vectors)
df_2nd = df_2nd.dropna()
df_2nd.head()

Unnamed: 0,text,label,clean_text,doc_vector
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,"[house, dem, aide, didnt, even, see, comeys, l...","[-0.050807018, 0.04975615, 0.2658656, -0.32393..."
1,Ever get the feeling your life circles the rou...,0,"[ever, get, feeling, life, circles, roundabout...","[-0.0006217707, 0.215978, 0.294425, -0.0798217..."
2,"Why the Truth Might Get You Fired October 29, ...",1,"[truth, might, get, fired, october, tension, i...","[-0.041612357, 0.21105361, 0.30817404, -0.0764..."
3,Videos 15 Civilians Killed In Single US Airstr...,1,"[videos, civilians, killed, single, us, airstr...","[-0.1113405, 0.11081573, 0.092530034, -0.14825..."
4,Print \nAn Iranian woman has been sentenced to...,1,"[print, iranian, woman, sentenced, six, years,...","[0.19707523, 0.09317374, 0.060490813, -0.18551..."


In [85]:
X_test = df_2nd['doc_vector'].to_list() 
y_test = df_2nd['label']

X_test = np.asarray(X_test).astype(np.float32)

loss, accuracy = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[1m149/646[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 681us/step - accuracy: 0.6012 - loss: 2.0298

[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 653us/step - accuracy: 0.6001 - loss: 2.0123
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 687us/step
Test Loss: 1.9965863227844238
Test Accuracy: 0.5989638566970825


In [86]:
y_pred_binary = np.round(y_pred).flatten()
print('----------------')
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred_binary))
print('----------------')
print('Classification report')
print(classification_report(y_test, y_pred_binary))

----------------
Confusion matrix
[[3565 6821]
 [1462 8806]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.71      0.34      0.46     10386
           1       0.56      0.86      0.68     10268

    accuracy                           0.60     20654
   macro avg       0.64      0.60      0.57     20654
weighted avg       0.64      0.60      0.57     20654



In [87]:
X_train, X_test, y_train, y_test = train_test_split(df_2nd['doc_vector'].to_list() , df_2nd['label'], test_size=0.2, random_state=42)

X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)

history2 = model.fit(X_train, y_train, epochs=50, batch_size=128, validation_split=0.1)
loss, accuracy = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Epoch 1/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7151 - loss: 0.7146 - val_accuracy: 0.7925 - val_loss: 0.4445
Epoch 2/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8039 - loss: 0.4261 - val_accuracy: 0.8234 - val_loss: 0.3847
Epoch 3/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8279 - loss: 0.3875 - val_accuracy: 0.8318 - val_loss: 0.3783
Epoch 4/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8336 - loss: 0.3723 - val_accuracy: 0.8409 - val_loss: 0.3657
Epoch 5/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8472 - loss: 0.3545 - val_accuracy: 0.8397 - val_loss: 0.3557
Epoch 6/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8497 - loss: 0.3490 - val_accuracy: 0.8548 - val_loss: 0.3391
Epoch 7/50
[1m117/117[0m 

In [88]:
y_pred_binary = np.round(y_pred).flatten()
print('----------------')
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred_binary))
print('----------------')
print('Classification report')
print(classification_report(y_test, y_pred_binary))

----------------
Confusion matrix
[[1878  217]
 [ 285 1751]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      2095
           1       0.89      0.86      0.87      2036

    accuracy                           0.88      4131
   macro avg       0.88      0.88      0.88      4131
weighted avg       0.88      0.88      0.88      4131



In [89]:
def single_vectorize(input_text):
    inp_df = pd.DataFrame([input_text],columns=["text"])
    inp_df["text"]=inp_df["text"].apply(preprocess_text)
    inp_df["text"]=inp_df["text"].apply(tokens_to_vectors)
    return inp_df["text"].to_list()

test_text = """
House Dem Aide: We Didn? Even See Comey? Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) 
With apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week?BI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton? email server, the ranking Democrats on the relevant committees didn? hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. 
As we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewing emails it had recently discovered in order to see if they contained classified information. Not long after this letter went out, Oversight Committee Chairman Jason Chaffetz set the political world ablaze with this tweet. FBI Dir just informed me, "The FBI has learned of the existence of emails that appear to be pertinent to the investigation." Case reopened 
??Jason Chaffetz (@jasoninthehouse) October 28, 2016 
Of course, we now know that this was not the case . Comey was actually saying that it was reviewing the emails in light of ?n unrelated case?hich we now know to be Anthony Weiner? sexting with a teenager. But apparently such little things as facts didn? matter to Chaffetz. The Utah Republican had already vowed to initiate a raft of investigations if Hillary wins?t least two years??worth, and possibly an entire term? worth of them. Apparently Chaffetz thought the FBI was already doing his work for him?esulting in a tweet that briefly roiled the nation before cooler heads realized it was a dud. 
But according to a senior House Democratic aide, misreading that letter may have been the least of Chaffetz??sins. That aide told Shareblue that his boss and other Democrats didn? even know about Comey? letter at the time?nd only found out when they checked Twitter. ?emocratic Ranking Members on the relevant committees didn? receive Comey? letter until after the Republican Chairmen. In fact, the Democratic Ranking Members didn??receive it until after the Chairman of the Oversight and Government Reform Committee, Jason Chaffetz, tweeted it out and made it public.??
So let? see if we?e got this right. The FBI director tells Chaffetz and other GOP committee chairmen about a major development in a potentially politically explosive investigation, and neither Chaffetz nor his other colleagues had the courtesy to let their Democratic counterparts know about it. Instead, according to this aide, he made them find out about it on Twitter. 
There has already been talk on Daily Kos that Comey himself provided advance notice of this letter to Chaffetz and other Republicans, giving them time to turn on the spin machine. That may make for good theater, but there is nothing so far that even suggests this is the case. After all, there is nothing so far that suggests that Comey was anything other than grossly incompetent and tone-deaf. 
What it does suggest, however, is that Chaffetz is acting in a way that makes Dan Burton and Darrell Issa look like models of responsibility and bipartisanship. He didn? even have the decency to notify ranking member Elijah Cummings about something this explosive. If that doesn? trample on basic standards of fairness, I don? know what does. 
Granted, it? not likely that Chaffetz will have to answer for this. He sits in a ridiculously Republican district anchored in Provo and Orem; it has a Cook Partisan Voting Index of R+25, and gave Mitt Romney a punishing 78 percent of the vote in 2012. Moreover, the Republican House leadership has given its full support to Chaffetz??planned fishing expedition. But that doesn? mean we can? turn the hot lights on him. After all, he is a textbook example of what the House has become under Republican control. And he is also the Second Worst Person in the World. About Darrell Lucus 
Darrell is a 30-something graduate of the University of North Carolina who considers himself a journalist of the old school. An attempt to turn him into a member of the religious right in college only succeeded in turning him into the religious right's worst nightmare--a charismatic Christian who is an unapologetic liberal. His desire to stand up for those who have been scared into silence only increased when he survived an abusive three-year marriage. You may know him on Daily Kos as Christian Dem in NC . Follow him on Twitter @DarrellLucus or connect with him on Facebook . Click here to buy Darrell a Mello Yello. Connect
"""


single_input = single_vectorize(test_text)  
single_input_reshaped = np.reshape(single_input, (1, 100))
single_input_reshaped = np.asarray(single_input_reshaped).astype(np.float32)
prediction = model.predict(single_input_reshaped)

print(prediction)

predicted_class = 1 if prediction > 0.5 else 0
print("Predicted Class:", predicted_class)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[[0.71298003]]
Predicted Class: 1
