In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data_path = "/root/data/"

df_fake = pd.read_csv(data_path + "Fake.csv")
df_true = pd.read_csv(data_path + "True.csv")
df_fake['label'] = 1  # Fake news label
df_true['label'] = 0   # Real news label
df = pd.concat([df_true,df_fake])
df = df.drop(['title','subject','date'],axis=1)


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Preprocess text
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    if not isinstance(text, str):
        text = ''
    text = text.lower()                              # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)          # Remove special characters and numbers
    tokens = word_tokenize(text)                     # Tokenization
    stop_words = set(stopwords.words('english'))     # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # lemmatizer = WordNetLemmatizer()                 # Lemmatization  ex: running => run (not nesscessary, but i want to test)
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

glove_path = "/root/data/glove/"

df['clean_text'] = df['text'].apply(preprocess_text)
df.head()

"""
這裡看起來是處裡文字的函式，將文字tokenize
"""

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'\n這裡看起來是處裡文字的函式，將文字tokenize\n'

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


# Tokenization and padding
MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 100

# tokenizer turns texts to tokens
# e.g ['Some ThING to eat !', 'some thing to drink .']
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['clean_text'])
# after fit_on_texts, it creates a word_index:[('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]), showing the freq of the words
sequences = tokenizer.texts_to_sequences(df['clean_text'])
# turn text into a sequence of integers based on word_index dict 
# [[1, 2, 3, 4], [1, 2, 3, 5]]

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

# Makes sure that the sequences are of the same length
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = df['label'].values

print(data.shape)
print(labels.shape)

# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)


2024-05-28 11:06:18.425466: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-28 11:06:18.445180: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found 212255 unique tokens.
(44898, 1000)
(44898,)


In [4]:
from tensorflow.keras.layers import Embedding

# loading GloVe
def load_glove_vectors(glove_file_path):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_path = "/root/data/glove/"
embeddings_index = load_glove_vectors(glove_path + "glove.6B.100d.txt")

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            # weights=[embedding_matrix],
                            # input_length=MAX_SEQUENCE_LENGTH,
                            # input_shape=1000,
                            trainable=False)

In [5]:
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from keras.layers import concatenate
from keras.models import Model
from keras.optimizers import Adadelta
# Define the model
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

convs = []
filter_sizes = [3, 4, 5]

for fsz in filter_sizes:
    l_conv = Conv1D(filters=128, kernel_size=fsz, activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(pool_size=5)(l_conv)
    convs.append(l_pool)

l_merge = concatenate(convs, axis=1)
l_cov1 = Conv1D(filters=128, kernel_size=5, activation='relu')(l_merge)
l_pool1 = MaxPooling1D(pool_size=5)(l_cov1)
l_cov2 = Conv1D(filters=128, kernel_size=5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(pool_size=30)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(1, activation='sigmoid')(l_dense)


model2 = Model(sequence_input, preds)
model2.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

model2.summary()


In [6]:
X_train = np.array(X_train)
X_test = np.array(X_test)

X_train = np.squeeze(X_train)
X_test = np.squeeze(X_test)

print(y_test)


# history2 = model2.fit(X_train, y_train, epochs=50, batch_size=128, validation_split=0.1)
history2 = model2.fit(X_train, y_train, validation_split = 0.1, epochs=3, batch_size=50)

# # Evaluate the model
loss, accuracy = model2.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# # Save the model
# model2.save('model.h5')



[1 1 1 ... 1 0 0]
Epoch 1/3
[1m647/647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 107ms/step - accuracy: 0.8945 - loss: 0.1772 - val_accuracy: 0.9989 - val_loss: 0.0053
Epoch 2/3
[1m647/647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 106ms/step - accuracy: 0.9997 - loss: 0.0018 - val_accuracy: 0.9989 - val_loss: 0.0065
Epoch 3/3
[1m647/647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 158ms/step - accuracy: 0.9996 - loss: 0.0044 - val_accuracy: 0.9981 - val_loss: 0.0046
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 44ms/step - accuracy: 0.9946 - loss: 0.0242
Test Loss: 0.023861130699515343
Test Accuracy: 0.9957683682441711


In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


print(y_test)

loss, accuracy = model2.evaluate(X_test, y_test)
y_pred = model2.predict(X_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


y_pred_binary = np.round(y_pred).flatten()
print('----------------')
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred_binary))
print('----------------')
print('Classification report')
print(classification_report(y_test, y_pred_binary))

[1 1 1 ... 1 0 0]
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 44ms/step - accuracy: 0.9946 - loss: 0.0242
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 43ms/step
Test Loss: 0.023861130699515343
Test Accuracy: 0.9957683682441711
----------------
Confusion matrix
[[4325    5]
 [  33 4617]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4330
           1       1.00      0.99      1.00      4650

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

