<a href="https://colab.research.google.com/github/prrmzz/FakeNewsDetection/blob/main/FakeNewsDetection_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# directory used by main() lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import csv
import os
import pickle
import traceback
from string import punctuation

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Dense, Flatten, Embedding, Conv1D, MaxPool1D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
def load_datasets(base_path):
    """
    Load train, test, and validation datasets from separate TSV files.

    Each split is read from ``<base_path>/<split>.tsv`` as a header-less,
    tab-separated file and labelled with the 14 column names listed below.

    Quote handling is switched off (``csv.QUOTE_NONE``). With pandas' default
    quoting, a ``"`` at the start of a free-text field opens a quoted field
    that swallows the following tabs and newlines until the next ``"``, which
    silently merges several records into one. With quoting disabled every
    physical line is one record and quote characters stay in the text.

    Args:
        base_path: Directory that contains ``train.tsv``, ``test.tsv`` and
            ``valid.tsv``.

    Returns:
        dict mapping ``"train"``, ``"test"`` and ``"valid"`` to the DataFrame
        read from the corresponding file.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when a split
            file does not exist.
    """
    column_names = [
        "id", "label", "statement", "subject", "speaker", "job_title", "state_info",
        "party_affiliation", "barely_true_counts", "false_counts", "half_true_counts",
        "mostly_true_counts", "pants_on_fire_counts", "context"
    ]

    datasets = {}
    for split in ["train", "test", "valid"]:
        file_path = os.path.join(base_path, f"{split}.tsv")
        datasets[split] = pd.read_csv(
            file_path,
            sep='\t',
            header=None,
            names=column_names,
            quoting=csv.QUOTE_NONE,  # treat '"' as ordinary text, never as a field wrapper
        )
    return datasets

In [4]:
def preprocess_text(data, column):
    """
    Return a copy of ``data`` whose ``column`` holds cleaned, space-joined tokens.

    Cleaning steps applied to every cell, in order:
      1. lower-case the text and split it with ``word_tokenize``;
      2. delete every ``string.punctuation`` character from each token;
      3. drop English stop words and tokens that are not purely alphabetic;
      4. join the surviving tokens with single spaces.

    Cells that are not strings (for example missing values) become ``''``
    instead of raising ``AttributeError`` on ``.lower()``.

    The input frame is not modified; the cleaned frame is returned.

    Args:
        data: DataFrame containing ``column``.
        column: Name of the text column to clean.

    Returns:
        A new DataFrame with the same columns as ``data`` and the cleaned text
        stored in ``column``.
    """
    # NOTE(review): the notebook downloads 'punkt_tab' separately before
    # running the pipeline, presumably because the installed NLTK needs it for
    # word_tokenize. It is fetched here as well so this function does not rely
    # on another cell having been run first.
    for resource in ('punkt', 'punkt_tab', 'stopwords'):
        nltk.download(resource, quiet=True)

    stop_words = set(stopwords.words('english'))
    translator = str.maketrans('', '', punctuation)

    def clean_text(text):
        # Non-string cells (NaN, None, numbers) carry no usable words.
        if not isinstance(text, str):
            return ''
        stripped = (token.translate(translator) for token in word_tokenize(text.lower()))
        return ' '.join(
            token for token in stripped
            if token.isalpha() and token not in stop_words
        )

    # Work on a copy so the caller's frame keeps its raw text and re-running
    # the cell always starts from the same input.
    cleaned = data.copy()
    cleaned[column] = cleaned[column].apply(clean_text)
    return cleaned

In [5]:
def simplify_labels(data):
    """
    Return a copy of ``data`` with an added integer ``binary_label`` column.

    ``binary_label`` is 1 when ``label`` is one of ``'half-true'``,
    ``'mostly-true'`` or ``'true'`` and 0 for every other value, including
    missing labels.

    The input frame is not modified; a new frame is returned.

    Args:
        data: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame equal to ``data`` plus the ``binary_label`` column.
    """
    truthful_labels = ['half-true', 'mostly-true', 'true']
    # Vectorised membership test; assign() builds a new frame instead of
    # writing into the caller's object.
    is_truthful = data['label'].isin(truthful_labels)
    return data.assign(binary_label=is_truthful.astype('int64'))

In [6]:
def prepare_sequences(texts, tokenizer=None, max_len=None):
    """
    Turn texts into integer sequences padded to a common length.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Already fitted tokenizer to reuse. When ``None`` a new
            ``Tokenizer`` is created and fitted on ``texts``.
        max_len: Target sequence length. When ``None`` (or 0) the length of
            the longest encoded sequence in ``texts`` is used; for empty
            ``texts`` this falls back to 0 instead of raising ``ValueError``.

    Returns:
        Tuple ``(padded_sequences, tokenizer, max_len)``: the array produced
        by ``pad_sequences`` (padding added at the end of each sequence), the
        tokenizer that was used, and the sequence length that was applied.
    """
    # Identity check: only build a tokenizer when none was supplied, rather
    # than relying on the truthiness of whatever object was passed in.
    if tokenizer is None:
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)

    sequences = tokenizer.texts_to_sequences(texts)

    if not max_len:
        # default=0 keeps an empty batch from crashing max().
        max_len = max((len(seq) for seq in sequences), default=0)

    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
    return padded_sequences, tokenizer, max_len

In [7]:
def build_cnn_model(input_length, vocab_size, embedding_dim=128, filters=64,
                    kernel_size=4, dropout_rate=0.5, dense_units=64):
    """
    Build and compile a 1-D convolutional binary text classifier.

    Layer stack: Embedding -> Conv1D (ReLU) -> Dropout -> MaxPool1D (pool 2)
    -> Flatten -> Dense (ReLU) -> Dense(1, sigmoid). The model is compiled
    with binary cross-entropy loss, the Adam optimizer and the accuracy
    metric. The default hyper-parameters reproduce the original fixed
    architecture.

    Args:
        input_length: Length of each padded input sequence.
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        filters: Number of convolution filters.
        kernel_size: Width of the convolution window.
        dropout_rate: Fraction of units dropped after the convolution.
        dense_units: Width of the hidden dense layer.

    Returns:
        The compiled Keras ``Model``.
    """
    input_layer = Input(shape=(input_length,))
    embedding_layer = Embedding(vocab_size, embedding_dim)(input_layer)
    conv_layer = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(embedding_layer)
    dropout_layer = Dropout(dropout_rate)(conv_layer)
    pooling_layer = MaxPool1D(pool_size=2)(dropout_layer)
    flatten_layer = Flatten()(pooling_layer)
    dense_layer = Dense(dense_units, activation='relu')(flatten_layer)
    # Single sigmoid unit: the output is the predicted probability of class 1.
    output_layer = Dense(1, activation='sigmoid')(dense_layer)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [8]:
def train_and_evaluate(datasets, base_path, epochs=10, batch_size=32, callbacks=None):
    """
    Preprocess the splits, train the CNN, save the artifacts and print test metrics.

    Steps:
      1. Clean the ``statement`` column and derive ``binary_label`` for the
         train, test and validation splits.
      2. Fit a tokenizer on the training statements only and encode all three
         splits with it, padded to the training maximum length.
      3. Train the model from ``build_cnn_model`` with the validation split as
         validation data.
      4. Save the model to ``<base_path>/liar_cnn_model.h5`` and pickle the
         tokenizer to ``<base_path>/tokenizer.pkl``.
      5. Print test loss and accuracy, a classification report and a
         confusion matrix for the test split (predictions thresholded at 0.5).

    Args:
        datasets: dict with ``'train'``, ``'test'`` and ``'valid'`` DataFrames,
            each holding ``statement`` and ``label`` columns.
        base_path: Directory the model and tokenizer files are written to.
        epochs: Number of training epochs.
        batch_size: Training batch size.
        callbacks: Optional list of Keras callbacks forwarded to ``model.fit``
            (for example early stopping); ``None`` trains without callbacks.

    Returns:
        None. Results are printed and artifacts are written to ``base_path``.
    """
    # Every split goes through exactly the same cleaning + labelling steps.
    prepared = {
        split: simplify_labels(preprocess_text(datasets[split], 'statement'))
        for split in ('train', 'test', 'valid')
    }
    train_data, test_data, valid_data = prepared['train'], prepared['test'], prepared['valid']

    # The tokenizer and max_len come from the training split only and are
    # reused for test/valid so all inputs share one vocabulary and length.
    X_train_padded, tokenizer, max_len = prepare_sequences(train_data['statement'])
    X_test_padded, _, _ = prepare_sequences(test_data['statement'], tokenizer, max_len)
    X_valid_padded, _, _ = prepare_sequences(valid_data['statement'], tokenizer, max_len)

    y_train = train_data['binary_label'].to_numpy()
    y_test = test_data['binary_label'].to_numpy()
    y_valid = valid_data['binary_label'].to_numpy()

    # +1 because index 0 is used for padding and is not in word_index.
    vocab_size = len(tokenizer.word_index) + 1
    model = build_cnn_model(max_len, vocab_size)
    model.fit(
        X_train_padded,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_valid_padded, y_valid),
        callbacks=callbacks,
    )

    model.save(os.path.join(base_path, 'liar_cnn_model.h5'))
    with open(os.path.join(base_path, 'tokenizer.pkl'), 'wb') as f:
        pickle.dump(tokenizer, f)

    loss, accuracy = model.evaluate(X_test_padded, y_test, verbose=0)
    print(f"Test Loss: {loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    # predict() returns one column per output unit; flatten to 1-D so the
    # sklearn metrics receive label vectors of matching shape.
    y_pred = (model.predict(X_test_padded) > 0.5).astype("int32").ravel()

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=["Fake", "True"]))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [12]:
# Fetch the 'punkt_tab' tokenizer data before the pipeline runs.
# NOTE(review): presumably required by word_tokenize (used in preprocess_text)
# on the installed NLTK version — confirm.
import nltk
nltk.download('punkt_tab')

def main(base_path='/content/drive/MyDrive/LIAR.ds'):
    """
    Run the whole pipeline: load the splits, then train and evaluate the model.

    Progress is reported with ``print``. Errors are reported, not re-raised:
    a ``FileNotFoundError`` prints a hint to check the path, and any other
    exception prints its message followed by the full traceback so the
    failing step can be located.

    Args:
        base_path: Directory holding the TSV splits; it is also passed to
            ``train_and_evaluate``. Defaults to the dataset folder on the
            mounted Google Drive.
    """
    print("Starting the process...")

    try:
        print("Loading datasets...")
        datasets = load_datasets(base_path)

        for split, data in datasets.items():
            print(f"Loaded {split} dataset with {len(data)} records.")

        print("Starting training and evaluation...")
        train_and_evaluate(datasets, base_path)

        print("Process completed successfully!")

    except FileNotFoundError as fnfe:
        print(f"FileNotFoundError: {fnfe}. Please verify that the path and files are correct.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # The message alone rarely says where the failure happened; keep the
        # best-effort behaviour but show the stack for debugging.
        traceback.print_exc()

# Run the pipeline when this cell/script is executed as the main module.
if __name__ == '__main__':
    main()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Starting the process...
Loading datasets...
Loaded train dataset with 10240 records.
Loaded test dataset with 1267 records.
Loaded valid dataset with 1284 records.
Starting training and evaluation...
Epoch 1/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 88ms/step - accuracy: 0.5628 - loss: 0.6823 - val_accuracy: 0.5857 - val_loss: 0.6708
Epoch 2/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 91ms/step - accuracy: 0.7282 - loss: 0.5598 - val_accuracy: 0.5717 - val_loss: 0.7245
Epoch 3/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 86ms/step - accuracy: 0.8997 - loss: 0.2569 - val_accuracy: 0.5740 - val_loss: 0.8885
Epoch 4/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 90ms/step - accuracy: 0.9671 - loss: 0.1002 - val_accuracy: 0.5802 - val_loss: 1.3207
Epoch 5/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 90ms/step - accuracy: 0.9823 - loss: 0.0492 - val_accuracy: 0.5771



Test Loss: 2.7315
Test Accuracy: 0.5588
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step

Classification Report:
              precision    recall  f1-score   support

        Fake       0.49      0.46      0.48       553
        True       0.60      0.63      0.62       714

    accuracy                           0.56      1267
   macro avg       0.55      0.55      0.55      1267
weighted avg       0.56      0.56      0.56      1267


Confusion Matrix:
[[256 297]
 [262 452]]
Process completed successfully!
