In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data_path = "/root/data/"

df_fake = pd.read_csv(data_path + "Fake.csv")
df_true = pd.read_csv(data_path + "True.csv")
df_fake['label'] = 1  # Fake news label
df_true['label'] = 0   # Real news label

In [2]:
df = pd.concat([df_true,df_fake])
df = df.drop(['title','subject','date'],axis=1)

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def load_glove_vectors(glove_file_path):
    word_vectors = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            word_vectors[word] = vector
    return word_vectors

def preprocess_text(text):
    text = text.lower()                              # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)          # Remove special characters and numbers
    tokens = word_tokenize(text)                     # Tokenization
    stop_words = set(stopwords.words('english'))     # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # lemmatizer = WordNetLemmatizer()                 # Lemmatization  ex: running => run (not nesscessary, but i want to test)
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

glove_path = "/root/data/glove/"
word_vectors = load_glove_vectors(glove_path + "glove.6B.100d.txt")


df['clean_text'] = df['text'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

def tokens_to_vectors(tokens):
    vectors = []
    for token in tokens:
        if token in word_vectors:
            vectors.append(word_vectors[token])
    if not vectors: # return none if no valid word vectors found 
        return None
    return np.mean(vectors, axis=0)

df['doc_vector'] = df['clean_text'].apply(tokens_to_vectors)
df = df.dropna()



df_2nd = pd.read_csv(data_path +"2nd/" +"train.csv")
df_2nd = df_2nd.drop(['title','author','id'],axis=1)

df_2nd['text'] = df_2nd['text'].astype(str)
df_2nd['clean_text'] = df_2nd['text'].apply(preprocess_text)
df_2nd['doc_vector'] = df_2nd['clean_text'].apply(tokens_to_vectors)
df_2nd = df_2nd.dropna()

df_mix = pd.concat([df,df_2nd])

X_train, X_test, y_train, y_test = train_test_split(df_mix['doc_vector'].to_list() , df_mix['label'], test_size=0.2, random_state=42)

X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



knn_classifier = KNeighborsClassifier(n_neighbors=10)  
knn_classifier.fit(X_train, y_train)

knn_y_pred = knn_classifier.predict(X_test)


knn_accuracy = accuracy_score(y_test, knn_y_pred)
print("Accuracy:", knn_accuracy)
print('----------------')
print('Confusion matrix')
print(confusion_matrix(knn_y_pred,y_test))
print('----------------')
print('Classification report')
print(classification_report(knn_y_pred,y_test))


Accuracy: 0.8430631603300687
----------------
Confusion matrix
[[5853 1562]
 [ 473 5079]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.93      0.79      0.85      7415
           1       0.76      0.91      0.83      5552

    accuracy                           0.84     12967
   macro avg       0.85      0.85      0.84     12967
weighted avg       0.86      0.84      0.84     12967



In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



rf_classifier = RandomForestClassifier(n_estimators=100)  
rf_classifier.fit(X_train, y_train)

rf_y_pred = rf_classifier.predict(X_test)


rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy:", rf_accuracy)
print('----------------')
print('Confusion matrix')
print(confusion_matrix(rf_y_pred,y_test))
print('----------------')
print('Classification report')
print(classification_report(rf_y_pred,y_test))


Accuracy: 0.8840903832806355
----------------
Confusion matrix
[[5389  566]
 [ 937 6075]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.85      0.90      0.88      5955
           1       0.91      0.87      0.89      7012

    accuracy                           0.88     12967
   macro avg       0.88      0.89      0.88     12967
weighted avg       0.89      0.88      0.88     12967



In [7]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

height = 100
width = 1
channels = 1

X_train = np.array(X_train)
X_test = np.array(X_test)

X_train = np.squeeze(X_train)
X_test = np.squeeze(X_test)


print(X_train.shape)



model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(100, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


history = model.fit(X_train, y_train, epochs=50, batch_size=128, validation_split=0.1)
loss, accuracy = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

2024-05-28 08:41:01.525038: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-28 08:41:01.545860: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(51866, 100)


  super().__init__(


Epoch 1/50
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7949 - loss: 0.4525 - val_accuracy: 0.8477 - val_loss: 0.3468
Epoch 2/50
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8597 - loss: 0.3280 - val_accuracy: 0.8356 - val_loss: 0.3638
Epoch 3/50
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8641 - loss: 0.3131 - val_accuracy: 0.8649 - val_loss: 0.3110
Epoch 4/50
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8738 - loss: 0.3005 - val_accuracy: 0.8812 - val_loss: 0.2871
Epoch 5/50
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8797 - loss: 0.2887 - val_accuracy: 0.8834 - val_loss: 0.2763
Epoch 6/50
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8849 - loss: 0.2769 - val_accuracy: 0.8672 - val_loss: 0.3103
Epoch 7/50
[1m365/365[0m 

In [8]:
y_pred_cnn = np.round(y_pred).flatten()
cnn_accuracy = accuracy_score(y_test, y_pred_cnn)
print('----------------')
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred_cnn))
print('----------------')
print('Classification report')
print(classification_report(y_test, y_pred_cnn))

----------------
Confusion matrix
[[5550  776]
 [ 376 6265]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.94      0.88      0.91      6326
           1       0.89      0.94      0.92      6641

    accuracy                           0.91     12967
   macro avg       0.91      0.91      0.91     12967
weighted avg       0.91      0.91      0.91     12967

