In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data_path = "/root/data/"

df_fake = pd.read_csv(data_path + "Fake.csv")
df_true = pd.read_csv(data_path + "True.csv")
df_fake['label'] = 1  # Fake news label
df_true['label'] = 0   # Real news label
df = pd.concat([df_true,df_fake])
df = df.drop(['title','subject','date'],axis=1)

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def load_glove_vectors(glove_file_path):
    word_vectors = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            word_vectors[word] = vector
    return word_vectors

def preprocess_text(text):
    text = text.lower()                              # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)          # Remove special characters and numbers
    tokens = word_tokenize(text)                     # Tokenization
    stop_words = set(stopwords.words('english'))     # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # lemmatizer = WordNetLemmatizer()                 # Lemmatization  ex: running => run (not nesscessary, but i want to test)
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

glove_path = "/root/data/glove/"
word_vectors = load_glove_vectors(glove_path + "glove.6B.100d.txt")


df['clean_text'] = df['text'].apply(preprocess_text)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

def tokens_to_vectors(tokens):
    vectors = []
    for token in tokens:
        if token in word_vectors:
            vectors.append(word_vectors[token])
    if not vectors: # return none if no valid word vectors found 
        return None
    return np.mean(vectors, axis=0)

df['doc_vector'] = df['clean_text'].apply(tokens_to_vectors)
df = df.dropna()

X_train, X_test, y_train, y_test = train_test_split(df['doc_vector'].tolist(), df['label'], test_size=0.2, random_state=42)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,label,clean_text,doc_vector
0,WASHINGTON (Reuters) - The head of a conservat...,0,"[washington, reuters, head, conservative, repu...","[-0.077617005, 0.18136153, 0.26305065, -0.2329..."
1,WASHINGTON (Reuters) - Transgender people will...,0,"[washington, reuters, transgender, people, all...","[0.06912143, -0.054227337, 0.12197351, -0.0733..."
2,WASHINGTON (Reuters) - The special counsel inv...,0,"[washington, reuters, special, counsel, invest...","[-0.096503265, -0.047262732, 0.27596813, -0.22..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,0,"[washington, reuters, trump, campaign, adviser...","[-0.092887245, -0.15168507, 0.23935626, -0.110..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,0,"[seattlewashington, reuters, president, donald...","[-0.018570144, 0.09180134, 0.21725424, -0.1467..."


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



knn_classifier = KNeighborsClassifier(n_neighbors=10)  
knn_classifier.fit(X_train, y_train)

knn_y_pred = knn_classifier.predict(X_test)


knn_accuracy = accuracy_score(y_test, knn_y_pred)
print("Accuracy:", knn_accuracy)
print('----------------')
print('Confusion matrix')
print(confusion_matrix(knn_y_pred,y_test))
print('----------------')
print('Classification report')
print(classification_report(knn_y_pred,y_test))


Accuracy: 0.9215708465368945
----------------
Confusion matrix
[[4135  531]
 [ 162 4008]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.96      0.89      0.92      4666
           1       0.88      0.96      0.92      4170

    accuracy                           0.92      8836
   macro avg       0.92      0.92      0.92      8836
weighted avg       0.92      0.92      0.92      8836



In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



rf_classifier = RandomForestClassifier(n_estimators=100)  
rf_classifier.fit(X_train, y_train)

rf_y_pred = rf_classifier.predict(X_test)


rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy:", rf_accuracy)
print('----------------')
print('Confusion matrix')
print(confusion_matrix(rf_y_pred,y_test))
print('----------------')
print('Classification report')
print(classification_report(rf_y_pred,y_test))


Accuracy: 0.9488456315074695
----------------
Confusion matrix
[[4043  198]
 [ 254 4341]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      4241
           1       0.96      0.94      0.95      4595

    accuracy                           0.95      8836
   macro avg       0.95      0.95      0.95      8836
weighted avg       0.95      0.95      0.95      8836



In [5]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

height = 100
width = 1
channels = 1

X_train = np.array(X_train)
X_test = np.array(X_test)

X_train = np.squeeze(X_train)
X_test = np.squeeze(X_test)


print(X_train.shape)



model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(100, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


history = model.fit(X_train, y_train, epochs=50, batch_size=128, validation_split=0.1)
loss, accuracy = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

2024-05-28 08:21:10.849066: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-28 08:21:10.866758: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(35343, 100)


  super().__init__(


Epoch 1/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8832 - loss: 0.3070 - val_accuracy: 0.9052 - val_loss: 0.2161
Epoch 2/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9244 - loss: 0.1959 - val_accuracy: 0.9349 - val_loss: 0.1724
Epoch 3/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9280 - loss: 0.1825 - val_accuracy: 0.9417 - val_loss: 0.1587
Epoch 4/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9365 - loss: 0.1667 - val_accuracy: 0.9211 - val_loss: 0.1903
Epoch 5/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9347 - loss: 0.1651 - val_accuracy: 0.9440 - val_loss: 0.1516
Epoch 6/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9409 - loss: 0.1568 - val_accuracy: 0.9463 - val_loss: 0.1426
Epoch 7/50
[1m249/249[0m 

In [6]:
y_pred_cnn = np.round(y_pred).flatten()
cnn_accuracy = accuracy_score(y_test, y_pred_cnn)
print('----------------')
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred_cnn))
print('----------------')
print('Classification report')
print(classification_report(y_test, y_pred_cnn))

----------------
Confusion matrix
[[4142  155]
 [ 135 4404]]
----------------
Classification report
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      4297
           1       0.97      0.97      0.97      4539

    accuracy                           0.97      8836
   macro avg       0.97      0.97      0.97      8836
weighted avg       0.97      0.97      0.97      8836

