In [3]:
import pandas as pd

# Read the two PolitiFact CSV exports and tag every row with its class
# (1 = real, 0 = fake) as it is loaded.
# NOTE(review): both paths are absolute and machine-specific; point them at
# wherever the CSV files live before running.
real_news = pd.read_csv("C://Users//shrey//Downloads//politifact_real.csv").assign(label=1)
fake_news = pd.read_csv("C://Users//shrey//Downloads//politifact_fake.csv").assign(label=0)

# Stack real on top of fake and renumber the rows 0..n-1
data = pd.concat([real_news, fake_news], ignore_index=True)


In [4]:
# Randomly permute all rows (fixed seed, so the order is repeatable) and
# renumber them so the index no longer reflects the original real/fake order.
data = (
    data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [6]:
import re
from sklearn.model_selection import train_test_split

# Define the preprocessing function
def preprocess_text(text):
    """Normalise a headline before vectorisation.

    Every non-word character (anything that is not a letter, digit or
    underscore) is replaced by a space, runs of whitespace are collapsed to a
    single space, and the result is lower-cased. Digits are kept.

    Parameters
    ----------
    text : str, None, float or other scalar
        Raw title. ``None`` and ``NaN`` (what pandas yields for an empty CSV
        cell) are treated as an empty title; any other non-string value is
        converted with ``str`` first.

    Returns
    -------
    str
        The cleaned, lower-cased text (``""`` for missing input).
    """
    # Missing value: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'\W', ' ', text)   # punctuation/symbols -> space
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs
    return text.lower()

# Clean every title with preprocess_text (overwrites the 'title' column)
data['title'] = data['title'].apply(preprocess_text)

# Features are the cleaned titles; the target is the 0/1 label
X = data['title']  # Features (preprocessed titles)
y = data['label']  # Labels (real or fake news)

# Hold out 20% of the rows for testing. stratify=y keeps the real/fake ratio
# the same in both parts, matching the later split cell in this notebook so
# the two cells produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:
# Inputs are the cleaned titles, targets are the 0/1 labels
X, y = data['title'], data['label']

# 80/20 train/test partition with a fixed seed; stratifying on y keeps the
# class proportions identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the titles to TF-IDF vectors, with the vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)
# Fit on the training titles only and vectorise them in the same call.
X_train_tfidf = tfidf.fit_transform(X_train)
# The test titles are only transformed with the already-fitted vectoriser,
# so nothing is learned from the held-out data.
X_test_tfidf = tfidf.transform(X_test)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a default-configured logistic regression on the TF-IDF training matrix.
# fit() returns the estimator, so construction and training are chained.
lr_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score on the held-out titles: accuracy, per-class report, confusion matrix
y_pred = lr_model.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Logistic Regression Accuracy: 0.7830188679245284
              precision    recall  f1-score   support

           0       0.84      0.59      0.69        87
           1       0.76      0.92      0.83       125

    accuracy                           0.78       212
   macro avg       0.80      0.75      0.76       212
weighted avg       0.79      0.78      0.77       212

[[ 51  36]
 [ 10 115]]


In [10]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier trained on the TF-IDF matrix.
# fit() returns the estimator, so construction and training are chained.
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Score on the held-out titles: accuracy, per-class report, confusion matrix
y_pred = svm_model.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


SVM Accuracy: 0.8207547169811321
              precision    recall  f1-score   support

           0       0.84      0.70      0.76        87
           1       0.81      0.90      0.86       125

    accuracy                           0.82       212
   macro avg       0.82      0.80      0.81       212
weighted avg       0.82      0.82      0.82       212

[[ 61  26]
 [ 12 113]]


In [11]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the TF-IDF matrix.
# fit() returns the estimator, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score on the held-out titles: accuracy, per-class report, confusion matrix
y_pred = nb_model.predict(X_test_tfidf)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Naive Bayes Accuracy: 0.8160377358490566
              precision    recall  f1-score   support

           0       0.88      0.64      0.74        87
           1       0.79      0.94      0.86       125

    accuracy                           0.82       212
   macro avg       0.83      0.79      0.80       212
weighted avg       0.83      0.82      0.81       212

[[ 56  31]
 [  8 117]]


In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# masks and batch shuffling are repeatable from run to run (as far as the
# backend allows).
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# One vocabulary size shared by the tokenizer and the embedding table, so the
# two can never drift apart.
vocab_size = 10000

# Tokenize the text: the word index is learned from the training titles only
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (or truncate) every sequence to the same length
maxlen = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# Build the LSTM model: embedding -> single LSTM layer -> sigmoid output.
# `input_length` is not passed to Embedding: it is deprecated in current Keras
# and the sequence length is taken from the data on the first call.
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile and train; the last 10% of the training rows are used for validation
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_split=0.1)

# Evaluate: threshold the sigmoid output at 0.5 and flatten the (n, 1) column
# returned by predict() to a 1-D vector, the same shape as y_test.
y_pred = (lstm_model.predict(X_test_pad) > 0.5).astype("int32").ravel()
print("LSTM Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))




Epoch 1/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 519ms/step - accuracy: 0.5449 - loss: 0.6684 - val_accuracy: 0.7176 - val_loss: 0.6333
Epoch 2/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 508ms/step - accuracy: 0.7570 - loss: 0.5977 - val_accuracy: 0.7176 - val_loss: 0.6065
Epoch 3/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 510ms/step - accuracy: 0.8087 - loss: 0.4867 - val_accuracy: 0.7412 - val_loss: 0.5667
Epoch 4/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 494ms/step - accuracy: 0.8759 - loss: 0.3530 - val_accuracy: 0.7765 - val_loss: 0.5120
Epoch 5/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 411ms/step - accuracy: 0.9437 - loss: 0.1664 - val_accuracy: 0.7882 - val_loss: 0.5454
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 169ms/step
LSTM Accuracy: 0.8301886792452831
              precision    recall  f1-score   support

           0       0.88      0.6

In [13]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the TF-IDF matrix. This repeats the earlier
# Naive Bayes cell; only the name of the predictions variable differs.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score on the held-out titles, keeping the predictions under their own name
y_pred_nb = nb_model.predict(X_test_tfidf)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))
print(confusion_matrix(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.8160377358490566
              precision    recall  f1-score   support

           0       0.88      0.64      0.74        87
           1       0.79      0.94      0.86       125

    accuracy                           0.82       212
   macro avg       0.83      0.79      0.80       212
weighted avg       0.83      0.82      0.81       212

[[ 56  31]
 [  8 117]]


In [16]:
!pip install tf-keras

Collecting tf-keras
  Obtaining dependency information for tf-keras from https://files.pythonhosted.org/packages/8a/ed/e08afca471299b04a34cd548e64e89d0153eda0e6cf9b715356777e24774/tf_keras-2.18.0-py3-none-any.whl.metadata
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting tensorflow<2.19,>=2.18 (from tf-keras)
  Obtaining dependency information for tensorflow<2.19,>=2.18 from https://files.pythonhosted.org/packages/cf/24/271e77c22724f370c24c705f394b8035b4d27e4c2c6339f3f45ab9b8258e/tensorflow-2.18.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow-2.18.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow<2.19,>=2.18->tf-keras)
  Obtaining dependency information for tensorflow-intel==2.18.0 from https://files.pythonhosted.org/packages/76/ad/fa6c508a15ff79cb5409294c293388e0999b7d480f84b65e4287277434fe/tensorflow_intel-2.18.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow_intel-2.18.0-cp311-cp311

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\shrey\\anaconda3\\Lib\\site-packages\\~umpy\\core\\_multiarray_tests.cp311-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [None]:
!pip install --upgrade transformers
!pip install --upgrade torch




In [None]:
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification

# The optimizer, loss and metric must come from the same Keras implementation
# the transformers TF model is built on. Prefer the tf-keras package that this
# notebook installs; fall back to the Keras bundled with TensorFlow.
# NOTE(review): assumes transformers uses tf_keras when it is installed - confirm.
try:
    import tf_keras as keras
except ImportError:
    from tensorflow import keras

# Tokenize input data (this rebinds `tokenizer`, replacing the Keras Tokenizer
# created in the LSTM cell)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train_tok = tokenizer(list(X_train), max_length=128, truncation=True, padding=True, return_tensors='tf')
X_test_tok = tokenizer(list(X_test), max_length=128, truncation=True, padding=True, return_tensors='tf')

# Load BERT model with a single output unit. The classification head returns
# raw logits (no sigmoid), so the loss is told to expect logits and the
# accuracy metric thresholds at 0, which corresponds to a probability of 0.5.
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
bert_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=2e-5),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

# Train BERT. The whole encoding (input ids, attention mask, token type ids)
# is passed so that padding positions are masked out; labels are given as a
# float column matching the (n, 1) logits.
y_train_bert = y_train.to_numpy().astype("float32").reshape(-1, 1)
bert_model.fit(dict(X_train_tok), y_train_bert, epochs=2, batch_size=16, validation_split=0.1)

# Evaluate BERT: logit > 0 is equivalent to sigmoid(logit) > 0.5. np.asarray
# accepts either a NumPy array or a tensor, so no .numpy() call is needed.
bert_logits = np.asarray(bert_model.predict(dict(X_test_tok)).logits)
y_pred_bert = (bert_logits > 0.0).astype("int32").ravel()
print("BERT Accuracy:", accuracy_score(y_test, y_pred_bert))
print(classification_report(y_test, y_pred_bert))
print(confusion_matrix(y_test, y_pred_bert))
