**FAKE NEWS DETECTION USING NLP**

**STEP 1: IMPORTING THE LIBRARIES**

In [14]:
# Import necessary libraries for the project
import numpy as np                # NumPy for numerical operations
import pandas as pd               # Pandas for data manipulation
import matplotlib.pyplot as plt   # Matplotlib for data visualization
import seaborn as sns             # Seaborn for enhanced data visualization
import nltk                       # Natural Language Toolkit for NLP
from nltk.corpus import stopwords # NLTK's stopwords for text preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer  # Scikit-learn's TF-IDF vectorizer
from sklearn.model_selection import train_test_split        # Scikit-learn for data splitting
from sklearn.naive_bayes import MultinomialNB                # Scikit-learn's Naive Bayes classifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Scikit-learn's metrics
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**STEP 2: DATA LOADING**

In [2]:
# Load the dataset from Kaggle
fake_news = pd.read_csv("/Fake.csv")
real_news = pd.read_csv("/True.csv")


**STEP 3: DATA PREPROCESSING**

In [3]:
# Combine the real and fake news datasets
fake_news['label'] = 1
real_news['label'] = 0
data = pd.concat([fake_news, real_news])

# Shuffle the data
data = data.sample(frac=1).reset_index(drop=True)


In [6]:
# Text preprocessing
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Join the tokens back into a string
    text = ' '.join(tokens)
    return text

data['text'] = data['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
print(data['text'])

0        claiming least racist person , donald trump li...
1        dunkin donuts american global donut company co...
2        london ( reuters ) - britain made substantive ...
3        washington ( reuters ) - republican party ’ tw...
4        anyone else wondering cop-hating , racist , be...
                               ...                        
44893    30 years , donald trump built great negotiator...
44894    washington ( reuters ) - united states monday ...
44895    sarah palin marked return national stage part ...
44896    seoul ( reuters ) - south korean president moo...
44897    samarkand , uzbekistan ( reuters ) - senior of...
Name: text, Length: 44898, dtype: object


**STEP 4: FEATURE EXTRACTION**

In [9]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['label']


**STEP 5: MODEL TRAINING AND EVALUATION**

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classification model (Naive Bayes)
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_rep)


Accuracy: 0.9359688195991092
Confusion Matrix:
 [[3983  301]
 [ 274 4422]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.93      4284
           1       0.94      0.94      0.94      4696

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



**LOGISTIC REGRESSION AND NEURAL NETWORKS**

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [23]:
# Load the dataset from Kaggle
fake_data = pd.read_csv("/Fake.csv")
true_data = pd.read_csv("/True.csv")

In [24]:
fake_data['label'] = 0  # 0 for fake news
true_data['label'] = 1  # 1 for true news
combined_data = pd.concat([fake_data, true_data], ignore_index=True)
combined_data['text'] = combined_data['title'] + " " + combined_data['text']


In [25]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_data['text'])


In [26]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, combined_data['label'], test_size=0.2, random_state=42)


In [27]:
# Logistic Regression Model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)


In [28]:
# Model Training (Neural Network)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(combined_data['text'])
X_train_nn = tokenizer.texts_to_sequences(combined_data['text'])
X_train_nn = pad_sequences(X_train_nn, maxlen=100)

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_nn, combined_data['label'], epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d786af8b700>

In [29]:
y_pred = logistic_regression_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)


In [31]:
print(f"Logistic Regression Accuracy: {accuracy}")
print(f"Logistic Regression Precision: {precision}")
print(f"Logistic Regression Recall: {recall}")
print(f"Logistic Regression F1-Score: {f1}")
print(f"Logistic Regression ROC-AUC: {roc_auc}")


Logistic Regression Accuracy: 0.9979286193847656
Logistic Regression Precision: 0.9866008462623413
Logistic Regression Recall: 0.9882269837532376
Logistic Regression F1-Score: 0.9874132455005294
Logistic Regression ROC-AUC: 0.9880919410631812


In [30]:
X_test_nn = tokenizer.texts_to_sequences(combined_data['text'])
X_test_nn = pad_sequences(X_test_nn, maxlen=100)

loss, accuracy = model.evaluate(X_test_nn, combined_data['label'])




In [32]:
print(f"Neural Network Accuracy: {accuracy}")


Neural Network Accuracy: 0.9979286193847656
