In [1]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api

# Colab-only step: mount Google Drive at /content/drive so the CSV paths used
# below (which all start with /content/drive) resolve.
from google.colab import drive
drive.mount('/content/drive')

# Configuration for data loading and sampling
DATA_DIR = "/content/drive/MyDrive/NLP FAKE NEWS DETECTION/nlp dfnds"
N_PER_CLASS = 3000   # number of rows drawn from each class
SAMPLE_SEED = 42     # seed shared by every sampling/shuffling step in this section

# Load datasets
fake = pd.read_csv(f"{DATA_DIR}/Fake.csv")
true = pd.read_csv(f"{DATA_DIR}/True.csv")

# Add category labels (1 = row came from Fake.csv, 0 = row came from True.csv)
fake['category'] = 1
true['category'] = 0

# Concatenate into one dataframe
df = pd.concat([fake, true]).reset_index(drop=True)

# Balance the dataset: draw the same number of rows from each class
df_majority = df[df['category'] == 1].sample(n=N_PER_CLASS, random_state=SAMPLE_SEED)
df_minority = df[df['category'] == 0].sample(n=N_PER_CLASS, random_state=SAMPLE_SEED)
df_balanced = pd.concat([df_majority, df_minority])

# Shuffle the dataframe
df = df_balanced.sample(frac=1, random_state=SAMPLE_SEED).reset_index(drop=True)

# Split the shuffled frame back into its two classes
df_majority = df[df['category'] == 1]
df_minority = df[df['category'] == 0]

# NOTE: both classes already contain N_PER_CLASS rows at this point, so this
# "undersampling" cannot change the class balance -- it only permutes the
# category-1 rows. It is kept because the resulting row order feeds the seeded
# train/test split further down; removing it would change which rows land in
# each split and therefore the reported metrics.
df_majority_undersampled = df_majority.sample(len(df_minority), random_state=SAMPLE_SEED)

# Combine both classes (category-1 rows first, then category-0 rows)
df = pd.concat([df_majority_undersampled, df_minority])

# Keep only the columns used downstream. The explicit .copy() makes `df` an
# independent frame, so adding the 'data' and 'vec' columns later is not a
# chained assignment on a slice (avoids SettingWithCopyWarning).
df = df[['text', 'category']].copy()

# Names of the pre-trained resources used by the preprocessing steps
SPACY_MODEL_NAME = "en_core_web_sm"
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'

# spaCy pipeline and the default stop words exposed by it
nlp = spacy.load(SPACY_MODEL_NAME)
stop_words = nlp.Defaults.stop_words

# ASCII punctuation characters, kept as one string
punctuations = string.punctuation

# Pre-trained Word2Vec embeddings, fetched through the gensim downloader
wv = api.load(WORD2VEC_MODEL_NAME)

# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    """Turn raw text into lowercase lemmas without stop words or punctuation.

    Args:
        sentence: Text to process. Anything that is not a ``str`` (for
            example a missing value) yields an empty list instead of being
            handed to the spaCy pipeline.

    Returns:
        list[str]: Lowercased, whitespace-stripped lemmas in document order,
        excluding empty strings, entries of ``stop_words`` and tokens made
        up solely of characters from ``punctuations``.
    """
    if not isinstance(sentence, str):
        return []

    # ``punctuations`` is a plain string, so ``lemma in punctuations`` would be
    # a substring test: multi-character tokens such as "..." or "--" are not
    # contiguous substrings of it and would slip through. Check per character.
    punct_chars = set(punctuations)

    mytokens = []
    for word in nlp(sentence):
        lemma = word.lemma_.lower().strip()
        # Skip whitespace-only tokens (empty after strip) and stop words
        if not lemma or lemma in stop_words:
            continue
        # Skip tokens that consist of punctuation characters only
        if all(ch in punct_chars for ch in lemma):
            continue
        mytokens.append(lemma)
    return mytokens

# Tokenize every article; the resulting token lists go into the 'data' column
df['data'] = df['text'].apply(spacy_tokenizer)


def _mean_word_vector(tokens):
    """Average the Word2Vec vectors of the tokens that are present in ``wv``.

    When none of the tokens is found in ``wv``, the mean is taken over a
    single all-zero vector of length ``wv.vector_size`` instead.
    """
    known_vectors = [wv[token] for token in tokens if token in wv]
    if not known_vectors:
        known_vectors = [np.zeros(wv.vector_size)]
    return np.mean(known_vectors, axis=0)


# One averaged embedding per document
df['vec'] = df['data'].apply(_mean_word_vector)

# Stack the per-document vectors into the feature matrix X; labels go into y
X = np.vstack(df['vec'])
y = df['category']

# Hold out 20% of the rows for testing (seeded split)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)


Mounted at /content/drive


In [2]:
# Single-step pipeline: a RandomForestClassifier with default hyperparameters,
# only the random seed is pinned
rf_step = ('rf', RandomForestClassifier(random_state=1))
model_pipeline_rf = Pipeline(steps=[rf_step])

# Fit on the training split, then predict labels for the test split
# (Pipeline.fit returns the pipeline itself, so the calls can be chained)
y_pred_rf = model_pipeline_rf.fit(X_train, y_train).predict(X_test)


In [3]:
# Score the test-set predictions; precision, recall and F1 use weighted averaging
weighted_avg = {'average': 'weighted'}
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, **weighted_avg)
recall_rf = recall_score(y_test, y_pred_rf, **weighted_avg)
f1_score_rf = f1_score(y_test, y_pred_rf, **weighted_avg)
classification_report_rf = classification_report(y_test, y_pred_rf)

# Report: header, one line per scalar metric (4 decimals), then the per-class table
print("Evaluation Metrics for RandomForestClassifier Model")
print("---------------------------------------------------")
metric_rows = (
    ("Accuracy", accuracy_rf),
    ("Precision", precision_rf),
    ("Recall", recall_rf),
    ("F1-score", f1_score_rf),
)
for metric_label, metric_value in metric_rows:
    print(f"{metric_label}: {metric_value:.4f}")
print("Classification Report:")
print(classification_report_rf)


Evaluation Metrics for RandomForestClassifier Model
---------------------------------------------------
Accuracy: 0.9100
Precision: 0.9109
Recall: 0.9100
F1-score: 0.9100
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       592
           1       0.93      0.89      0.91       608

    accuracy                           0.91      1200
   macro avg       0.91      0.91      0.91      1200
weighted avg       0.91      0.91      0.91      1200

