In [1]:
# Mount Google Drive to access files (if needed)
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef, accuracy_score
from sklearn.svm import SVC
import gensim
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Define a function for text preprocessing (stemming and cleaning)
def preprocess_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    return text

# Load dataset (adjust path as per your file location)
news_dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dataset/fakeNewsData.csv')
news_dataset = news_dataset.fillna('')

# Combine author and title into content
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']
news_dataset['content'] = news_dataset['content'].apply(preprocess_text)

# Train Word2Vec model
wv_model = gensim.models.Word2Vec(sentences=news_dataset['content'], vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for a sentence
def word_averaging(wv_model, words):
    all_words, mean = set(), []

    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv_model.wv.key_to_index:
            mean.append(wv_model.wv[word])
            all_words.add(wv_model.wv.key_to_index[word])

    if not mean:
        return np.zeros(wv_model.vector_size,)

    mean = np.array(mean).mean(axis=0)
    return mean

# Function to compute Word2Vec vectors for a list of sentences
def word_averaging_list(wv_model, text_list):
    return np.vstack([word_averaging(wv_model, review) for review in text_list])

# Split dataset into train and test sets
X = word_averaging_list(wv_model, news_dataset['content'])
y = news_dataset['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel='rbf', random_state=1)
svm_model.fit(X_train, y_train)

# Predictions
y_pred = svm_model.predict(X_test)

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Calculate evaluation metrics

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")

# AUC Score
y_pred_prob = svm_model.decision_function(X_test)
auc_score = roc_auc_score(y_test, y_pred_prob)
print(f"AUC Score: {auc_score:.4f}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")

# Sensitivity (Recall)
recall = recall_score(y_test, y_pred)
print(f"Sensitivity (Recall): {recall:.4f}")

# Specificity
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
print(f"Specificity: {specificity:.4f}")

# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.4f}")

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

Confusion Matrix:
[[2773  375]
 [ 235 2857]]
AUC Score: 0.9621
Accuracy: 0.9022
Matthews Correlation Coefficient (MCC): 0.8054
Sensitivity (Recall): 0.9240
Specificity: 0.8809
Precision: 0.8840
F1 Score: 0.9035
