### 🚀 Task 1: Load Datasets

In [None]:
import pandas as pd
import nltk
nltk.download("stopwords")

# Load the training dataset
train_path = "fake_train_2024_2025.csv"
df_train = pd.read_csv(train_path)

# Load the testing dataset
test_path = "fake_test_2024_2025.csv"
df_test = pd.read_csv(test_path)

# Preview both datasets. A bare `df.head()` is rendered only when it is the
# last expression of a cell, so two bare calls followed by a print show
# nothing. `display` (made available by the IPython/Jupyter kernel without an
# import) renders each preview explicitly.
display(df_train.head(), df_test.head())

print("✅ Training and Testing datasets loaded successfully!")


### 🚀 Task 2: Preprocessing the Text

In [None]:
import spacy
import re
from nltk.corpus import stopwords

# Stop-word lookup set built from the NLTK "english" list
stop_words = set(stopwords.words("english"))

# The `en_core_web_sm` spaCy pipeline; preprocess_text runs every document through it
nlp = spacy.load("en_core_web_sm")

# Preprocessing function
def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, and the result is run through the module-level
    spaCy pipeline `nlp`. A token contributes its lemma only when it is purely
    alphabetic and its surface text is not in the module-level `stop_words`.

    Args:
        text: The raw document as a `str`.

    Returns:
        The kept lemmas joined by single spaces (an empty string if none are kept).
    """
    lowered = text.lower()
    cleaned = re.sub(r"[^\w\s]", "", lowered)

    lemmas = []
    for tok in nlp(cleaned):
        if not tok.is_alpha:
            continue
        # Stop-word membership is checked on the token text, not on its lemma.
        if tok.text in stop_words:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

# Apply preprocessing to the "text" column of each split (values are cast to
# str first because preprocess_text calls str methods on its input).
df_train["processed_text"] = df_train["text"].astype(str).apply(preprocess_text)
df_test["processed_text"] = df_test["text"].astype(str).apply(preprocess_text)

# Show a sample of the raw text next to its cleaned form. `display` is supplied
# by the IPython/Jupyter kernel, so this works on a fresh kernel without relying
# on an external `tools` helper module that the notebook never imports.
print("Preprocessed Fake News Training")
display(df_train[["text", "processed_text"]].head())
print("Preprocessed Fake News Testing")
display(df_test[["text", "processed_text"]].head())

print("✅ Text preprocessing completed!")


### 🚀 Task 3: Train a Doc2Vec Model

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Wrap every preprocessed document as a TaggedDocument. Each tag is the string
# form of a running index: train rows are numbered from 0 and test rows continue
# from len(df_train), so no tag is shared between the two splits.
train_tagged = [
    TaggedDocument(words=text.split(), tags=[str(doc_id)])
    for doc_id, text in enumerate(df_train["processed_text"])
]
test_tagged = [
    TaggedDocument(words=text.split(), tags=[str(doc_id)])
    for doc_id, text in enumerate(df_test["processed_text"], start=len(df_train))
]

# This model is fitted on the train and test documents together
all_tagged = [*train_tagged, *test_tagged]

# Build the vocabulary, then train the Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=300, window=4, min_count=2, workers=4, epochs=20)
doc2vec_model.build_vocab(all_tagged)
doc2vec_model.train(
    all_tagged,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

print("✅ Doc2Vec model trained successfully on the full corpus!")


### 🚀 Task 4: Train Logistic Regression Using Doc2Vec Embeddings

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Look up the stored vector for every document tag. Train documents use the
# tags "0" .. str(len(df_train) - 1); test documents continue from len(df_train).
X_train = [doc2vec_model.dv[str(doc_id)] for doc_id in range(len(df_train))]
X_test = [
    doc2vec_model.dv[str(doc_id)]
    for doc_id in range(len(df_train), len(df_train) + len(df_test))
]

# Target labels for each split
y_train, y_test = df_train["label"], df_test["label"]

# Fit the classifier on the train vectors (`fit` returns the estimator itself)
lr_model = LogisticRegression().fit(X_train, y_train)

# Predict the test split and report accuracy against its labels
y_pred = lr_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Logistic Regression Model Accuracy: {accuracy:.4f}")


### 🚀 Task 5: Train Doc2Vec Only on Train-Corpus

In [None]:
# Fit a second Doc2Vec model that is given the training documents only
doc2vec_train_only = Doc2Vec(vector_size=300, window=4, min_count=2, workers=4, epochs=20)
doc2vec_train_only.build_vocab(train_tagged)
doc2vec_train_only.train(
    train_tagged,
    total_examples=doc2vec_train_only.corpus_count,
    epochs=doc2vec_train_only.epochs,
)

# Train vectors are read back from the model by tag. Test documents were not
# part of training, so their vectors are inferred from their tokens instead.
X_train_only = [doc2vec_train_only.dv[str(doc_id)] for doc_id in range(len(df_train))]
X_test_only = [
    doc2vec_train_only.infer_vector(text.split())
    for text in df_test["processed_text"]
]

# Fit the classifier on the train vectors (`fit` returns the estimator itself)
lr_model_train_only = LogisticRegression().fit(X_train_only, y_train)

# Predict the test split and report accuracy against its labels
y_pred_train_only = lr_model_train_only.predict(X_test_only)
accuracy_train_only = accuracy_score(y_test, y_pred_train_only)
print(f"✅ Logistic Regression Model Accuracy (Train-Only): {accuracy_train_only:.4f}")


### 🚀 Task 6: Train Word2Vec and Average Word Embeddings

In [None]:
from gensim.models import Word2Vec
import numpy as np

# One token list per training document (whitespace split of the preprocessed text)
sentences = list(map(str.split, df_train["processed_text"]))

# Fit Word2Vec on the training token lists
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=300,
    window=4,
    min_count=2,
    workers=4,
)

# Function to compute document embeddings by averaging word vectors
def document_embedding(text, model):
    """Average the word vectors of the words in `text` that the model knows.

    Args:
        text: Whitespace-separated words.
        model: Object exposing `wv` (supports `word in wv` and `wv[word]`) and
            `vector_size`.

    Returns:
        The element-wise mean of the vectors of all words found in `model.wv`,
        or a zero vector of length `model.vector_size` when none are found.
    """
    known_vectors = []
    for token in text.split():
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # Nothing to average: fall back to an all-zero vector.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged word-vector embedding per document, collected into an array per split
X_train_w2v = np.array(
    [document_embedding(text, word2vec_model) for text in df_train["processed_text"]]
)
X_test_w2v = np.array(
    [document_embedding(text, word2vec_model) for text in df_test["processed_text"]]
)

# Fit the classifier on the train embeddings (`fit` returns the estimator itself)
lr_model_w2v = LogisticRegression().fit(X_train_w2v, y_train)

# Predict the test split and report accuracy against its labels
y_pred_w2v = lr_model_w2v.predict(X_test_w2v)
accuracy_w2v = accuracy_score(y_test, y_pred_w2v)
print(f"✅ Logistic Regression Model Accuracy (Word2Vec Averaging): {accuracy_w2v:.4f}")
