In [2]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import gensim.downloader as api


In [4]:
# Load datasets
fake = pd.read_csv("/content/drive/MyDrive/NLP PROJECT/DATASET/Fake.csv")
true = pd.read_csv("/content/drive/MyDrive/NLP PROJECT/DATASET/True.csv")


In [5]:

# Add category labels
fake['category'] = 1
true['category'] = 0

In [6]:
# Concatenate datasets and reset index
df = pd.concat([fake, true]).reset_index(drop=True)

In [7]:
# Sample 3000 data points from each class
df_majority = df[df['category'] == 1].sample(n=3000, random_state=42)
df_minority = df[df['category'] == 0].sample(n=3000, random_state=42)


In [8]:
# Combine the sampled data and shuffle the dataframe
df_balanced = pd.concat([df_majority, df_minority]).sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# Undersample the majority class to balance the dataset
df_majority = df_balanced[df_balanced['category'] == 1]
df_minority = df_balanced[df_balanced['category'] == 0]
df_majority_undersampled = df_majority.sample(len(df_minority), random_state=42)
df = pd.concat([df_majority_undersampled, df_minority])


In [10]:

# Keep only necessary columns
df = df[['text', 'category']]

In [11]:
# Load SpaCy model and GloVe embeddings
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation
model = api.load('glove-twitter-100')




In [12]:

# Function to tokenize and preprocess text
def spacy_tokenizer(sentence):
    doc = nlp(sentence)
    mytokens = [word.lemma_.lower().strip() for word in doc]
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]
    return mytokens

In [13]:
# Apply tokenization to each row in 'text' column
df['tokens'] = df['text'].apply(spacy_tokenizer)



In [14]:
# Generate document vectors for each document
def document_vector(tokens, embeddings, dim):
    token_vectors = [embeddings[token] for token in tokens if token in embeddings]
    if not token_vectors:
        return np.zeros(dim)
    return np.mean(token_vectors, axis=0)

In [15]:
df['vec'] = df['tokens'].apply(lambda x: document_vector(x, model, model.vector_size))

In [16]:
# Prepare features (X) and labels (y)
X = np.vstack(df['vec'])
y = df['category']


In [17]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)


In [18]:
# Create a pipeline with a Decision Tree classifier
model_pipeline_dt = Pipeline([
    ('dt', DecisionTreeClassifier(random_state=42))
])

In [19]:
# Train the model
model_pipeline_dt.fit(X_train, y_train)

In [20]:
# Predict on the test set
y_pred_dt = model_pipeline_dt.predict(X_test)

In [21]:
# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt, average='weighted')
recall_dt = recall_score(y_test, y_pred_dt, average='weighted')
f1_score_dt = f1_score(y_test, y_pred_dt, average='weighted')
classification_report_dt = classification_report(y_test, y_pred_dt)


In [22]:
# Print evaluation metrics
print("Evaluation Metrics for Decision Tree Model")
print("------------------------------------------------")
print(f"Accuracy: {accuracy_dt:.4f}")
print(f"Precision: {precision_dt:.4f}")
print(f"Recall: {recall_dt:.4f}")
print(f"F1-score: {f1_score_dt:.4f}")
print("Classification Report:")
print(classification_report_dt)

Evaluation Metrics for Decision Tree Model
------------------------------------------------
Accuracy: 0.8442
Precision: 0.8443
Recall: 0.8442
F1-score: 0.8441
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       592
           1       0.84      0.86      0.85       608

    accuracy                           0.84      1200
   macro avg       0.84      0.84      0.84      1200
weighted avg       0.84      0.84      0.84      1200



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
