In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
import numpy as np

In [3]:
fake = pd.read_csv("fake.csv")
true = pd.read_csv("true.csv")

In [4]:
fake['category']=1
true['category']=0

df=pd.concat([fake,true]).reset_index(drop=True)

In [5]:
# Separate the majority and minority classes
df_majority = df[df['category'] == 1]
df_minority = df[df['category'] == 0]

# Sample 3000 data points from each class
df_majority_sampled = df_majority.sample(n=3000, random_state=42)
df_minority_sampled = df_minority.sample(n=3000, random_state=42)

# Combine the sampled data
df_balanced = pd.concat([df_majority_sampled, df_minority_sampled])

# Shuffle the dataframe
df= df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset if needed
# df_balanced.to_csv('balanced_dataset.csv', index=False)

In [6]:
df_majority = df[df['category'] == 1]
df_minority = df[df['category'] == 0]

# Undersample the majority class
df_majority_undersampled = df_majority.sample(len(df_minority), random_state=42)

# Combine the undersampled majority class with the minority class
df = pd.concat([df_majority_undersampled, df_minority])

In [7]:
df=df[['text','category']]

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['category'], test_size=0.2, random_state=42)

In [9]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [10]:
# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [11]:
# Word2Vec Vectorization
w2v_model = Word2Vec(sentences=X_train.str.split(), vector_size=100, window=5, min_count=1, workers=4)
# Function to average Word2Vec vectors
def document_vector(tokens, model):
    vec = np.zeros(model.vector_size)
    count = 0
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    if count > 0:
        vec /= count
    return vec

# Create feature vectors for train and test data
X_train_w2v = np.array([document_vector(tokens, w2v_model) for tokens in X_train])
X_test_W2V = np.array([document_vector(tokens, w2v_model) for tokens in X_test])

In [12]:
# Train Random Forest Models
rf_tfidf = RandomForestClassifier(random_state=42)
rf_count = RandomForestClassifier(random_state=42)
rf_w2v = RandomForestClassifier(random_state=42)
rf_glove = RandomForestClassifier(random_state=42)

rf_tfidf.fit(X_train_tfidf, y_train)
rf_count.fit(X_train_count, y_train)
rf_w2v.fit(X_train_w2v, y_train)


In [13]:
# Make Predictions and Evaluate
y_pred_tfidf = rf_tfidf.predict(X_test_tfidf)
y_pred_count = rf_count.predict(X_test_count)
y_pred_w2v = rf_w2v.predict(X_test_W2V)


accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
accuracy_count = accuracy_score(y_test, y_pred_count)
accuracy_w2v = accuracy_score(y_test, y_pred_w2v)


report_tfidf = classification_report(y_test, y_pred_tfidf)
report_count = classification_report(y_test, y_pred_count)
report_w2v = classification_report(y_test, y_pred_w2v)


print(f'TF-IDF Accuracy: {accuracy_tfidf}')
print('TF-IDF Classification Report:')
print(report_tfidf)

print(f'Count Vectorization Accuracy: {accuracy_count}')
print('Count Vectorization Classification Report:')
print(report_count)

print(f'Word2Vec Accuracy: {accuracy_w2v}')
print('Word2Vec Classification Report:')
print(report_w2v)

TF-IDF Accuracy: 0.9758333333333333
TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       587
           1       0.98      0.97      0.98       613

    accuracy                           0.98      1200
   macro avg       0.98      0.98      0.98      1200
weighted avg       0.98      0.98      0.98      1200

Count Vectorization Accuracy: 0.9775
Count Vectorization Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       587
           1       0.99      0.96      0.98       613

    accuracy                           0.98      1200
   macro avg       0.98      0.98      0.98      1200
weighted avg       0.98      0.98      0.98      1200

Word2Vec Accuracy: 0.7366666666666667
Word2Vec Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.77      0.74       587
           1       0.