In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
import pandas as pd

# Load the two pre-cleaned headline corpora from the working directory.
real_path = "real_news_cleaned.csv"
fake_path = "fake_news_cleaned.csv"

real_df = pd.read_csv(real_path)
fake_df = pd.read_csv(fake_path)

# Report (rows, columns) for each frame so any class imbalance is visible up front.
for caption, frame in (("Real news:", real_df), ("Fake news:", fake_df)):
    print(caption, frame.shape)


Real news: (124, 2)
Fake news: (2834, 2)


In [10]:
# Bring both sources onto one schema: the headline column ("title" in the real
# set, "statement" in the fake set) is exposed as "text" in both frames.
#
# The text column is cast to str in BOTH frames (previously only real_df was
# cast). TfidfVectorizer expects string documents, so an un-cast NaN or numeric
# value in fake_df would only surface later as a vectorizer error.
# Note: astype(str) turns a missing value into the literal string "nan".
real_df = (
    real_df
    .rename(columns={"title": "text"})
    .astype({"text": str})
)
fake_df = (
    fake_df
    .rename(columns={"statement": "text"})
    .astype({"text": str})
)

# Keep only required columns
real_df = real_df[["text", "label"]]
fake_df = fake_df[["text", "label"]]



In [11]:
# Balance the classes by downsampling to the size of the smaller one.
# Using min() instead of assuming fake_df is the larger frame avoids the
# ValueError that DataFrame.sample raises when n exceeds the number of rows
# (replace=False). When fake_df has at least as many rows as real_df, this
# produces exactly the same frames as sampling fake_df to len(real_df).
n_per_class = min(len(real_df), len(fake_df))

fake_sampled = fake_df.sample(n=n_per_class, random_state=42)
# real_df is only sampled when it is the larger class; otherwise it is used
# as-is so its row order (and therefore the shuffle below) is unchanged.
real_sampled = (
    real_df.sample(n=n_per_class, random_state=42)
    if len(real_df) > n_per_class
    else real_df
)

df = pd.concat([real_sampled, fake_sampled], ignore_index=True)

# Shuffle so the two classes are interleaved, then rebuild a clean 0..n-1 index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Sanity check: both labels should report the same count.
print(df["label"].value_counts())



label
1    124
0    124
Name: count, dtype: int64


In [12]:

# Features are the raw headline strings; the target is the label column.
X = df["text"]
y = df["label"]

# Hold out 20% of the rows for evaluation. stratify=y keeps the label
# proportions the same in both partitions; with a dataset this small an
# unstratified split can leave the test fold noticeably unbalanced.
# random_state pins the split so reruns are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 198
Testing samples: 50


In [13]:


# TF-IDF settings, gathered in one place:
#   stop_words  - drop common English function words
#   min_df      - ignore terms that occur in fewer than 2 documents
#   max_df      - ignore terms that occur in more than 85% of documents
#   ngram_range - use both unigrams and bigrams as features
tfidf_params = {
    "stop_words": 'english',
    "min_df": 2,
    "max_df": 0.85,
    "ngram_range": (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse that fitted state to encode the held-out split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Vectorization done")

Vectorization done
