In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [9]:
import pandas as pd

real_df = pd.read_csv("real_news_cleaned.csv")
fake_df = pd.read_csv("fake_news_cleaned.csv")

real_df = real_df.rename(columns={"title": "text"})
fake_df = fake_df.rename(columns={"statement": "text"})

df = pd.concat([real_df, fake_df], ignore_index=True)



In [10]:
real_df = df[df["label"] == 1]
fake_df = df[df["label"] == 0]

fake_df_sampled = fake_df.sample(
    n=len(real_df),
    random_state=42
)

df = pd.concat([real_df, fake_df_sampled])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)




In [11]:

X = df["text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 198
Testing samples: 50


In [22]:


vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=8000,
    min_df=2,
    max_df=0.85,
    ngram_range=(1,2)
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Vectorization done")

Vectorization done


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [24]:
lr_model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced"
)

lr_model.fit(X_train_vec, y_train)



In [25]:
y_pred = lr_model.predict(X_test_vec)


In [26]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.94

Confusion Matrix:
 [[26  2]
 [ 1 21]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.95        28
           1       0.91      0.95      0.93        22

    accuracy                           0.94        50
   macro avg       0.94      0.94      0.94        50
weighted avg       0.94      0.94      0.94        50



In [32]:
def predict_news(text):
    vec = vectorizer.transform([text])
    pred = lr_model.predict(vec)[0]
    prob = lr_model.predict_proba(vec)[0]

    label = "Real News" if pred == 1 else "Fake News"

    print(f"\nText: {text}")
    print(f"Probabilities [Fake, Real]: {prob}")
    print(f"Prediction: {label}")

    return label






In [33]:
df["label"].value_counts()



Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,124
0,124


In [34]:
print(lr_model.classes_)


[0 1]


In [35]:
predict_news("Aliens are controlling elections using 5G towers")
predict_news("Government announces education policy for students")




Text: Aliens are controlling elections using 5G towers
Probabilities [Fake, Real]: [0.50786005 0.49213995]
Prediction: Fake News

Text: Government announces education policy for students
Probabilities [Fake, Real]: [0.68491993 0.31508007]
Prediction: Fake News


'Fake News'

In [36]:
predict_news("Aliens are controlling elections using 5G towers")
predict_news("Government announces education policy for students")
predict_news("NASA confirms water on Mars after new study")



Text: Aliens are controlling elections using 5G towers
Probabilities [Fake, Real]: [0.50786005 0.49213995]
Prediction: Fake News

Text: Government announces education policy for students
Probabilities [Fake, Real]: [0.68491993 0.31508007]
Prediction: Fake News

Text: NASA confirms water on Mars after new study
Probabilities [Fake, Real]: [0.38939901 0.61060099]
Prediction: Real News


'Real News'