In [5]:
# Import necessary libraries first
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load and prepare datasets
df_true = pd.read_csv("../data/True.csv")
df_fake = pd.read_csv("../data/Fake.csv")
df_true["label"] = 1
df_fake["label"] = 0
df = pd.concat([df_true, df_fake], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Display dataset information
print(df.columns)
print(df["label"].value_counts())

# Keep only relevant columns and preprocess
df = df[["text", "label"]]
df = df.dropna()
df["text"] = df["text"].str.lower()
df["text"] = df["text"].str.translate(str.maketrans('', '', string.punctuation))

# Create TF-IDF features and split data
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["text"])
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

# Define and train model (this should come BEFORE evaluation)
model = LogisticRegression()
model.fit(X_train, y_train)

# Now evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the model and vectorizer
joblib.dump(model, "fakenews_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def predict_news(text):
    processed = preprocess(text)
    vector = tfidf.transform([processed])
    prediction = model.predict(vector)
    return "real" if prediction[0] == 1 else "fake"

test_article = "The World Health Organization (WHO) has issued new guidelines urging countries to ramp up vaccinations in response to a resurgence of measles in several regions. The report highlights a 79% increase in global cases compared to the previous year, largely due to disruptions in healthcare during the COVID-19 pandemic. WHO officials emphasize the importance of catch-up immunization campaigns, especially in areas with low vaccination coverage"
print(predict_news(test_article))

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')
label
0    23481
1    21417
Name: count, dtype: int64
Accuracy: 0.9855233853006682

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      4669
           1       0.98      0.99      0.98      4311

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

real
