# In [None]:
import pandas as pd

# Read both CSV files and tag every row with its class:
# rows from True.csv get label 1, rows from Fake.csv get label 0.
df_true = pd.read_csv("../data/True.csv").assign(label=1)
df_fake = pd.read_csv("../data/Fake.csv").assign(label=0)

# Stack the two frames, shuffle all rows with a fixed seed, and renumber
# the index from 0.
df = (
    pd.concat([df_true, df_fake], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Quick sanity check: available columns and class balance.
print(df.columns)
print(df["label"].value_counts())

import string

# Keep only relevant columns and drop rows with missing values.
# The explicit copy makes `df` an independent frame, so the column
# assignment below never writes into a slice of the original data
# (which can raise SettingWithCopyWarning on pandas without copy-on-write).
df = df[["text", "label"]].dropna().copy()

# Lowercase, then remove ASCII punctuation. The translation table is built
# once and reused for the whole column.
punctuation_table = str.maketrans('', '', string.punctuation)
df["text"] = df["text"].str.lower().str.translate(punctuation_table)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split the raw text BEFORE fitting the vectorizer. Fitting TF-IDF on the
# full corpus would let test documents influence the vocabulary and IDF
# weights (data leakage), which inflates the evaluation scores.
# With the same random_state and the same number of rows, this produces the
# same train/test partition as splitting the vectorized matrix.
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights from the training text only, then apply
# the fitted vectorizer unchanged to the test text.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix, kept so that any code referencing `X` keeps working.
# It is not used for training or evaluation.
X = tfidf.transform(df["text"])

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Train first: the evaluation below needs a fitted `model`. Running the
# prediction before this point fails with NameError when the script is
# executed top to bottom.
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Pin the class order explicitly so the tick labels are guaranteed to match
# the matrix rows/columns: index 0 is label 0 (fake), index 1 is label 1 (true).
class_ids = [0, 1]
class_names = ['Fake', 'True']
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Save both model and vectorizer
joblib.dump(model, "fakenews_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

# Reload from disk so the rest of the script uses the persisted artifacts.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = joblib.load("fakenews_model.pkl")
tfidf = joblib.load("tfidf_vectorizer.pkl")

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess(text):
    """Clean raw text the same way the training text in this file is cleaned.

    The training column is only lowercased and stripped of ASCII punctuation
    before it is passed to the TF-IDF vectorizer. Inference input must go
    through exactly the same steps; removing stopwords or lemmatizing here
    would produce tokens the vectorizer and model were never fitted on
    (train/serve skew) and degrade predictions.

    Args:
        text: Raw input string.

    Returns:
        The lowercased string with every character in ``string.punctuation``
        removed.
    """
    lowered = text.lower()
    return lowered.translate(str.maketrans('', '', string.punctuation))

def predict_news(text):
    """Classify a single piece of news text.

    The text is cleaned with ``preprocess``, vectorized with the module-level
    ``tfidf`` vectorizer, and passed to the module-level ``model``.

    Args:
        text: Raw news text to classify.

    Returns:
        "real" when the model predicts label 1, otherwise "fake".
    """
    features = tfidf.transform([preprocess(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "real"
    return "fake"

# Smoke test: classify one sample paragraph and print the predicted class.
sample_article = "The World Health Organization (WHO) has issued new guidelines urging countries to ramp up vaccinations in response to a resurgence of measles in several regions. The report highlights a 79% increase in global cases compared to the previous year, largely due to disruptions in healthcare during the COVID-19 pandemic. WHO officials emphasize the importance of catch-up immunization campaigns, especially in areas with low vaccination coverage"
print(predict_news(sample_article))


