In [1]:
import pandas as pd
import string
import joblib
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Download stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# 1. Load the dataset
df = pd.read_csv('../data/news.csv')
print("Data shape:", df.shape)
print(df.head())

# 2. Clean the text
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['clean_text'] = df['text'].apply(clean_text)

# 3. Prepare input (X) and output (y)
X = df['clean_text']
y = df['label']

# 4. Convert text to numeric using TF-IDF
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_df=0.7)
X_vec = vectorizer.fit_transform(X)

# 5. Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# 6. Train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# 7. Test the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("✅ Model accuracy:", accuracy)

from sklearn.metrics import classification_report, confusion_matrix

print("\n📄 Classification Report:")
print(classification_report(y_test, y_pred))

print("\n🧮 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# 8. Save model and vectorizer
joblib.dump(model, '../model/fake_news_model.pkl')
joblib.dump(vectorizer, '../model/tfidf_vectorizer.pkl')
print("✅ Model and vectorizer saved.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data shape: (44898, 3)
                                               title  \
0  SHE’S B-A-A-A-CKKKK!! Hillary Makes Crazy Vide...   
1  BOOM! CAMPING WORLD CEO TELLS TRUMP SUPPORTERS...   
2  Factbox: Mexican tycoon Carlos Slim and the U....   
3  South Korea fears further missile advances by ...   
4  Eastern Congo rebels aim to march on Kinshasa:...   

                                                text label  
0  Hey President Trump how s that Hillary email i...  FAKE  
1  Marcus Lemonis has no problem with some custom...  FAKE  
2  MEXICO CITY (Reuters) - Republican presidentia...  REAL  
3  SEOUL (Reuters) - North Korea may conduct addi...  REAL  
4  GOMA, Democratic Republic of Congo (Reuters) -...  REAL  
✅ Model accuracy: 0.9337416481069042

📄 Classification Report:
              precision    recall  f1-score   support

        FAKE       0.93      0.94      0.94      4558
        REAL       0.94      0.93      0.93      4422

    accuracy                           0.93    