In [2]:
import pandas as pd
fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

In [3]:
fake["label"] = 0   # 0 = Fake
real["label"] = 1   # 1 = Real

In [4]:
data = pd.concat([fake, real], axis=0)
data = data.sample(frac=1).reset_index(drop=True)
print(data.head())

                                               title  \
0  Attorney General Sessions to talk publicly to ...   
1  Russia and China discuss coordination on North...   
2  HOUSE DEMOCRATS MAKE STUNNING Move To Implemen...   
3  REPUBLICAN TURNS TABLES ON FBI: Deputy Directo...   
4  Britain joins U.S. in blaming North Korea for ...   

                                                text          subject  \
0  WASHINGTON (Reuters) - U.S. Attorney General J...     politicsNews   
1  MOSCOW (Reuters) - The Russian Foreign Ministr...        worldnews   
2  While US citizens are outraged over the remova...         politics   
3   GCHQ director Robert Hannigan is stepping dow...  Government News   
4  LONDON (Reuters) - Britain said North Korea wa...        worldnews   

                 date  label  
0      June 11, 2017       1  
1  December 13, 2017       1  
2         Jan 1, 2016      0  
3         Apr 2, 2017      0  
4  December 19, 2017       1  


In [5]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words("english"))
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)
data['text'] = data['text'].apply(clean_text)
print(data['text'].head())

0    washington reuters us attorney general jeff se...
1    moscow reuters russian foreign ministry said w...
2    us citizens outraged removal christ pretty muc...
3    gchq director robert hannigan stepping two yea...
4    london reuters britain said north korea behind...
Name: text, dtype: object


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['text'])
y = data['label']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [10]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9298440979955457

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.93      4715
           1       0.93      0.92      0.93      4265

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980


Confusion Matrix:
 [[4416  299]
 [ 331 3934]]


In [11]:
import joblib
joblib.dump(model, 'fake_news_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']