In [1]:
# Importing the libraries
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [2]:
# Load the pre-cleaned dataset, then separate the article text (model input)
# from its label (prediction target).
news = pd.read_csv('news.csv')
X, y = news['text'], news['label']

In [3]:
# Splitting the data into train and test sets (80% train / 20% test).
# A fixed random_state makes the shuffle deterministic, so the reported metrics
# and the pickled model are reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [4]:
# Two-stage text classifier:
#   1. 'tfidf'   - turns raw text into TF-IDF weighted term features, dropping
#                  scikit-learn's built-in English stop words
#   2. 'nbmodel' - Multinomial Naive Bayes fitted on those features
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nbmodel', MultinomialNB()),
])

In [5]:
# Fit every pipeline stage (vectorizer, then classifier) on the training split.
# Left as the cell's last expression so the fitted pipeline is displayed.
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [6]:
# Run the fitted pipeline on the held-out texts to get predicted labels
pred = pipeline.predict(X=X_test)

In [7]:
# Evaluate on the held-out split: per-class precision/recall/F1 first, then the
# confusion matrix (rows = true labels, columns = predicted labels).
print(
    classification_report(y_test, pred),
    confusion_matrix(y_test, pred),
    sep='\n',
)

              precision    recall  f1-score   support

        FAKE       0.97      0.74      0.84       606
        REAL       0.80      0.98      0.88       661

    accuracy                           0.86      1267
   macro avg       0.88      0.86      0.86      1267
weighted avg       0.88      0.86      0.86      1267

[[448 158]
 [ 16 645]]


In [8]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be
# reloaded later without retraining. Only unpickle this file from a trusted source.
with open('model.pickle', 'wb') as model_file:
    pickle.dump(
        pipeline,
        model_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )