In [1]:
import pandas as pd


In [2]:
# Load the labelled news articles from a CSV located in the working directory.
csv_path = 'fake_or_real_news.csv'
df = pd.read_csv(csv_path)

In [3]:
# Count how many rows carry each value of the `label` column.
label_counts = df['label'].value_counts()
label_counts

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [4]:
# Remove the 'Unnamed: 0' column (presumably a saved index column -- confirm).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column has already been removed.
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')


In [5]:
from sklearn.model_selection import train_test_split

# Extract the target: `pop` returns the `label` column and removes it from
# `df` in place. Because `df` is mutated, this cell cannot be executed twice
# without re-loading `df` first.
y = df.pop("label")

# Hold out 33% of the articles for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    y,
    test_size=0.33,
    random_state=53,
)

In [6]:
# Creating the model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Three-stage text classifier: token counts (English stop words removed),
# then TF-IDF re-weighting, then a multinomial Naive Bayes model.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit on the training articles, then label the held-out articles.
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)


In [17]:
# Accuracy in the train dataset
import numpy as np
from sklearn import metrics
# Predict on the same articles the pipeline was fitted on and report the
# fraction labelled correctly.
predi = text_clf.predict(X_train)
score = metrics.accuracy_score(y_true=y_train, y_pred=predi)
print(f"accuracy:   {score:0.3f}")

accuracy:   0.915


In [18]:
# Evaluate the held-out predictions produced earlier (`predicted`).
# `label_order` fixes the row/column order of both reports below.
label_order = ['REAL', "FAKE"]

score = metrics.accuracy_score(y_test, predicted)
print(f"accuracy:   {score:0.3f}")

print("Confusion Matrix")
print(metrics.confusion_matrix(y_test, predicted, labels=label_order))
print()

print("Classification Report")
print(metrics.classification_report(y_test, predicted, labels=label_order))

accuracy:   0.857
Confusion Matrix
[[1052   31]
 [ 269  739]]

Classification Report
              precision    recall  f1-score   support

        REAL       0.80      0.97      0.88      1083
        FAKE       0.96      0.73      0.83      1008

    accuracy                           0.86      2091
   macro avg       0.88      0.85      0.85      2091
weighted avg       0.88      0.86      0.85      2091



In [50]:
import pickle
# Save the fitted pipeline to disk with pickle.
filename = 'model_trainned.sav'
# A context manager guarantees the file is flushed and closed as soon as the
# dump finishes; a bare open() call leaves the handle open until it happens to
# be garbage-collected.
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)

In [53]:
# Reload the pickled pipeline from `filename`.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files you
# trust. The context manager closes the file handle once loading is done.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Class probabilities for two single-article slices of the test set.
# The column order of each result follows `loaded_model.classes_`.
resultado1 = loaded_model.predict_proba(X_test[15:16])
resultado2 = loaded_model.predict_proba(X_test[20:21])

# True label followed by predicted probabilities for the first slice (15:16).
print(y_test[15:16].item())
print(resultado1)
# True label for the second slice: it must use 20:21 to match `resultado2`
# (printing slice 15:16 again would pair the wrong label with these
# probabilities).
print(y_test[20:21].item())
print(resultado2)

REAL
[[0.19780977 0.80219023]]
REAL
[[0.01669546 0.98330454]]
