In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import pickle

In [2]:
# Load the news CSV, then split it into the model input (the 'text' column)
# and the prediction target (the 'label' column).
dataset = pd.read_csv('news.csv')
x, y = dataset['text'], dataset['label']

In [3]:
# Per-column flag: True when the column holds at least one missing value.
dataset.isna().any()


Unnamed: 0    False
title         False
text          False
label         False
dtype: bool

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Keep 80% of the rows for training and hold out the rest for evaluation;
# the fixed seed makes the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=40,
)


In [6]:
# Sanity check: size of the training split.
x_train.shape

(5068,)

In [7]:
# Sanity check: size of the held-out test split.
y_test.shape

(1267,)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# TF-IDF features: drop English stop words and ignore any term that occurs
# in more than 70% of the training documents.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)
# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are merely projected onto that fitted vocabulary.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [10]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [11]:
# Train a Passive-Aggressive linear classifier on the TF-IDF features and
# predict labels for the held-out set.
# random_state is pinned (same seed as the train/test split) because the
# estimator shuffles the training data between epochs by default; without a
# seed the fitted model, and therefore the reported accuracy, changes from
# one run to the next.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=40)
pac.fit(tfidf_train, y_train)
y_pred = pac.predict(tfidf_test)

In [12]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [13]:
# Share of held-out samples whose predicted label equals the true label,
# printed as a percentage rounded to two decimals.
score = accuracy_score(y_test, y_pred)
print(f'Accuracy:{round(score*100,2)}%')

Accuarcy:94.0%


In [14]:
# Two-stage text classifier: raw text -> TF-IDF features -> multinomial
# naive Bayes. Bundling both stages means the resulting object accepts raw
# strings directly.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nbmodel', MultinomialNB()),
    ]
)


In [15]:
# Display the pipeline's configuration.
pipeline

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [16]:
# Fit the vectorizer and the classifier together on the raw training texts.
pipeline.fit(X=x_train, y=y_train)


Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [17]:
# Predict labels for the raw held-out texts with the fitted pipeline.
pred = pipeline.predict(X=x_test)

In [18]:
# Sanity check: number of predictions produced for the held-out set.
pred.shape

(1267,)

In [19]:
# Per-class precision, recall and F1 of the pipeline on the held-out set.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

        FAKE       0.97      0.68      0.80       648
        REAL       0.75      0.98      0.85       619

    accuracy                           0.83      1267
   macro avg       0.86      0.83      0.82      1267
weighted avg       0.86      0.83      0.82      1267



In [20]:
# Confusion matrix on the held-out set: rows are true labels, columns are
# predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[441 207]
 [ 12 607]]


In [21]:
# Persist the fitted pipeline to disk, using the newest pickle protocol this
# interpreter supports.
with open('model.pickle', mode='wb') as handle:
    pickle.dump(pipeline, handle, protocol=pickle.HIGHEST_PROTOCOL)
    