In [43]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sabhijeet26\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import PassiveAggressiveClassifier

In [45]:
import pandas as pd
import numpy as np
import re
import string

In [46]:
f = pd.read_csv('fake_news_dataset.csv')
f.head()
df= f.head(80).copy()
df.shape

(80, 6)

In [47]:
def preprocess_text(text):
    # Removing URLs
    text = re.sub(r'http\S+', '', text)
    # Removing punctuation marks
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Converting text to lowercase
    text = text.lower()
    # Tokenizing text
    words = nltk.word_tokenize(text)
    # Removing stop words
    words = [word for word in words if word not in stopwords.words('english')]
    # Joining words back to form text
    text = ' '.join(words)
    return text

In [48]:
df.drop(df[df['text'].apply(type)!=str].index,axis=0,inplace=True)
df[df['text'].apply(type)!=str]
df.head(3)

Unnamed: 0.1,Unnamed: 0,id,title,author,text,label
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1


In [49]:
df['text'] = df['text'].apply(preprocess_text)

# Creating feature matrix and target vector
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['text'])
y = df['label']

In [50]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [51]:
# Training the model
model = PassiveAggressiveClassifier()
model.fit(X_train, y_train)

In [52]:
# Predicting the labels of the test set
y_pred = model.predict(X_test)

In [53]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8125
Confusion Matrix:
 [[9 0]
 [3 4]]


In [54]:
# Function for manual testing
def manual_testing():
    news = input('Enter news text: ')
    processed_news = preprocess_text(news)
    X_manual = tfidf_vectorizer.transform([processed_news])
    y_manual = model.predict(X_manual)
    if y_manual[0] == 0:
        print('The news is real')
    else:
        print('The news is fake')


In [55]:
# Test with some manual inputs
manual_testing()

Enter news text: Narendra Modi is PM of USA
The news is fake
