In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
df = pd.read_csv("/content/news_data.csv")

In [5]:
df.head()

Unnamed: 0,label,text
0,FAKE,Aliens spotted helping with crop harvest in Ka...
1,FAKE,Secret to immortality found in Himalayan herb.
2,FAKE,Elvis Presley seen alive in a Las Vegas diner.
3,FAKE,Moon landing was staged in a Hollywood studio.
4,FAKE,"Vampires discovered in Romanian caves, scienti..."


In [6]:
# checking the number of rows and columns of dataset
df.shape

(85, 2)

In [7]:
#checking the missing values in dataset
df.isnull().sum()

Unnamed: 0,0
label,0
text,0


In [8]:
# check distribution
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
FAKE,45
REAL,40


In [9]:
stop_words = set(stopwords.words('english'))

In [10]:
# Text Cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

In [11]:
# Applying cleaning
df['clean_text'] = df['text'].apply(clean_text)

In [12]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_df=0.7)

In [13]:
X = tfidf.fit_transform(df['clean_text'])
y = df['label']

In [14]:
# Splitting the data into train and test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [15]:
# training the passive aggresive classifier
pac = PassiveAggressiveClassifier(max_iter=50)

In [16]:
pac.fit(X_train, y_train)

In [18]:
# model evaluation
y_pred = pac.predict(X_test)

In [19]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8823529411764706


In [20]:
f1 = f1_score(y_test, y_pred, pos_label='FAKE')
print("F1 Score:", f1)

F1 Score: 0.8888888888888888


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        FAKE       0.89      0.89      0.89         9
        REAL       0.88      0.88      0.88         8

    accuracy                           0.88        17
   macro avg       0.88      0.88      0.88        17
weighted avg       0.88      0.88      0.88        17



In [22]:
# training the support vector machine
svm = LinearSVC()
svm.fit(X_train, y_train)

svm_pred = svm.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM F1:", f1_score(y_test, svm_pred, pos_label='FAKE'))

SVM Accuracy: 0.7058823529411765
SVM F1: 0.7619047619047619


In [23]:
# testing and custome news
news = ["Government launches new AI education policy in India"]

news_clean = [clean_text(news[0])]
news_vector = tfidf.transform(news_clean)

prediction = pac.predict(news_vector)
print("Prediction:", prediction[0])

Prediction: REAL
