In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Acer
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))
#english ko stopwords print garna lai

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
stop_words=set(stopwords.words('english'))

Data preprocess garne

In [5]:
#load

true_df = pd.read_csv('data/True.csv')
fake_df = pd.read_csv('data/Fake.csv')


In [6]:
#add labels 
true_df['label'] = 1   # Real news
fake_df['label'] = 0   # Fake news


In [7]:
#merge
data = pd.concat([true_df, fake_df], ignore_index=True)

In [8]:
#shuffling
data = data.sample(frac=1, random_state=42).reset_index(drop=True)


In [9]:
data.shape

(44898, 5)

In [10]:
data.head()

Unnamed: 0,title,text,subject,date,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",0
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",0
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",0
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",1
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",0


In [11]:
#null xa ki nai check
data.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [12]:
#merging title and text
data['content']=data['title']+' '+data['text']
print(data['content'])

0         BREAKING: GOP Chairman Grassley Has Had Enoug...
1         Failed GOP Candidates Remembered In Hilarious...
2         Mike Pence’s New DC Neighbors Are HILARIOUSLY...
3        California AG pledges to defend birth control ...
4        AZ RANCHERS Living On US-Mexico Border Destroy...
                               ...                        
44893    Nigeria says U.S. agrees delayed $593 million ...
44894    Boiler Room #62 – Fatal Illusions Tune in to t...
44895    ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...
44896    Republican tax plan would deal financial hit t...
44897    U.N. refugee commissioner says Australia must ...
Name: content, Length: 44898, dtype: object


In [13]:
#data ra label seperate
X=data.drop(columns='label', axis=1)
Y=data['label']


In [14]:
print(X)
print(Y)

                                                   title  \
0       BREAKING: GOP Chairman Grassley Has Had Enoug...   
1       Failed GOP Candidates Remembered In Hilarious...   
2       Mike Pence’s New DC Neighbors Are HILARIOUSLY...   
3      California AG pledges to defend birth control ...   
4      AZ RANCHERS Living On US-Mexico Border Destroy...   
...                                                  ...   
44893  Nigeria says U.S. agrees delayed $593 million ...   
44894                  Boiler Room #62 – Fatal Illusions   
44895  ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...   
44896  Republican tax plan would deal financial hit t...   
44897  U.N. refugee commissioner says Australia must ...   

                                                    text          subject  \
0      Donald Trump s White House is in chaos, and th...             News   
1      Now that Donald Trump is the presumptive GOP n...             News   
2      Mike Pence is a huge homophobe. He suppor

Stemming(reduce word into root word)

In [15]:
#!pip install swifter
#import swifter
#data['content'] = data['content'].swifter.apply(stemming)
#(if it takes too long to stem, use swifter for parallel processing )

In [16]:
port_stem=PorterStemmer()

In [17]:
def stemming(content):
    content = re.sub('[^a-zA-Z ]', '', content)  
    content = content.lower()
    words = content.split()
    stemmed_words = [port_stem.stem(word) for word in words if word not in stop_words]
    return ' '.join(stemmed_words)
    

In [18]:
data['content']=data['content'].apply(stemming)

In [19]:
print(data['content'])

0        break gop chairman grassley enough demand trum...
1        fail gop candid rememb hilari mock eulog video...
2        mike penc new dc neighbor hilari troll homopho...
3        california ag pledg defend birth control insur...
4        az rancher live usmexico border destroy nanci ...
                               ...                        
44893    nigeria say us agre delay million fighter plan...
44894    boiler room fatal illus tune altern current ra...
44895    atheist sue governor texa display capitol grou...
44896    republican tax plan would deal financi hit us ...
44897    un refuge commission say australia must stop u...
Name: content, Length: 44898, dtype: object


In [20]:
#data ra label separate
X=data['content'].values
Y=data['label'].values

In [21]:
print(X)

 'fail gop candid rememb hilari mock eulog video donald trump presumpt gop nomine time rememb candid tri hard beat race white hous forget misstep gaff weird sheer idioci candid jeb bush marco rubio john kasich ted cruz ben carson carli fiorinather video make round twitter eulog three fail candid though dead gop might well dead point anyway appropri titl eulog gop peopl make short speech candidatesonc past man actual say jeb bush qualifi presid fake tear journalist comedienn francesca fiorentini say dearli belov gather today commemor candid longer us one man speak amus circusi rendit chopin funer march rememb jeb way jeb hardli knew ye bad polici find way republican morass inde like jeb polici invis sometim although mani felt would win nomin earli best perform often came debat get outandout fight donald trump virtual everyth sun also bad habit defend brother action iraq memor say relat brother one thing know sure kept us safe move marco rubio fiorentini say call mani thing young charm l

In [22]:
print(Y)

[0 0 0 ... 0 1 1]


In [23]:
Y.shape

(44898,)

In [24]:
#converting textual data into numerical data
# from tfidf_from_scratch import TFIDFVectorizerFromScratch 
#documents=X.tolist()
#documents_subset = documents[:500]  # first 500 rows
# vectorizer = TFIDFVectorizerFromScratch()
# X_tfidf_subset = vectorizer.fit_transform(documents_subset)

In [25]:
# vectorizer=TfidfVectorizer(max_features=20000, ngram_range=(1,2))
# vectorizer.fit(X)
# X=vectorizer.transform(X)

# vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
# X_tfidf = vectorizer.fit_transform(X)

vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_tfidf = vectorizer.fit_transform(data['content'])
Y = data['label'].values

In [26]:
print(X)

 'fail gop candid rememb hilari mock eulog video donald trump presumpt gop nomine time rememb candid tri hard beat race white hous forget misstep gaff weird sheer idioci candid jeb bush marco rubio john kasich ted cruz ben carson carli fiorinather video make round twitter eulog three fail candid though dead gop might well dead point anyway appropri titl eulog gop peopl make short speech candidatesonc past man actual say jeb bush qualifi presid fake tear journalist comedienn francesca fiorentini say dearli belov gather today commemor candid longer us one man speak amus circusi rendit chopin funer march rememb jeb way jeb hardli knew ye bad polici find way republican morass inde like jeb polici invis sometim although mani felt would win nomin earli best perform often came debat get outandout fight donald trump virtual everyth sun also bad habit defend brother action iraq memor say relat brother one thing know sure kept us safe move marco rubio fiorentini say call mani thing young charm l

Training ra Testing data split

In [27]:
# X_train, X_test, Y_train, Y_test=train_test_split(
#     X_tfidf,Y, test_size=0.2, stratify=Y, random_state=2
#     )
X_train, X_test, Y_train, Y_test = train_test_split(
    X_tfidf, Y, test_size=0.2, stratify=Y, random_state=42
)

Model training hai

In [28]:
from logistic_regression import LogisticRegressionFromScratchSparse


In [29]:
# model=LogisticRegressionFromScratch(learning_rate=0.001, epochs=500, verbose=True)
# model.fit(X_train, Y_train)
model = LogisticRegressionFromScratchSparse(
    learning_rate=0.1,
    epochs=100,
    batch_size=512,   # can tune batch size
    verbose=True
)

In [30]:
model.fit(X_train, Y_train)

Epoch 1/100, Loss: 0.6839
Epoch 11/100, Loss: 0.6112
Epoch 21/100, Loss: 0.5533
Epoch 31/100, Loss: 0.5066
Epoch 41/100, Loss: 0.4683
Epoch 51/100, Loss: 0.4365
Epoch 61/100, Loss: 0.4096
Epoch 71/100, Loss: 0.3866
Epoch 81/100, Loss: 0.3666
Epoch 91/100, Loss: 0.3492
Epoch 100/100, Loss: 0.3352


In [31]:
y_pred = model.predict(X_test)
accuracy = (y_pred == Y_test).mean()
print("Accuracy on full dataset:", accuracy)

Accuracy on full dataset: 0.9531180400890868


In [32]:
train_acc = np.mean(model.predict(X_train) == Y_train)
test_acc = np.mean(model.predict(X_test) == Y_test)

print("Training Accuracy:", train_acc)
print("Testing Accuracy:", test_acc)

Training Accuracy: 0.953700094660059
Testing Accuracy: 0.9531180400890868


Prediction


In [33]:
X_new=X_test[5]

prediction=model.predict(X_new)
print(prediction)

if (prediction[0]==0):
    print("The news is real")
    
else:
    print("The news is fake")

[0]
The news is real


In [34]:
print(Y_test[5])

0


In [35]:
import pickle

# Save logistic regression model
with open('logistic_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


In [36]:
def preprocess_news(title, text):
    content = title + ' ' + text
    content = re.sub('[^a-zA-Z ]', '', content)  # remove non-letters
    content = content.lower()
    words = content.split()
    stemmed_words = [port_stem.stem(word) for word in words if word not in stop_words]
    return ' '.join(stemmed_words)


In [37]:
def predict_news(title, text):
    processed_text = preprocess_news(title, text)
    X_new = vectorizer.transform([processed_text])
    prediction = model.predict(X_new)
    return "Real" if prediction[0] == 1 else "Fake"


In [38]:
title = "Bihar On High Alert As 3 Jaish Terrorists Enter Via Nepal, Cops Release Sketch"
text = "Patna:The police in Bihar have issued a high alert after inputs of three terrorists from Pakistan entering the state through Nepal. They have also released the sketch of the three terrorists, who belong to the banned terror outfit Jaish-e-Mohammed.They have been identified as Hasnain Ali, a resident of Rawalpindi, Adil Hussain of Umarkot and Mohammad Usman of Bahawalpur.The police said these terrorists arrived in Nepal's capital Kathmandu in the second week of August and crossed into poll-bound Bihar last week.They have now shared the details of their passports with the officials of the border districts.Security has been increased in Bihar in view of the assembly elections, due later this year.PM Modi's Warning To TerroristsAfter the Pahalgam attack, Prime Minister Narendra Modi had taken a pledge in Bihar to eliminate terrorists behind the April 22 tragedy.The Indian armed forces, after finding cross-border links to the deadly attack, launched Operation Sindoor on May 7 and struck multiple terror camps linked to groups like Jaish-e-Mohammed, Lashkar-e-Taiba, and Hizbul Mujahideen in Pakistan and Pakistan-occupied Kashmir (PoK). Over 100 terrorists were killed in the Indian strikes.Speaking at a rally in Bihar last week, PM Modi said Operation Sindoor was a pledge I took on the soil of Bihar"
result = predict_news(title, text)
print("Prediction:", result)


Prediction: Real


In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# y_test = true labels
# y_pred = predicted labels
# y_proba = predicted probabilities

print("Accuracy:", accuracy_score(Y_test, y_pred))
print("Precision:", precision_score(Y_test, y_pred))
print("Recall:", recall_score(Y_test, y_pred))
print("F1-Score:", f1_score(Y_test, y_pred))



Accuracy: 0.9531180400890868
Precision: 0.9501281752505244
Recall: 0.9516806722689075
F1-Score: 0.9509037900874635
