In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# nltk.download('stopwords')

df_true = pd.read_csv(r"C:\Users\HP\Downloads\Project data\True.csv")
df_fake = pd.read_csv(r"C:\Users\HP\Downloads\Project data\Fake.csv")
df_hindi = pd.read_excel(r"C:\Users\HP\Downloads\Project data\bharatfakenewskosh 4 - Copy.xlsx")

df_true["Label"] = 1
df_fake["Label"] = 0

df_marge = pd.concat([df_true, df_fake], axis=0)

df_marge['content'] = df_marge['title'].fillna('') + ' ' + df_marge['text'].fillna('')
df_marge = df_marge.drop(['title', 'text', 'subject', 'date'], axis=1)

df_hindi['Label'] = df_hindi['Label'].astype(str).str.strip().str.lower()
df_hindi['Label'] = df_hindi['Label'].map({'false': 0, 'true': 1})

df_hindi['content'] = (
    df_hindi['Statement'].fillna('') + ' ' + df_hindi['Eng_Trans_Statement'].fillna('') + ' ' +
    df_hindi['News Body'].fillna('') + ' ' + df_hindi['Eng_Trans_News_Body'].fillna('')
)

df_hindi = df_hindi.drop([
    'Statement', 'Eng_Trans_Statement', 'News Body', 'Eng_Trans_News_Body',
    'id','Author_Name','Fact_Check_Source','Source_Type','Media_Link','Publish_Date',
    'News_Category','Language','Region','Platform','Text','Video','Image'
], axis=1, errors='ignore')

df_marge['Link'] = 'No link available'

if 'Fact_Check_Link' in df_hindi.columns:
    df_hindi['Link'] = df_hindi['Fact_Check_Link'].fillna('No link available')
else:
    df_hindi['Link'] = 'No link available'

df_hindi = df_hindi.drop(['Fact_Check_Link'], axis=1, errors='ignore')

new_merged_df = pd.concat([df_marge, df_hindi], ignore_index=True)

stop_words_en = set(stopwords.words('english'))

# Hindi stopwords (common list)
hindi_stopwords = set([
    'है', 'और', 'से', 'के', 'को', 'पर', 'में', 'की', 'का', 'कि', 'यह', 'वे', 'वह', 'तो', 'था', 'थे', 'द्वारा',
    'हैं', 'इन', 'उन', 'या', 'जो', 'तक', 'भी', 'जब', 'जैसे', 'तकनीक', 'नहीं', 'कर', 'किया', 'करना', 'हो', 'होता',
    'रहा', 'रही', 'रहे', 'हुआ', 'होती', 'होने', 'लिए', 'इसे', 'इसी', 'इन्हें', 'इन्हीं', 'उसे', 'उसी', 'उन्हें',
    'उन्हीं', 'कुछ', 'किसी', 'किसे', 'किसीं', 'कौन', 'कौनसा', 'जहाँ', 'जहां', 'जिधर', 'जिसे', 'जिसने', 'जो', 'तक',
    'तब', 'तभी', 'ताकि', 'तुम', 'तुम्हारा', 'तेरा', 'तेरी', 'तेरे', 'तो', 'था', 'थी', 'थे', 'दिया', 'दिये', 'द्वारा',
    'न', 'ना', 'नीचे', 'ने', 'पर', 'पहले', 'पूरा', 'फिर', 'बनी', 'बना', 'बनीं', 'बने', 'बार', 'भी', 'मात्र', 'में',
    'यदि', 'यह', 'यहाँ', 'यहां', 'यही', 'यद्यपि', 'यदि', 'या', 'ये', 'रखना', 'रखें', 'रखते', 'सकता', 'सकती', 'सकते',
    'सबसे', 'सभी', 'साथ', 'साभार', 'से', 'सो', 'सही', 'है', 'हैं', 'हो', 'होता', 'होती', 'होते', 'होगा', 'होगी', 'होगे'
])

stemmer = PorterStemmer()

def preprocess_mixed_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s\u0900-\u097F]', '', text)  # keep Hindi chars + English words

    words = text.split()
    cleaned_words = []
    for word in words:
        if re.match(r'^[a-zA-Z]+$', word):  # English word
            if word not in stop_words_en:
                word = stemmer.stem(word)
                cleaned_words.append(word)
        elif re.match(r'^[\u0900-\u097F]+$', word):  # Hindi word
            if word not in hindi_stopwords:
                cleaned_words.append(word)
        else:  # skip numbers/mixed
            continue
    return ' '.join(cleaned_words)

new_merged_df['clean_content'] = new_merged_df['content'].apply(preprocess_mixed_text)

X = new_merged_df['clean_content']
y = new_merged_df['Label']

tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = tfidf_vectorizer.fit_transform(X) # training model

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=30)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

def test_with_input(model, tfidf_vectorizer, input_text):
    processed_text = preprocess_mixed_text(input_text)
    input_tfidf = tfidf_vectorizer.transform([processed_text])
    prediction = model.predict(input_tfidf)
    return "Real News" if prediction[0] == 1 else "Fake News"

def get_fact_check_link(input_text, dataset, top_n=1, similarity_threshold=0.3):
    processed_text = preprocess_mixed_text(input_text)
    input_vector = tfidf_vectorizer.transform([processed_text])
    dataset_vectors = tfidf_vectorizer.transform(dataset['clean_content'])
    similarities = cosine_similarity(input_vector, dataset_vectors).flatten()
    top_indices = similarities.argsort()[::-1]
    
    for idx in top_indices:
        best_similarity = similarities[idx]
        link = dataset.iloc[idx]['Link']
        if best_similarity >= similarity_threshold and link != "No link available":
            return link  
    
    return "No fact-check link available"

input_text = "The government announced a new policy to boost agriculture in rural areas."
prediction = test_with_input(model, tfidf_vectorizer, input_text)
print(f"\n Prediction: {prediction}")

link_result = get_fact_check_link(input_text, new_merged_df)
print(f" Fact Check Link: {link_result}")

print(f"\n Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



 Prediction: Real News
 Fact Check Link: https://www.factchecker.in/1-in-2-women-aged-15-49-1-in-2-pregnant-women-in-rural-areas-anaemic-in-2015-16/

 Accuracy: 0.8312

Confusion Matrix:
[[7884 2329]
 [1273 9853]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     10213
           1       0.81      0.89      0.85     11126

    accuracy                           0.83     21339
   macro avg       0.83      0.83      0.83     21339
weighted avg       0.83      0.83      0.83     21339



In [5]:

input_text = """
Hyderabad:
Slamming Pakistan once again for the terror attack at Pahalgam, All India Majlis-e-Ittehadul Muslimeen (AIMIM) president and Hyderabad MP Asaduddin Owaisi on Sunday said that Pakistan is half a century behind India. 
Mr Owaisi was addressing a public meeting at Prabhani in Maharashtra to oppose the Waqf (Amendment) Act.
The Hyderabad MP came down heavily on Pakistan for the terror attack. He also dismissed the threats of Pakistani leaders. “You are not just half an hour behind, you are half a century behind India. Your country's budget is not even equal to our military budget,” he said.
“Pakistan repeatedly says that they have nuclear bombs, atomic bombs. Remember, if you go into another country and kill innocent people, no country will remain silent."
The AIMIM chief reiterated that terrorists asked the religion of tourists at Pahalgam before killing them. “Which religion are you talking about? You are worse than the Khawarij. This act shows you are the successors of ISIS,” he said.
"""                
prediction = test_with_input(model, tfidf_vectorizer, input_text)
print(f"\n Prediction: {prediction}")

link_result = get_fact_check_link(input_text, new_merged_df)
print(f" Fact Check Link: {link_result}")
print(f"\n Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


 Prediction: Real News
 Fact Check Link: https://digiteye.in/hyderabad-mp-owaisi-claims-india-gate-has-muslim-martyrs-names-during-freedom-struggle-fact-check/

 Accuracy: 0.8312

Confusion Matrix:
[[7880 2333]
 [1270 9856]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     10213
           1       0.81      0.89      0.85     11126

    accuracy                           0.83     21339
   macro avg       0.83      0.83      0.83     21339
weighted avg       0.83      0.83      0.83     21339



In [5]:

input_text = """
अग्निपथ योजना का विरोध: क्या ‘शमी असलम’ ने हिंसा भड़काने वाली चैट में खुद को हिंदू बताया?
"""            
prediction = test_with_input(model, tfidf_vectorizer, input_text)
print(f"\n Prediction: {prediction}")
link_result = get_fact_check_link(input_text, new_merged_df)
print(f" Fact Check Link: {link_result}")
print(f"\n Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


 Prediction: Fake News
 Fact Check Link: https://www.altnews.in/Hindi/old-video-shared-as-jharkhand-lynching-victim-tabrez-ansari-procession/

 Accuracy: 0.8312

Confusion Matrix:
[[7880 2333]
 [1270 9856]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     10213
           1       0.81      0.89      0.85     11126

    accuracy                           0.83     21339
   macro avg       0.83      0.83      0.83     21339
weighted avg       0.83      0.83      0.83     21339



In [10]:
input_text = """
बीते दिनों भाजपा नेता व सांसद मनोज तिवारी और दिल्ली के भाजपा प्रदेश अध्यक्ष आदेश गुप्ता ने कई मीडियकर्मियों के साथ दिल्ली के किराड़ी विधानसभा में मौजूद एक अस्पताल के…
"""            
prediction = test_with_input(model, tfidf_vectorizer, input_text)
print(f"\n Prediction: {prediction}")
link_result = get_fact_check_link(input_text, new_merged_df)
print(f" Fact Check Link: {link_result}")
print(f"\n Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


 Prediction: Real News
 Fact Check Link: https://www.altnews.in/Hindi/delhi-kirari-458-bed-hospital-made-by-2020-false-claim-by-manoj-tiwari/

 Accuracy: 0.8312

Confusion Matrix:
[[7880 2333]
 [1270 9856]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     10213
           1       0.81      0.89      0.85     11126

    accuracy                           0.83     21339
   macro avg       0.83      0.83      0.83     21339
weighted avg       0.83      0.83      0.83     21339



In [13]:

input_text = """
WASHINGTON (Reuters) - The special counsel investigation of links between Russia and President Trumpâ€™s 2016 
election campaign should continue without interference in 2018, despite calls from some Trump administration allies 
and Republican lawmakers to shut it down, a prominent Republican senator said on Sunday. Lindsey Graham, who serves on 
the Senate armed forces and judiciary committees, said Department of Justice Special Counsel Robert Mueller needs to carry 
on with his Russia investigation without political interference. â€œThis investigation will go forward. It will be an investigation 
conducted without political influence,â€ Graham said on CBSâ€™s Face the Nation news program. â€œAnd we all need to let Mr. Mueller do his job.
I think heâ€™s the right guy at the right time.â€  The question of how Russia may have interfered in the election, and how Trumpâ€™s campaign may
have had links with or co-ordinated any such effort, has loomed over the White House since Trump took office in January. It shows no sign of receding
as Trump prepares for his second year in power, despite intensified rhetoric from some Trump allies in recent weeks accusing Muellerâ€™s team of bias against the Republican president. 
Trump himself seemed to undercut his supporters in an interview last week with the New York Times in which he said he expected Mueller was â€œgoing to be fair.â€    Russiaâ€™s role in the election and the question of possible links to the Trump campaign are the focus of multiple inquiries in Washington. Three committees of the Senate and the House of Representatives are investigating, as well as Mueller, whose team in May took over an earlier probe launched by the U.S. Federal Bureau of Investigation (FBI). Several members of the Trump campaign and administration have been convicted or indicted in the investigation.  Trump and his allies deny any collusion with Russia during the campaign, and the Kremlin has denied meddling in the election. Graham said he still wants an examination of the FBIâ€™s use of a dossier on links between Trump and Russia that was compiled by a former British spy, Christopher Steele, which prompted Trump allies and some Republicans to question Muellerâ€™s inquiry.   On Saturday, the New York Times reported that it was not that dossier that triggered an early FBI probe, but a tip from former Trump campaign foreign policy adviser George Papadopoulos to an Australian diplomat that Russia had damaging information about former Trump rival Hillary Clinton.  â€œI want somebody to look at the way the Department of Justice used this dossier. It bothers me greatly the way they used it, and I want somebody to look at it,â€ Graham said. But he said the Russia investigation must continue. â€œAs a matter of fact, it would hurt us if we ignored it,â€ he said. 

"""            
prediction = test_with_input(model, tfidf_vectorizer, input_text)
print(f"\n Prediction: {prediction}")

link_result = get_fact_check_link(input_text, new_merged_df)
print(f" Fact Check Link: {link_result}")

print(f"\n Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



 Prediction: Real News
 Fact Check Link: No fact-check link available

 Accuracy: 0.8312

Confusion Matrix:
[[7880 2333]
 [1270 9856]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     10213
           1       0.81      0.89      0.85     11126

    accuracy                           0.83     21339
   macro avg       0.83      0.83      0.83     21339
weighted avg       0.83      0.83      0.83     21339



In [70]:
input_text = """
On Christmas day, Donald Trump announced that he would  be back to work  the following day, 
but he is golfing for the fourth day in a row. The former reality show star blasted former President 
Barack Obama for playing golf and now Trump is on track to outpace the number of golf games his predecessor played.Updated my tracker of Trump 
s appearances at Trump properties.71 rounds of golf including today s. At this pace, he ll pass Obama s first-term total by July 24 next year. 
https://t.co/Fg7VacxRtJ pic.twitter.com/5gEMcjQTbH  Philip Bump (@pbump) December 29, 2017 That makes what a Washington Post reporter discovered
on Trump s website really weird, but everything about this administration is bizarre AF. The coding contained a reference to Obama and golf:
Unlike Obama, we are working to fix the problem   and not on the golf course.  However, the coding wasn t done correctly.The website of Donald Trump,
who has spent several days in a row at the golf course, is coded to serve up the following message in the event of an internal server error:
https://t.co/zrWpyMXRcz pic.twitter.com/wiQSQNNzw0  Christopher Ingraham (@_cingraham) December 28, 2017That snippet of code appears to be on 
all https://t.co/dkhw0AlHB4 pages, which the footer says is paid for by the RNC? pic.twitter.com/oaZDT126B3  Christopher Ingraham (@_cingraham) 
December 28, 2017It s also all over https://t.co/ayBlGmk65Z. As others have noted in this thread, this is weird code and it s not clear it would ever actually display, but who knows.
Christopher Ingraham (@_cingraham) December 28, 2017After the coding was called out, the reference to Obama was deleted.UPDATE: The golf error message has been removed from the Trump and GOP websites. They also fixed the javascript  =  vs  ==  problem. Still not clear when these messages would actually display, since the actual 404 (and presumably 500) page displays a different message pic.twitter.com/Z7dmyQ5smy  Christopher Ingraham (@_cingraham) December 29, 2017That suggests someone at either RNC or the Trump admin is sensitive enough to Trump s golf problem to make this issue go away quickly once people noticed. You have no idea how much I d love to see the email exchange that led us here.  Christopher Ingraham (@_cingraham) December 29, 2017 The code was f-cked up.The best part about this is that they are using the  =  (assignment) operator which means that bit of code will never get run. If you look a few lines up  errorCode  will always be  404          (@tw1trsux) December 28, 2017trump s coders can t code. Nobody is surprised.  Tim Peterson (@timrpeterson) December 28, 2017Donald Trump is obsessed with Obama that his name was even in the coding of his website while he played golf again.Photo by Joe Raedle/Getty Images.

"""            
prediction = test_with_input(model, tfidf_vectorizer, input_text)
print(f"\n Prediction: {prediction}")
link_result = get_fact_check_link(input_text, new_merged_df)
print(f" Fact Check Link: {link_result}")
print(f"\n Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


 Prediction: Fake News
 Fact Check Link: No fact-check link available

 Accuracy: 0.8312

Confusion Matrix:
[[7880 2333]
 [1270 9856]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     10213
           1       0.81      0.89      0.85     11126

    accuracy                           0.83     21339
   macro avg       0.83      0.83      0.83     21339
weighted avg       0.83      0.83      0.83     21339



In [74]:

input_text = """
New Delhi:
Defence Minister Rajnath Singh today said it is his responsibility to work with the armed forces and give a "befitting reply" to those who cast an evil eye on India.

The comments came amid the tension with Pakistan following the terror attack in Jammu and Kashmir's Pahalgam that killed 26 tourists.

India has said the terrorists have cross-border linkages.

"I want to assure you that under Prime Minister Modi's leadership, what you desire will certainly happen," the Defence Minister said, alluding to what people across the nation have been talking about the need for a response to Pakistan.

"As the defence minister, it is my responsibility to work with my soldiers and ensure the protection of the country's borders. And it is my responsibility to give a befitting reply, by working with the armed forces, to those who cast an evil eye on our country," Mr Singh said.
"""            
prediction = test_with_input(model, tfidf_vectorizer, input_text)
print(f"\n Prediction: {prediction}")

link_result = get_fact_check_link(input_text, new_merged_df)
print(f" Fact Check Link: {link_result}")

print(f"\n Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



 Prediction: Real News
 Fact Check Link: https://www.opindia.com/2020/06/cnn-news-18-rajnath-singh-fake-news-misquoting-ladakh-standoff/

 Accuracy: 0.8312

Confusion Matrix:
[[7880 2333]
 [1270 9856]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     10213
           1       0.81      0.89      0.85     11126

    accuracy                           0.83     21339
   macro avg       0.83      0.83      0.83     21339
weighted avg       0.83      0.83      0.83     21339



In [21]:

input_text = """
भारत के ऑपरेशन सिंदूर के बाद पाकिस्तान बरगलाया हुआ है और भारत पर हमले की कोशिश कर रहा है. पाकिस्तान ने बीती रात भारत की कई जगहों पर ड्रोन से हमले की क... https://www.aajtak.in/education/knowledge/story/india-attack-on-pakistan-why-relying-on-chinese-items-is-proving-dangerous-for-pakistan-tedu-dskc-2236135-2025-05-09

"""         
prediction = test_with_input(model, tfidf_vectorizer, input_text)
print(f"\n Prediction: {prediction}")

link_result = get_fact_check_link(input_text, new_merged_df)
print(f" Fact Check Link: {link_result}")

print(f"\n Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



 Prediction: Real News
 Fact Check Link: https://www.altnews.in/Hindi/video-from-2009-shared-as-attack-on-indian-army-convoy-by-angry-locals/

 Accuracy: 0.8312

Confusion Matrix:
[[7880 2333]
 [1270 9856]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     10213
           1       0.81      0.89      0.85     11126

    accuracy                           0.83     21339
   macro avg       0.83      0.83      0.83     21339
weighted avg       0.83      0.83      0.83     21339



In [9]:

input_text = """
Republicans are working overtime trying to sell their scam of a tax bill to the public as something that directly targets middle-class and working-class families with financial relief. Nothing could be further from the truth, and they re getting hammered on that repeatedly. Speaking on CNBC, Paul Ryan was going full throttle, trying to convince us that the paltry savings we re getting is actually wait for it big money.But he didn t just go with the usual talking points. With a smug look that only someone who grew up in a wealthy family can muster when talking about that which he does not know, Ryan claimed that the $2,059 more per year that families living paycheck-to-paycheck will see is extremely significant. Then he decided he had to amend that to say such savings might be nothing to a family earning $600,000 per year (true), or for people living in New York or California (false).Those are the same two states that Trump s loyal subjects insist on stripping from the 2016 vote totals to claim that Trump actually won the popular vote. Watch Ryan completely dismiss all the struggling families living in blue states below:If you re living paycheck-to-paycheck which is more than half of the people in this country and you got #2059more from a tax cut next year, that s not nothing. pic.twitter.com/8TKtrMqRa1  Paul Ryan (@SpeakerRyan) December 21, 2017Someone needs to reach through their computer or television and wipe that smugness off his face. It is the height of arrogance and insult to imply that there are no struggling families in either of those two states.Featured image via Mark Wilson/Getty Images

"""    
prediction = test_with_input(model, tfidf_vectorizer, input_text)
print(f"\n Prediction: {prediction}")

link_result = get_fact_check_link(input_text, new_merged_df)
print(f" Fact Check Link: {link_result}")

print(f"\n Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



 Prediction: Fake News
 Fact Check Link: No fact-check link available

 Accuracy: 0.8312

Confusion Matrix:
[[7880 2333]
 [1270 9856]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     10213
           1       0.81      0.89      0.85     11126

    accuracy                           0.83     21339
   macro avg       0.83      0.83      0.83     21339
weighted avg       0.83      0.83      0.83     21339

