In [None]:
import pandas as pd
# Read the pre-cleaned fact-check dataset from disk, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Cleaned.csv')
df.head(n=5)

Unnamed: 0,Eng_Trans_Statement,Eng_Trans_News_Body,Language,Label
0,Fact Check: Old Video Showing Giant Waves Lash...,As Mumbai witnessed heavy rainfall in the past...,English,True
1,Fact Check: Image from Thailand Falsely Shared...,After Prime Minister Narendra Modi inaugurated...,English,False
2,Fact Check: Staged Video Of Child Kidnapping S...,"A video, showing a burqa-clad man posing as a ...",English,True
3,Fact Check: Does This Video Show Maulana Givin...,A video in which a group of children can be se...,English,True
4,Fact Check: Old Image Shared As Hindus Being A...,"Recently in south-western Bangladesh, some uni...",English,False


In [None]:
# Model input = headline followed by the article body (missing values become '').
# The previous version concatenated the headline with itself, so the body text
# never reached the model even though inference combines statement + body.
df['text'] = df['Eng_Trans_Statement'].fillna('') + ' ' + df['Eng_Trans_News_Body'].fillna('')

# Encode the boolean Label column as an integer target: True -> 1, False -> 0.
df['label'] = df['Label'].map({True: 1, False: 0})
df.head()

Unnamed: 0,Eng_Trans_Statement,Eng_Trans_News_Body,Language,Label,text,label
0,Fact Check: Old Video Showing Giant Waves Lash...,As Mumbai witnessed heavy rainfall in the past...,English,True,Fact Check: Old Video Showing Giant Waves Lash...,1
1,Fact Check: Image from Thailand Falsely Shared...,After Prime Minister Narendra Modi inaugurated...,English,False,Fact Check: Image from Thailand Falsely Shared...,0
2,Fact Check: Staged Video Of Child Kidnapping S...,"A video, showing a burqa-clad man posing as a ...",English,True,Fact Check: Staged Video Of Child Kidnapping S...,1
3,Fact Check: Does This Video Show Maulana Givin...,A video in which a group of children can be se...,English,True,Fact Check: Does This Video Show Maulana Givin...,1
4,Fact Check: Old Image Shared As Hindus Being A...,"Recently in south-western Bangladesh, some uni...",English,False,Fact Check: Old Image Shared As Hindus Being A...,0


In [None]:
import re

def clean_text(text):
    """Normalise raw text before TF-IDF vectorisation.

    Steps, in order: lowercase; drop tokens starting with ``http`` or ``www``;
    replace every character that is not a-z, 0-9, whitespace, ``%``, ``$``,
    ``.`` or ``-`` with a space; collapse whitespace runs to a single space
    and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    substitutions = (
        (r"http\S+|www\S+", ""),
        (r"[^a-z0-9\s%$.-]", " "),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalise every combined text entry, then show two rows to sanity-check the result.
df["text"] = df["text"].map(clean_text)
df.head(n=2)

Unnamed: 0,Eng_Trans_Statement,Eng_Trans_News_Body,Language,Label,text,label
0,Fact Check: Old Video Showing Giant Waves Lash...,As Mumbai witnessed heavy rainfall in the past...,English,True,fact check old video showing giant waves lashi...,1
1,Fact Check: Image from Thailand Falsely Shared...,After Prime Minister Narendra Modi inaugurated...,English,False,fact check image from thailand falsely shared ...,0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with the vocabulary capped at 15,000 terms
# and scikit-learn's built-in English stop-word list removed.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=15000,
)

# Feature matrix from the cleaned text column; target is the 0/1 label column.
X = vectorizer.fit_transform(df['text'])
y = df['label']

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw text first, then fit TF-IDF on the training rows only, so the
# vocabulary and IDF weights are never informed by the held-out test rows.
# This re-fit replaces the vectorizer's earlier fit on the full corpus, which
# means the vectorizer that is later pickled has only ever seen training data.
text_train, text_test, Y_train, Y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42, stratify=y
)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features. class_weight='balanced' re-weights
# the classes inversely to their frequency; C=2 sets the inverse regularisation strength.
model = LogisticRegression(
    C=2,
    class_weight='balanced',
    max_iter=1000,
)

model.fit(X_train, Y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the held-out split: overall accuracy first, then the per-class report.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(Y_test, y_pred))

Accuracy: 0.570929070929071
              precision    recall  f1-score   support

           0       0.46      0.50      0.48       793
           1       0.65      0.62      0.63      1209

    accuracy                           0.57      2002
   macro avg       0.56      0.56      0.56      2002
weighted avg       0.58      0.57      0.57      2002



In [None]:
import pickle

# Persist the fitted vectoriser and classifier so they can be reloaded for inference.
artifacts = (
    ('tf_vectorizer.pkl', vectorizer),
    ('log_reg_model.pkl', model),
)
for pkl_path, fitted_obj in artifacts:
    with open(pkl_path, "wb") as f:
        pickle.dump(fitted_obj, f)

In [None]:
import pickle
import re
from sklearn.metrics.pairwise import cosine_similarity

# Reload the persisted TF-IDF vectoriser and classifier for inference.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('tf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

with open('log_reg_model.pkl', 'rb') as f:
    model = pickle.load(f)

# This cell used to load both files twice under two sets of names; the older
# names are kept as aliases in case other code still refers to them.
vector = vectorizer
mod = model


def clean_text(text):
    """Normalise raw text exactly as was done before training the TF-IDF model.

    Lowercases the text, removes tokens starting with ``http`` or ``www``,
    replaces every character that is not a-z, 0-9, whitespace, ``%``, ``$``,
    ``.`` or ``-`` with a space, then collapses whitespace and trims the ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-z0-9\s%$.-]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def predict_news(statement, body, min_similarity=0.1):
    """Classify a headline/body pair as REAL or FAKE with the TF-IDF model.

    Both texts are normalised with ``clean_text``. Their TF-IDF cosine
    similarity is used as a relatedness gate: when it is below
    ``min_similarity`` no prediction is made and an error dict is returned.

    Args:
        statement: Headline / claim text.
        body: Article body text.
        min_similarity: Minimum title-body cosine similarity required before
            classifying. Defaults to 0.1, the cut-off that used to be
            hard-coded. TF-IDF similarity only reflects shared vocabulary, so
            short but related pairs can score well below 0.1; pass a smaller
            value (0 effectively disables the gate) to classify them anyway.

    Returns:
        dict: On rejection, ``{"error", "title_body_similarity"}``. Otherwise
        ``{"prediction", "confidence", "title_body_similarity"}`` where
        ``prediction`` is "REAL" (class 1) or "FAKE", ``confidence`` is the
        highest class probability as a percentage rounded to 2 decimals, and
        the similarity is rounded to 3 decimals.
    """
    clean_statement = clean_text(statement)
    clean_body = clean_text(body)

    # Vectorise title and body separately only to measure how related they are.
    stmt_vec = vectorizer.transform([clean_statement])
    body_vec = vectorizer.transform([clean_body])

    similarity = cosine_similarity(stmt_vec, body_vec)[0][0]

    if similarity < min_similarity:
        return {
            "error": "Title and news body do not appear related. Please provide matching content.",
            "title_body_similarity": round(float(similarity), 3)
        }

    # The classifier itself sees the title and body joined into one document.
    combined_text = clean_statement + " " + clean_body
    combined_vec = vectorizer.transform([combined_text])

    prediction = model.predict(combined_vec)[0]
    probability = model.predict_proba(combined_vec)[0].max()

    label = "REAL" if prediction == 1 else "FAKE"

    return {
        "prediction": label,
        "confidence": round(float(probability) * 100, 2),
        "title_body_similarity": round(float(similarity), 3)
    }


# Smoke-test the TF-IDF pipeline on one headline / article-body pair.
stam = 'Bondi Beach Shooting: Here Is A Fact-Check Of Social Media Posts Identifying ‘Hero’ Who Disarmed Sydney Gunman'
body = 'Media reports stated that the man who tackled and disarmed one of the shooters was Ahmed Al Ahmed, while the name, “Edward Crabtree”, originated from an article on a dubious website.'
result = predict_news(statement=stam, body=body)
print(result)

{'error': 'Title and news body do not appear related. Please provide matching content.', 'title_body_similarity': 0.038}


In [None]:
!pip install sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used to turn each article into a dense vector.
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Replace missing titles/bodies with '' so the concatenation below never sees NaN.
for text_col in ('Eng_Trans_Statement', 'Eng_Trans_News_Body'):
    df[text_col] = df[text_col].fillna('')

# One "title body" string per row, embedded by the encoder; labels as an array.
# NOTE(review): these texts are embedded without clean_text, whereas the
# semantic predict_news cleans its inputs before encoding - confirm that is intended.
combined = df['Eng_Trans_Statement'] + " " + df['Eng_Trans_News_Body']
texts = combined.astype(str).tolist()
X = sbert.encode(texts, show_progress_bar=True)
y = df['label'].values

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 80/20 stratified split of the sentence embeddings with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Class-weighted logistic regression on top of the embeddings.
model = LogisticRegression(class_weight='balanced', max_iter=2000)
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the embedding-based classifier on the held-out split.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', test_accuracy)
print(classification_report(y_test, y_pred))

Accuracy:  0.518981018981019
              precision    recall  f1-score   support

           0       0.41      0.49      0.44       793
           1       0.62      0.54      0.58      1209

    accuracy                           0.52      2002
   macro avg       0.51      0.51      0.51      2002
weighted avg       0.53      0.52      0.52      2002



In [None]:
import pickle

# Persist the embedding-based classifier for reuse at inference time.
with open('semantic_logreg.pkl', mode='wb') as f:
    pickle.dump(model, f)


In [None]:
import pickle
import re
from sentence_transformers import SentenceTransformer, util

# Reload the embedding-based classifier from disk.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("semantic_logreg.pkl", mode="rb") as f:
    model = pickle.load(f)

# Same encoder name as the one used to build the training embeddings.
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Title/body pairs whose cosine similarity is below this are rejected before classification.
SIMILARITY_THRESHOLD = 0.45

def clean_text(text):
    """Lightly normalise text before sentence embedding.

    Lowercases the text, removes tokens starting with ``http`` or ``www``,
    collapses whitespace runs to a single space and trims both ends.
    Punctuation is left in place.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    without_links = re.sub(r"http\S+|www\S+", "", lowered)
    return re.sub(r"\s+", " ", without_links).strip()

def semantic_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded with the module-level ``sbert`` model as a tensor and
    the result of ``util.cos_sim`` is converted to a plain Python float.
    """
    first, second = (
        sbert.encode(passage, convert_to_tensor=True) for passage in (text1, text2)
    )
    return float(util.cos_sim(first, second))

def predict_news(statement, body):
    """Classify a headline/body pair as REAL or FAKE using sentence embeddings.

    Both inputs are normalised with ``clean_text``. If their embedding cosine
    similarity is below ``SIMILARITY_THRESHOLD`` an error dict is returned and
    no classification is attempted.

    Args:
        statement: Headline / claim text.
        body: Article body text.

    Returns:
        dict: ``{"error", "semantic_similarity"}`` when the pair is rejected,
        otherwise ``{"prediction", "confidence", "semantic_similarity"}`` with
        ``prediction`` "REAL" for class 1 and "FAKE" otherwise, ``confidence``
        the top class probability as a percentage (2 decimals), and the
        similarity rounded to 3 decimals.
    """
    title = clean_text(statement)
    article = clean_text(body)

    score = semantic_similarity(title, article)
    rounded_score = round(score, 3)

    # Guard clause: refuse to classify pairs that look unrelated.
    if score < SIMILARITY_THRESHOLD:
        return {
            "error": "Title and body seem unrelated.",
            "semantic_similarity": rounded_score,
        }

    # The classifier sees a single embedding of the title joined with the body.
    features = sbert.encode([title + " " + article])
    predicted = model.predict(features)[0]
    class_probs = model.predict_proba(features)[0]

    return {
        "prediction": "REAL" if predicted == 1 else "FAKE",
        "confidence": round(float(max(class_probs)) * 100, 2),
        "semantic_similarity": rounded_score,
    }


# Smoke-test the embedding-based pipeline on one headline / article-body pair.
stam = "Did Gautam Gambhir Confirm Virat Kohli, Rohit Sharma For 2027 ODI World Cup?"
body = "A viral video shows Indian cricket coach Gautam Gambhir praising Virat Kohli’s form and saying that Rohit Sharma and Kohli will play for India in the 2027 ODI World Cup."

result = predict_news(statement=stam, body=body)
print(result)

{'prediction': 'REAL', 'confidence': 51.44, 'semantic_similarity': 0.762}
