In [None]:
from google.colab import drive,files
# Attach Google Drive to the Colab filesystem so files under it can be read.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
import joblib
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# Fetch the NLTK stopword corpus (used later by the preprocessing step).
nltk.download('stopwords')

# Location of the WELFake CSV inside the mounted Google Drive.
# Adjust this to wherever the file lives in your own Drive.
file_path = '/content/drive/MyDrive/WELFake_Dataset.csv'

# Read the CSV at that location into a DataFrame.
news_dataset = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check of the load.
print(news_dataset.head(n=5))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  


In [None]:
# Show the English stopword list that preprocessing will filter against.
print(stopwords.words(fileids='english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
# Remove the leftover CSV index column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError because
# the column is already gone.
news_dataset = news_dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Count the missing values in each column.
print(news_dataset.isna().sum(axis=0))

title    558
text      39
label      0
dtype: int64


In [None]:
# Substitute an empty string for every missing value so that the string
# concatenation performed later never meets a NaN.
news_dataset = news_dataset.fillna(value='')

In [None]:
# Build the 'content' column: title, a single space, then the article text.
news_dataset['content'] = news_dataset['title'].add(' ').add(news_dataset['text'])

In [None]:
# Model input is the merged 'content' column; the target is the 'label' column.
X, Y = news_dataset['content'], news_dataset['label']

print("Successfully created 'content' column by combining 'title' and 'text'.")

Successfully created 'content' column by combining 'title' and 'text'.


In [None]:
# Module-level helpers read by enhanced_preprocessing: a single stemmer
# instance and the English stopwords held in a set for fast membership tests.
port_stem = PorterStemmer()
stop_words = {*stopwords.words('english')}

def enhanced_preprocessing(content):
    """Turn raw article text into a stemmed string plus stylistic marker tokens.

    The text is reduced to ASCII letters, lower-cased, stripped of English
    stopwords and stemmed. Four pseudo-tokens describing the writing style of
    the *original* text are then appended:

    * ``EXCL_<n>``   - number of ``!`` characters, capped at 10
    * ``QUEST_<n>``  - number of ``?`` characters, capped at 10
    * ``CAPS_<n>``   - number of all-uppercase words longer than one letter,
      capped at 5
    * ``QUOTES_<n>`` - number of ``"`` and ``'`` characters, capped at 10

    Relies on the module-level ``port_stem`` and ``stop_words`` objects.

    Args:
        content: The raw text (title and body) of one article.

    Returns:
        The stemmed text followed by a space and the four marker tokens.

    Note:
        The ALL-CAPS count used to be computed on the already lower-cased
        words, so it was always 0. It is now computed on the original casing,
        which changes the emitted ``CAPS_`` token; any model fitted on the old
        output has to be refitted.
    """
    # Everything that is not an ASCII letter becomes a separator.
    letters_only = re.sub('[^a-zA-Z]', ' ', content)

    # Tokenise BEFORE lower-casing so the original casing is still available
    # for the ALL-CAPS feature further down.
    cased_tokens = letters_only.split()
    words = [token.lower() for token in cased_tokens]

    # Stopword removal followed by stemming.
    stemmed_words = [port_stem.stem(word) for word in words if word not in stop_words]
    processed_text = ' '.join(stemmed_words)

    # Punctuation-based markers are counted on the untouched input.
    features = [
        f"EXCL_{min(content.count('!'), 10)}",
        f"QUEST_{min(content.count('?'), 10)}",
    ]

    # ALL-CAPS words: single letters are ignored.
    all_caps = sum(1 for token in cased_tokens if len(token) > 1 and token.isupper())
    features.append(f"CAPS_{min(all_caps, 5)}")

    # Straight double and single quote characters (apostrophes included).
    quotes = content.count('"') + content.count("'")
    features.append(f"QUOTES_{min(quotes, 10)}")

    return processed_text + ' ' + ' '.join(features)

print("Applying enhanced preprocessing...")
# Always start from the raw 'content' column rather than from X itself, so
# re-running this cell does not preprocess already-preprocessed text (which
# would feed the appended marker tokens back through the stemmer).
X = news_dataset['content'].apply(enhanced_preprocessing)

Applying enhanced preprocessing...


In [None]:
# Hold out 20% of the rows for testing, keeping the label proportions equal in
# both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [None]:
# TF-IDF features over unigrams and bigrams.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    sublinear_tf=True,
    min_df=3,      # ignore terms found in fewer than 3 documents
    max_df=0.95,   # ignore terms found in more than 95% of documents
)

# Linear SVM with L2 penalty and squared-hinge loss, solved in the primal
# (dual=False), with class weights balanced by label frequency.
classifier = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    dual=False,
    C=0.25,
    class_weight='balanced',
    max_iter=10000,
)

# Chain both steps so raw strings go in and labels come out.
pipeline = make_pipeline(vectorizer, classifier)

# Fit the vectorizer + classifier pipeline on the training split.
print("Training model...")
pipeline.fit(X_train, Y_train)
print("Model training complete.")

# Score the held-out split and report the accuracy between two rulers.
preds = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=preds)
print("\n" + "="*50, f"Final Accuracy: {accuracy:.4f}", "="*50, sep="\n")


Training model...
Model training complete.

Final Accuracy: 0.9755


In [None]:
# Serialise the fitted pipeline to disk, then hand the file to the browser.
model_filename = 'fake_news_model.joblib'
joblib.dump(value=pipeline, filename=model_filename)
print(f"Model saved as {model_filename}. Starting download...")

files.download(model_filename)

Model saved as fake_news_model.joblib. Starting download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def predict_news_veracity(headline, content):
    """Classify one article with the fitted ``pipeline`` and describe the result.

    The headline and body are joined with a single space, passed through
    ``enhanced_preprocessing`` and then through the module-level ``pipeline``.

    Args:
        headline: Article title. ``None`` or an empty string is treated as a
            missing title, mirroring the empty-string fill used for training.
        content: Article body. ``None`` is treated as an empty string.

    Returns:
        ``"Prediction: This is likely Fake News."`` when the predicted label
        is 1, otherwise ``"Prediction: This is likely Real News."``.

    NOTE(review): the label meaning is inferred from the dataset preview
    printed earlier in this notebook, where sensational all-caps headlines
    carry label 1 and a mainstream news report carries label 0. The previous
    mapping (1 -> real) reported a genuine wire story as fake. Confirm the
    convention against the dataset's own documentation.
    """
    # Missing parts become empty strings so concatenation cannot fail on None.
    combined = (headline or "") + " " + (content or "")
    processed = enhanced_preprocessing(combined)
    prediction = pipeline.predict([processed])[0]

    # Label 1 = fake, label 0 = real (see NOTE above).
    if prediction == 1:
        return "Prediction: This is likely Fake News."
    return "Prediction: This is likely Real News."


# Sample article (headline and body) used to try the classifier by hand.
my_headline = "US trade team’s India visit postponed as Trump hikes tariffs, deal stuck on agri: Report"
my_content = """A planned visit by a US delegation to India later this month for the next round of talks on a proposed bilateral trade agreement is likely to be deferred, PTI reported on Saturday, citing an official who did not wish to be named.

The sixth round of negotiations between the two countries was scheduled to take place in New Delhi from August 25 to 29. So far, five rounds have been completed.“This visit is likely to be rescheduled,” the official told PTI.

The development comes days after US President Donald Trump announced fresh tariffs on India, including an additional 25 per cent duty linked to New Delhi’s continued purchase of Russian oil, taking the total tariff burden to 50 per cent.

While negotiations are underway, Washington has been pushing for greater market access in politically sensitive areas such as dairy and agriculture. India, however, has made it clear that it will not accept terms that affect the livelihood of its farmers and cattle rearers.

Both countries have expressed intent to conclude the first phase of the bilateral trade agreement (BTA) by the fall, with an aim to double bilateral trade from the current USD 191 billion to USD 500 billion by 2030."""
# Run the sample through the classifier and show the verdict.
my_prediction = predict_news_veracity(headline=my_headline, content=my_content)
print(my_prediction)

Prediction: This is likely Fake News.
