In [3]:
from google.colab import files

# Upload files manually from your system
uploaded = files.upload()


Saving Fake.csv to Fake.csv
Saving True.csv to True.csv


In [4]:
import pandas as pd

fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')

print("🔴 FAKE NEWS EXAMPLE:")
print(fake.head())

print("\n🟢 TRUE NEWS EXAMPLE:")
print(true.head())


🔴 FAKE NEWS EXAMPLE:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  

🟢 TRUE NEWS EXAMPLE:
                                               title  \
0  As U.S. budget fight looms, Republican

In [5]:
# Step 1: Add a new column 'label' to each dataframe
fake['label'] = 0  # 0 = fake news
true['label'] = 1  # 1 = real news

# Step 2: Combine both dataframes
data = pd.concat([fake, true])

# Step 3: Shuffle the data (important for training)
data = data.sample(frac=1).reset_index(drop=True)

# Step 4: View the combined data
print("📰 Combined Data:")
print(data.head())

# Optional: Check how many real and fake samples we have
print("\n🧮 Label Counts:")
print(data['label'].value_counts())


📰 Combined Data:
                                               title  \
0   Watch What Happens When This Guy Tries To Pic...   
1  Trump, Abe didn't discuss bilateral FTA: Japan...   
2  Obama spars with Cuba's Castro over human righ...   
3   Trump Announces That Surrogates Won’t Appear ...   
4  McCain warns Trump over staffing Pentagon with...   

                                                text       subject  \
0  Usually, men have all the advantages   they ma...          News   
1  TOKYO (Reuters) - U.S. President Donald Trump ...  politicsNews   
2  HAVANA (Reuters) - U.S. President Barack Obama...  politicsNews   
3  CNN viewers will no longer be entertained by t...          News   
4  WASHINGTON (Reuters) - Senator John McCain war...  politicsNews   

                 date  label  
0      April 11, 2016      0  
1   November 6, 2017       1  
2     March 21, 2016       1  
3    February 1, 2017      0  
4  November 16, 2017       1  

🧮 Label Counts:
label
0    23481
1    

In [6]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Set of common stopwords
stop_words = set(stopwords.words('english'))

# Clean text function
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = ' '.join(word for word in text.split() if word not in stop_words)  # remove stopwords
    return text

# Apply cleaning to the 'text' column
data['clean_text'] = data['text'].apply(clean_text)

# View sample cleaned text
print(data[['text', 'clean_text']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                                text  \
0  Usually, men have all the advantages   they ma...   
1  TOKYO (Reuters) - U.S. President Donald Trump ...   
2  HAVANA (Reuters) - U.S. President Barack Obama...   
3  CNN viewers will no longer be entertained by t...   
4  WASHINGTON (Reuters) - Senator John McCain war...   

                                          clean_text  
0  usually men advantages make women automaticall...  
1  tokyo reuters us president donald trump japane...  
2  havana reuters us president barack obama pushe...  
3  cnn viewers longer entertained car crashcalibe...  
4  washington reuters senator john mccain warned ...  


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df=0.7)

# Fit and transform the clean_text column
X = vectorizer.fit_transform(data['clean_text'])

# Target variable
y = data['label']


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Step 1: Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Train model
model = PassiveAggressiveClassifier(max_iter=1000)
model.fit(X_train, y_train)

# Step 3: Predict on test data
y_pred = model.predict(X_test)

# Step 4: Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Model Accuracy: {accuracy * 100:.2f}%")

# Step 5: Confusion Matrix
print("\n📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


✅ Model Accuracy: 99.58%

📊 Confusion Matrix:
[[4715   21]
 [  17 4227]]


In [9]:
def test_news(news_text):
    # Clean the input text using same method
    cleaned = clean_text(news_text)

    # Convert to TF-IDF vector
    vector = vectorizer.transform([cleaned])

    # Predict
    prediction = model.predict(vector)

    # Show result
    if prediction[0] == 0:
        print("🔴 This news is likely: FAKE")
    else:
        print("🟢 This news is likely: REAL")

# 🔍 Try with a sample news text
test_news("Prime Minister announces new scheme to help farmers in rural areas.")


🟢 This news is likely: REAL


In [12]:
# Clean text function (same as earlier, repeat it here)
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Prediction function
def predict_news(news):
    # Clean the input text
    cleaned = clean_text(news)

    # Transform to TF-IDF
    vector = vectorizer.transform([cleaned])

    # Predict using trained model
    prediction = model.predict(vector)

    # Display result
    if prediction[0] == 0:
        print("🔴 This news is most likely: FAKE")
    else:
        print("🟢 This news is most likely: REAL")


In [13]:
predict_news("Aliens have landed in Chennai and are demanding biryani!")

🔴 This news is most likely: FAKE


In [14]:
predict_news("Government to ban all vegetables from next Monday.")
predict_news("Scientists confirm that chocolate cures COVID-19 completely.")
predict_news("Aliens seen dancing on Mount Everest in viral video.")
predict_news("Drinking 20 cups of coffee per day makes you immortal, study says.")


🟢 This news is most likely: REAL
🔴 This news is most likely: FAKE
🔴 This news is most likely: FAKE
🔴 This news is most likely: FAKE
