In [11]:
!pip install nltk scikit-learn chardet




In [12]:
import pandas as pd
import numpy as np
import re
import nltk
import chardet

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import pickle


In [13]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# Detect file encoding
with open("fake_news.csv", "rb") as f:
    encoding = chardet.detect(f.read())['encoding']

print("Detected Encoding:", encoding)

# Load dataset
df = pd.read_csv("fake_news.csv", encoding=encoding)

print(df.head())
print(df.columns)


Detected Encoding: utf-8
                 author                      published  \
0     Barracuda Brigade  2016-10-26T21:41:00.000+03:00   
1  reasoning with facts  2016-10-29T08:47:11.259+03:00   
2     Barracuda Brigade  2016-10-31T01:41:49.479+02:00   
3                Fed Up  2016-11-01T05:22:00.000+02:00   
4                Fed Up  2016-11-01T21:56:00.000+02:00   

                                               title  \
0  muslims busted they stole millions in govt ben...   
1  re why did attorney general loretta lynch plea...   
2  breaking weiner cooperating with fbi on hillar...   
3  pin drop speech by father of daughter kidnappe...   
4  fantastic trumps  point plan to reform healthc...   

                                                text language  \
0  print they should pay all the back all the mon...  english   
1  why did attorney general loretta lynch plead t...  english   
2  red state  \nfox news sunday reported this mor...  english   
3  email kayla mueller was a 

In [15]:
print(df.columns)


Index(['author', 'published', 'title', 'text', 'language', 'site_url',
       'main_img_url', 'type', 'label', 'title_without_stopwords',
       'text_without_stopwords', 'hasImage'],
      dtype='object')


In [16]:
# Remove unwanted index columns
df = df.loc[:, ~df.columns.str.contains('Unnamed')]


In [17]:
possible_text_cols = ['text', 'content', 'statement', 'news', 'article']

text_col = None
for col in possible_text_cols:
    if col in df.columns:
        text_col = col
        break

print("Text column:", text_col)


Text column: text


In [18]:
possible_label_cols = ['label', 'class', 'target']

label_col = None
for col in possible_label_cols:
    if col in df.columns:
        label_col = col
        break

print("Label column:", label_col)


Label column: label


In [19]:
if 'title' in df.columns and text_col:
    df['text'] = df['title'] + " " + df[text_col]
else:
    df['text'] = df[text_col]

df['label'] = df[label_col]


In [20]:
df = df[['text', 'label']]


In [21]:
print(df.isnull().sum())
df.dropna(inplace=True)

print(df['label'].value_counts())


text     46
label     1
dtype: int64
label
Fake    1292
Real     758
Name: count, dtype: int64


In [22]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\\S+|www\\S+", "", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    words = text.split()
    words = [stemmer.stem(w) for w in words if w not in stop_words]
    return " ".join(words)


In [23]:
df['clean_text'] = df['text'].apply(clean_text)


In [24]:
vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(df['clean_text'])
y = df['label']


In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [26]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [27]:
cv_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring='f1'
)

print("Average F1 Score:", cv_scores.mean())



Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_response.py", line 207, in _get_response_values
    raise ValueError(
ValueError: pos_label=1 is not a valid label: It should be one of ['Fake' 'Real']



Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "/usr/lo

Average F1 Score: nan



Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_response.py", line 207, in _get_response_values
    raise ValueError(
ValueError: pos_label=1 is not a valid label: It should be one of ['Fake' 'Real']



Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "/usr/lo

In [31]:
df['label'] = df['label'].map({
    'Fake': 0,
    'Real': 1
})


In [32]:
print(df['label'].value_counts())
print(df['label'].unique())


label
0    1292
1     758
Name: count, dtype: int64
[1 0]


In [33]:
# Save the trained Logistic Regression model
pickle.dump(model, open("model.pkl", "wb"))

# Save the TF-IDF vectorizer
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [34]:
def predict_news(text):
    # Clean text
    cleaned = clean_text(text)

    # Transform text using saved vectorizer
    vector = vectorizer.transform([cleaned])

    # Predict label
    prediction = model.predict(vector)[0]

    # Predict confidence (probability)
    confidence = max(model.predict_proba(vector)[0])

    # Return readable result
    return ("REAL" if prediction == 1 else "FAKE"), confidence


In [35]:
sample_text = "Scientists discovered water on Mars."
result, confidence = predict_news(sample_text)

print("Prediction:", result)
print("Confidence:", confidence)


Prediction: FAKE
Confidence: 0.7184319855056116


In [36]:
import streamlit as st
import pickle
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Load saved model and vectorizer
model = pickle.load(open("model.pkl", "rb"))
vectorizer = pickle.load(open("vectorizer.pkl", "rb"))

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\\S+|www\\S+", "", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    words = text.split()
    words = [stemmer.stem(w) for w in words if w not in stop_words]
    return " ".join(words)

# Streamlit App
st.title("Fake News Detection System")
user_input = st.text_area("Enter social media news/post here:")

if st.button("Analyze"):
    cleaned = clean_text(user_input)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)[0]
    confidence = max(model.predict_proba(vector)[0])

    if prediction == 1:
        st.success(f"REAL NEWS (Confidence: {confidence:.2f})")
    else:
        st.error(f"FAKE NEWS (Confidence: {confidence:.2f})")


2026-02-01 13:06:45.493 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2026-02-01 13:06:45.501 Session state does not function when running a script without `streamlit run`


In [38]:
!streamlit run app.py


Usage: streamlit run [OPTIONS] [TARGET] [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: app.py


In [42]:
from google.colab import files

files.download('app.py')
files.download('model.pkl')
files.download('vectorizer.pkl')


FileNotFoundError: Cannot find file: app.py