In [2]:
import os
import re
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
%pip install 

Note: you may need to restart the kernel to use updated packages.


ERROR: You must give at least one requirement to install (see "pip help install")


In [4]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    return text

In [5]:
def main():
    os.makedirs("models", exist_ok=True)

    print("Loading data...")
    fake_data = pd.read_csv("data/dataset_fake.csv", header=None, names=["id", "text"], usecols=["text"], quotechar='"', on_bad_lines="skip")
    real_data = pd.read_csv("data/dataset_true.csv", header=None, names=["id", "text"], usecols=["text"], quotechar='"', on_bad_lines="skip")

    fake_data["label"] = 0
    real_data["label"] = 1

    data = pd.concat([fake_data, real_data], ignore_index=True)
    data = data.sample(frac=1, random_state=42).reset_index(drop=True)
    data["text"] = data["text"].fillna("")

    print("Cleaning text data...")
    data["text"] = data["text"].apply(clean_text)

    print("Vectorizing text data...")
    tfidf = TfidfVectorizer(stop_words='english', max_df=0.7, min_df=5, ngram_range=(1, 2))
    X_tfidf = tfidf.fit_transform(data["text"])
    y = data["label"]

    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

    print("Training models...")
    models = {
        "random_forest": RandomForestClassifier(n_estimators=200, max_depth=None, n_jobs=-1, random_state=42),
        "logistic_regression": LogisticRegression(max_iter=1000, random_state=42),
        "xgboost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        accuracy = accuracy_score(y_test, preds)
        results[name] = (model, accuracy)
        print(f"{name.replace('_', ' ').title()} Accuracy: {accuracy:.4f}")

    best_model_name, (best_model, best_accuracy) = max(results.items(), key=lambda item: item[1][1])
    print(f"\nBest model: {best_model_name} with accuracy {best_accuracy:.4f}")

    with open("models/fake_news_model.pkl", "wb") as model_file:
        pickle.dump(best_model, model_file)

    with open("models/tfidf_vectorizer.pkl", "wb") as vectorizer_file:
        pickle.dump(tfidf, vectorizer_file)

    print("\nModel and vectorizer saved in 'models' folder.")
    print("\nClassification Report:")
    best_preds = best_model.predict(X_test)
    print(classification_report(y_test, best_preds))

In [6]:
if __name__ == "__main__":
    main()

Loading data...
Cleaning text data...
Vectorizing text data...
Training models...
Random Forest Accuracy: 0.9717
Logistic Regression Accuracy: 0.9749


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Xgboost Accuracy: 0.9803

Best model: xgboost with accuracy 0.9803

Model and vectorizer saved in 'models' folder.

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4558
           1       0.99      0.97      0.98      4431

    accuracy                           0.98      8989
   macro avg       0.98      0.98      0.98      8989
weighted avg       0.98      0.98      0.98      8989



In [7]:
import pandas as pd
import numpy as np
import pickle
import warnings

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")


In [10]:
df_true = pd.read_csv(r"C:\Users\DELL\Desktop\Sarfraz (Code_File)\PW Data Science\Project\fake-news-main\data\dataset_true.csv")
df_fake = pd.read_csv(r"C:\Users\DELL\Desktop\Sarfraz (Code_File)\PW Data Science\Project\fake-news-main\data\dataset_fake.csv")

print(df_true.shape, df_fake.shape)



(22097, 2) (22846, 2)


In [11]:
df_true['label'] = 1   # Real
df_fake['label'] = 0   # Fake


In [13]:
print(df_true.columns)
print(df_fake.columns)


Index(['0',
       'The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a ‚Äúfiscal conservative‚Äù on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS‚Äô ‚ÄúFace the Nation,‚Äù drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense ‚Äúdiscretionary‚Äù spending on programs that supp

In [21]:
import pandas as pd

df_true = pd.read_csv(r"C:\Users\DELL\Desktop\Sarfraz (Code_File)\PW Data Science\Project\fake-news-main\data\dataset_true.csv")
df_fake = pd.read_csv(r"C:\Users\DELL\Desktop\Sarfraz (Code_File)\PW Data Science\Project\fake-news-main\data\dataset_fake.csv")

# üî• Auto detect text column
def detect_text_column(df):
    for col in df.columns:
        if col.lower() in ["text", "content", "news", "article", "body", "headline", "title"]:
            return col
    # fallback: first string column
    return df.select_dtypes(include="object").columns[0]

true_col = detect_text_column(df_true)
fake_col = detect_text_column(df_fake)

print("Using TRUE column:", true_col)
print("Using FAKE column:", fake_col)

# Create common 'text' column
df_true["text"] = df_true[true_col]
df_fake["text"] = df_fake[fake_col]

# Labels
df_true["label"] = 1   # Real
df_fake["label"] = 0   # Fake

# Merge datasets
df = pd.concat([df_true, df_fake], axis=0)

# NOW this will NEVER fail
df = df[["text", "label"]]

df.dropna(inplace=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.head()


Using TRUE column: The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a ‚Äúfiscal conservative‚Äù on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS‚Äô ‚ÄúFace the Nation,‚Äù drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense ‚Äúdiscretionary‚Äù spending on programs that suppo

Unnamed: 0,text,label
0,Former Defense Secretary Robert Gates has judg...,0
1,USA TODAYHAVANA The wheels of Air Force One t...,0
2,Let s get real with some awesome truth from Ma...,0
3,Egypt said on Thursday its air force had hit 1...,1
4,"While in Warsaw, Poland, Obama commented on th...",0


In [22]:
df = pd.concat([df_true, df_fake], axis=0)
df = df[['text', 'label']]
df.dropna(inplace=True)

df = df.sample(frac=1).reset_index(drop=True)  # shuffle

df.head()


Unnamed: 0,text,label
0,22-yr old American Otto Warmbier was arrested ...,0
1,Another successful cleansing of our history ...,0
2,In 1993 Harry Reid wrote the Immigration Stabi...,0
3,"Michael Flynn, former national security advise...",1
4,I guess Trump missed the sticks and stones les...,0


In [24]:
from sklearn.model_selection import train_test_split

X = df["text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [25]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [26]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [27]:
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9671601914727819
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      4569
           1       0.97      0.96      0.97      4414

    accuracy                           0.97      8983
   macro avg       0.97      0.97      0.97      8983
weighted avg       0.97      0.97      0.97      8983



In [28]:
pickle.dump(model, open("model.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))

print("‚úÖ model.pkl & vectorizer.pkl saved successfully")


‚úÖ model.pkl & vectorizer.pkl saved successfully


In [29]:
sample_news = """
Scientists claim a new technology can charge electric vehicles
within ten minutes using low-cost batteries.
"""

sample_vec = vectorizer.transform([sample_news])
prediction = model.predict(sample_vec)[0]

print("Prediction:", "Real" if prediction == 1 else "Fake")


Prediction: Real
