In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [3]:

# Read each source file and tag its rows with the class label right away:
# 1 = true news, 0 = fake news.
true_news = pd.read_csv('True.csv').assign(label=1)
fake_news = pd.read_csv('Fake.csv').assign(label=0)

# Stack both frames into one dataframe with a fresh, continuous index
news_data = pd.concat([true_news, fake_news], ignore_index=True)


In [4]:

# Small hand-picked set of very common English words that are dropped during
# cleaning (swap in a larger list if needed).
stopwords = {"the", "and", "a", "is", "in", "it", "for", "to", "of", "that", "on", "this", "with", "as", "by", "at", "from", "an", "be"}

# Data Preprocessing function
def preprocess_text(text):
    """Normalise one document into lowercase, stopword-free tokens.

    Parameters
    ----------
    text : str, None or any scalar
        Raw document. ``None`` and float NaN (what pandas yields for a
        missing cell) are treated as an empty document; any other
        non-string value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The lowercase word tokens of ``text`` that are not in ``stopwords``,
        joined by single spaces. Empty string when nothing is left.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Maximal runs of word characters are exactly the tokens obtained by
    # blanking out every non-word character and splitting on whitespace.
    tokens = re.findall(r'\w+', text.lower())
    return ' '.join(token for token in tokens if token not in stopwords)


In [5]:

# Build the model input: headline followed by article body, then cleaned.
# The untouched body is kept in 'raw_text' the first time this cell runs so
# that re-running the cell does not prepend the title to already-merged text.
if 'raw_text' not in news_data.columns:
    news_data['raw_text'] = news_data['text']

# A missing title or body would turn the whole concatenation into NaN and
# discard the other field, so blanks stand in for missing cells.
combined_text = news_data['title'].fillna('') + ' ' + news_data['raw_text'].fillna('')
news_data['text'] = combined_text.apply(preprocess_text)

# Features are the cleaned text, targets are the 0/1 labels
X, y = news_data['text'], news_data['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two-stage pipeline: TF-IDF features (max_features capped at 5000 to limit
# overfitting) feeding a logistic-regression classifier with default settings
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('model', LogisticRegression()),
])


In [6]:

# Fit on the training split, then predict the held-out split
# (fit returns the fitted pipeline, so the two calls can be chained)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Share of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

# Persist the fitted pipeline to disk.
# NOTE(review): only the TF-IDF and classifier steps live in this object; the
# text cleaning is applied to the dataframe beforehand, so code that loads
# the pickle presumably has to clean its input the same way - confirm.
with open('news_classifier.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)


Model Accuracy: 98.88%
