In [5]:
import pandas as pd
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk

# Download only stopwords (bypass punkt issues)
nltk.download('stopwords', quiet=True)

# Load both corpora and attach the target label: 0 = real article, 1 = fake.
real = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")
real['label'] = 0
fake['label'] = 1

# Shuffle with a fixed seed: an unseeded shuffle changes the row order on
# every run, so any later seeded split would still select different rows
# each time and results could not be reproduced.
df = (
    pd.concat([real, fake], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Custom tokenizer without NLTK punkt
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_stop_words_cache = None


def _english_stop_words():
    """Return the English stop-word set, building it once on first use."""
    global _stop_words_cache
    if _stop_words_cache is None:
        raw_words = stopwords.words('english')
        # The tokenizer strips punctuation *before* filtering, so a
        # contraction such as "don't" reaches the filter as "dont". Keep the
        # punctuation-free spelling next to the original so those tokens are
        # filtered as well.
        stripped_words = (_PUNCTUATION_RE.sub('', word) for word in raw_words)
        _stop_words_cache = frozenset(raw_words).union(stripped_words)
    return _stop_words_cache


def custom_tokenizer(text):
    """Split *text* into lowercase word tokens.

    Punctuation (anything that is neither a word character nor whitespace)
    is removed, the text is split on whitespace, and tokens that are English
    stop words or shorter than three characters are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # Looked up lazily so the corpus is only read when the tokenizer is
    # actually used, and only once rather than on every call.
    stop_words = _english_stop_words()
    words = _PUNCTUATION_RE.sub('', text.lower()).split()
    return [w for w in words if len(w) > 2 and w not in stop_words]

# Preprocessing pipeline
# Empty CSV fields are parsed as NaN (a float), which has no .lower();
# treat missing articles as empty text instead of crashing.
df['clean_text'] = (
    df['text']
    .fillna('')
    .astype(str)
    .apply(lambda x: ' '.join(custom_tokenizer(x)))
)
y = df['label']

# Split BEFORE fitting the vectorizer. Fitting TF-IDF on the full corpus
# lets the vocabulary and IDF weights see the held-out rows, which leaks
# test information into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, random_state=42
)

# Feature engineering
# token_pattern=None: the pattern is unused when a tokenizer is supplied, and
# scikit-learn warns about it unless it is explicitly disabled.
vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    token_pattern=None,
    max_features=5000,
)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Report performance on the rows the model and vectorizer never saw.
print(f"Held-out accuracy: {model.score(X_test, y_test):.4f}")

# Save model
# NOTE: the vectorizer is pickled with a reference to custom_tokenizer, so
# that function must be defined/importable wherever vectorizer.pkl is loaded.
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

print("Model training completed successfully!")




Model training completed successfully!
