In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import string
import pickle
import nltk
import lightgbm as lgb
import json

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Load datasets
fake = pd.read_csv(r"c:\Users\ROBIN\Desktop\FAKE.csv")
real = pd.read_csv(r"c:\Users\ROBIN\Desktop\REAL.csv")

# Add labels and combine datasets
fake['target'] = 'fake'
real['target'] = 'real'
data = pd.concat([fake, real]).reset_index(drop=True).sample(frac=1, random_state=42)

# Drop unnecessary columns
data.drop(["date", "title"], axis=1, inplace=True)

# Text preprocessing function
def clean_text(text):
    if pd.isna(text):  
        return ''
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]
    return ' '.join(tokens)

# Apply text preprocessing
data['cleaned_text'] = data['text'].fillna('').apply(clean_text)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'], data['target'], test_size=0.2, stratify=data['target'], random_state=42
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000, tokenizer=word_tokenize, token_pattern=None)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Label Encoding
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Dictionary to store models and accuracy
models = {}
accuracies = {}

# LightGBM Model
params = {
    'objective': 'binary', 'boosting_type': 'gbdt', 'metric': ['binary_error', 'auc'],
    'num_leaves': 63, 'learning_rate': 0.05, 'feature_fraction': 0.8,
    'bagging_fraction': 0.8, 'bagging_freq': 5, 'verbose': -1, 'class_weight': 'balanced'
}
train_data = lgb.Dataset(X_train_tfidf, label=y_train_encoded)
test_data = lgb.Dataset(X_test_tfidf, label=y_test_encoded, reference=train_data)
models['LightGBM'] = lgb.train(
    params, train_data, num_boost_round=1000, valid_sets=[train_data, test_data],
    valid_names=['train', 'test'], callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)]
)
y_pred = (models['LightGBM'].predict(X_test_tfidf, num_iteration=models['LightGBM'].best_iteration) >= 0.5).astype(int)
accuracies['LightGBM'] = accuracy_score(y_test_encoded, y_pred)

# Define other models
model_definitions = {
    'Random Forest': RandomForestClassifier(n_estimators=50, criterion="entropy"),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    'SVM': svm.SVC(kernel='linear', probability=True)
}

# Train and evaluate other models
for model_name, model in model_definitions.items():
    model.fit(X_train_tfidf, y_train_encoded)
    y_pred = model.predict(X_test_tfidf)
    accuracies[model_name] = accuracy_score(y_test_encoded, y_pred)
    models[model_name] = model

# Save models, vectorizer, and label encoder
for model_name, model in models.items():
    with open(f"{model_name.replace(' ', '_').lower()}_model.pkl", 'wb') as f:
        pickle.dump(model, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Save accuracies for frontend display
with open('model_accuracies.json', 'w') as f:
    json.dump(accuracies, f)

print("All models and vectorizer saved successfully.")
print("\nModel Accuracies:")
for model, acc in accuracies.items():
    print(f"{model}: {acc:.4f}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ROBIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ROBIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training until validation scores don't improve for 50 rounds
[50]	train's binary_error: 0.00360036	train's auc: 0.999865	test's binary_error: 0.0042	test's auc: 0.999724
Early stopping, best iteration is:
[45]	train's binary_error: 0.00365037	train's auc: 0.999845	test's binary_error: 0.0038	test's auc: 0.999753
All models and vectorizer saved successfully.

Model Accuracies:
LightGBM: 0.9962
Random Forest: 0.9940
Logistic Regression: 0.9822
Naive Bayes: 0.9306
Decision Tree: 0.9948
K-Nearest Neighbors: 0.7154
AdaBoost: 0.9930
SVM: 0.9910
