In [8]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import csv  # Required for quoting option

# Download stopwords if not present
nltk.download('stopwords')
#upload
fake = pd.read_csv('Fake.csv', on_bad_lines='skip', quoting=csv.QUOTE_NONE, encoding='utf-8')
true = pd.read_csv('True.csv', on_bad_lines='skip', quoting=csv.QUOTE_NONE, encoding='utf-8')

# Print shapes to confirm loading
print("Fake News Shape:", fake.shape)
print("True News Shape:", true.shape)


# Combine the datasets
fake['label'] = 0
true['label'] = 1
data = pd.concat([fake, true], axis=0).reset_index(drop=True)

# Fill missing values in 'title' or 'text' with a placeholder
data['title'] = data['title'].fillna('')
data['text'] = data['text'].fillna('')


# Combine title and text columns
data['text'] = data['title'] + " " + data['text']

# Ensure 'text' is string type
data['text'] = data['text'].astype(str)

# Preprocessing function
import string
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing
data['text'] = data['text'].apply(clean_text)

# Remove rows where 'text' is empty after preprocessing
data = data[data['text'] != '']
print(f"\nShape after removing empty text rows: {data.shape}")


# Define X and y
X = data['text']
y = data['label']

# Ready for vectorization and modeling


# Vectorize text
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fake News Shape: (1529, 4)
True News Shape: (1575, 4)

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.91      0.03      0.06       324
           1       0.49      1.00      0.65       297

    accuracy                           0.49       621
   macro avg       0.70      0.51      0.36       621
weighted avg       0.71      0.49      0.34       621

Accuracy: 0.49
