In [None]:
# Install libraries if not already installed
!pip install pandas numpy matplotlib seaborn scikit-learn


In [None]:
import pandas as pd
import zipfile
import os

# Location of the uploaded archive and the directory it is unpacked into
zip_path = '/content/archive (2).zip'
extract_path = '/content/'

# Make sure the extraction directory exists before unpacking
os.makedirs(extract_path, exist_ok=True)

# Unpack the archive. On failure, print guidance for the user and then re-raise:
# without the extracted CSVs `df` is never created, so continuing would only
# surface later as an unrelated NameError in the following cells.
try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
except zipfile.BadZipFile:
    print(f"Error: The file at {zip_path} is not a valid zip file or is corrupted.")
    print("Please re-upload the zip file and try again.")
    raise
except FileNotFoundError:
    print(f"Error: The file at {zip_path} was not found.")
    print("Please ensure the file is uploaded to the correct location.")
    raise

# Load the two datasets from the extracted files
fake = pd.read_csv(os.path.join(extract_path, 'Fake.csv'))
true = pd.read_csv(os.path.join(extract_path, 'True.csv'))

# Add labels: 0 for Fake, 1 for True
fake['label'] = 0
true['label'] = 1

# Combine both datasets into a single frame with a fresh index
df = pd.concat([fake, true], ignore_index=True)

# Drop columns that are not used by the rest of the notebook
df = df.drop(['subject', 'date'], axis=1)

# Shuffle the rows (fixed seed for reproducibility) so the two classes are
# interleaved rather than stacked one after the other
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check structure
print("Dataset shape:", df.shape)
print(df.head())

In [None]:
import zipfile

# Print the names of all members stored inside the uploaded archive
archive = zipfile.ZipFile('/content/archive (2).zip', 'r')
try:
    print(archive.namelist())
finally:
    # Always release the file handle, even if listing fails
    archive.close()

In [None]:
import string
import re

# Pre-compiled patterns used by clean_text, listed in the order they are applied
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_TOKEN_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text(text):
    """Normalise a raw article string for vectorisation.

    Steps, in order: lowercase; drop ``[...]`` bracketed spans; drop URLs
    (``http(s)://...`` and ``www....``); drop ``<...>`` tags; drop ASCII
    punctuation; turn newlines into spaces; drop every token that contains
    a digit. Whitespace left behind by removals is not collapsed.

    Parameters
    ----------
    text : str or any
        The raw text. Any non-string value (for example ``None`` or the
        float NaN pandas uses for empty CSV cells) is treated as a missing
        document.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as None/NaN rather than str; calling .lower() on
    # them would raise AttributeError, so map them to an empty document.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = text.replace('\n', ' ')
    text = _TOKEN_WITH_DIGIT_RE.sub('', text)
    return text

# Run clean_text over every article body and store the result back in 'text'
cleaned_text = df['text'].apply(clean_text)
df['text'] = cleaned_text

# Show the first five cleaned articles as the cell output
df['text'].head(5)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Labels to predict, taken from the 'label' column
y = df['label']

# TF-IDF vectorizer: English stop words are removed and terms appearing in
# more than 70% of documents are ignored
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary from the cleaned text and build the feature matrix
X = vectorizer.fit_transform(df['text'])


In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned texts and labels into training and testing sets (80/20 split).
# The documents themselves are split (not a matrix vectorised from the whole
# corpus) so the test articles stay unseen while the features are fitted.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Re-fit the TF-IDF vectorizer on the training texts only, then apply that
# fitted vocabulary and IDF weighting to the held-out texts. Fitting on the
# full corpus would leak test-set statistics into the features and make the
# evaluation optimistic.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Fit a logistic regression classifier (up to 1000 solver iterations)
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predicted labels for the held-out set
y_pred_lr = lr_model.predict(X_test)

# Report each metric on the held-out set, one per line
print("🔷 Logistic Regression:")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_lr))


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training features
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predicted labels for the held-out set
y_pred_nb = nb_model.predict(X_test)

# Report each metric on the held-out set, one per line
print("\n🔷 Naive Bayes:")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_nb))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the naive Bayes predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_nb)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Naive Bayes - Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


In [None]:
# List the entries of /content/ via the shell.
# NOTE(review): presumably a leftover check that the CSVs were extracted — confirm whether this cell is still needed.
!ls /content/

In [None]:
import numpy as np

# Learned weights of the logistic regression model (first row of coef_)
coefficients = lr_model.coef_[0]

# Vocabulary terms of the TF-IDF vectorizer, one per feature column
feature_names = vectorizer.get_feature_names_out()

# Pair every term with its weight in a two-column table
coef_df = pd.DataFrame(dict(word=feature_names, coefficient=coefficients))


In [None]:
# Ten most negative coefficients: words pushing towards FAKE (label 0)
top_fake = coef_df.sort_values(by='coefficient', ascending=True).iloc[:10]

# Ten most positive coefficients: words pushing towards REAL (label 1)
top_real = coef_df.sort_values(by='coefficient', ascending=False).iloc[:10]

# Print each table under its heading
for heading, table in (
    ("\n🔴 Top words predicting FAKE news:", top_fake),
    ("\n🟢 Top words predicting REAL news:", top_real),
):
    print(heading)
    print(table)


In [None]:
import matplotlib.pyplot as plt

# One horizontal bar chart per class: fake first (red), then real (green).
# The y-axis is inverted so the first row of each table is drawn at the top.
for words_table, bar_color, chart_title in (
    (top_fake, 'red', 'Top 10 Words Indicative of Fake News'),
    (top_real, 'green', 'Top 10 Words Indicative of Real News'),
):
    plt.figure(figsize=(10, 5))
    plt.barh(words_table['word'], words_table['coefficient'], color=bar_color)
    plt.title(chart_title)
    plt.xlabel('Coefficient')
    plt.gca().invert_yaxis()
    plt.show()
