<a href="https://colab.research.google.com/github/sagarparmar072000-bit/sagar/blob/main/Copy_of_Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, GlobalAveragePooling1D
import warnings

# Suppress every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# --- Data Loading (Placeholder) ---
# NOTE: This is a stand-in for the real Enron-Phish loader. Swap the mock
# samples below for something like:
#     df = pd.read_csv('your_enron_phish_dataset.csv')
print("Loading dataset...")
np.random.seed(42)

# Mock corpus as (message, label) pairs; label 1 = phishing, 0 = legitimate.
_samples = [
    ("Congratulations! You've won a free prize. Click here now!", 1),
    ("Phishing scam alert: Do not click this link.", 0),
    ("Your account has been suspended. Click this link to verify.", 1),
    ("Meeting at 2pm tomorrow. Please see the attached agenda.", 0),
    ("Please update your payment information to avoid service interruption.", 1),
    ("Regarding the project report for next week, let me know your thoughts.", 0),
    ("Please login to your bank account via the link below.", 1),
    ("Hello, just checking in. Hope you are well.", 0),
]

# Column-oriented view of the same samples, in the original order.
data = {
    'text': [message for message, _ in _samples],
    'label': [flag for _, flag in _samples],
}
df = pd.DataFrame(data)
print("Dataset loaded successfully.")

# --- Pre-processing ---
def preprocess_text(text):
    """Normalise a raw message string before tokenization.

    The text is lower-cased, every run of non-whitespace characters that
    begins with ``http`` or ``www`` is deleted, and then every character that
    is neither a word character nor whitespace is dropped. Whitespace left
    behind by the removals is not collapsed.
    """
    lowered = text.lower()
    # Strip links first, while their punctuation is still intact.
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    # Underscores count as word characters, so they survive this step.
    return re.sub(r'[^\w\s]', '', without_urls)

print("Pre-processing text data...")
df['text'] = df['text'].apply(preprocess_text)

# Split data
# Decide train/test membership BEFORE fitting the tokenizer, so the vocabulary
# and the padded length are learned from training rows only and the held-out
# rows cannot influence the features (no train/test leakage). Row positions are
# split with the same test_size/random_state as before, so the partition itself
# is unchanged.
row_positions = np.arange(len(df))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_positions, df['label'], test_size=0.3, random_state=42
)

# Tokenization and padding
tokenizer = Tokenizer(num_words=5000, oov_token="<unk>")
# Words that occur only in test rows are mapped to the "<unk>" token.
tokenizer.fit_on_texts(df['text'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(df['text'])
# Longest *training* sequence defines the model's input length.
max_len = max(len(sequences[i]) for i in train_idx)
# Shorter rows are zero-padded at the end; a test row longer than max_len is
# cut at the end as well, keeping the start of the message.
padded_sequences = pad_sequences(
    sequences, maxlen=max_len, padding='post', truncating='post'
)

X_train = padded_sequences[train_idx]
X_test = padded_sequences[test_idx]

# --- Model Building: LSTM with Attention ---
print("Building LSTM with Attention model...")

# Hyper-parameters of the network.
vocab_size = 5000      # number of rows in the embedding table
embedding_dim = 128    # size of each token vector
lstm_units = 64        # size of the LSTM hidden state

# Integer token ids, one padded sequence of length max_len per sample.
input_layer = Input(shape=(max_len,))

# Token ids -> dense vectors.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
)(input_layer)

# Keep the output of every time step so attention can look across all of them.
lstm_out = LSTM(units=lstm_units, return_sequences=True)(embedding_layer)

# The LSTM outputs are passed as both elements of the attention input list.
attention_output = Attention()([lstm_out, lstm_out])

# Average over the time axis to obtain one fixed-size vector per sequence.
pooled_output = GlobalAveragePooling1D()(attention_output)

# Single sigmoid unit for the binary output.
output_layer = Dense(units=1, activation='sigmoid')(pooled_output)

# Wire the graph into a trainable model and configure it for binary
# classification.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# --- Model Training ---
print("Training the model...")
# Ten passes over the training split in mini-batches of two; verbose=0 keeps
# the per-epoch progress output out of the log.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=2,
    epochs=10,
    verbose=0,
)
print("Model training complete.")

# --- Evaluation ---
print("Evaluating model performance on the test set...")
y_pred_prob = model.predict(X_test)
# Flatten the predictions to 1-D, then threshold at 0.5 to get 0/1 labels.
y_pred = (y_pred_prob.flatten() > 0.5).astype("int32")

# Calculate metrics. Precision, recall and F1 are support-weighted averages
# over both classes; zero_division=0 scores a class with no predictions as 0
# instead of emitting a warning.
_weighted = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **_weighted)
recall = recall_score(y_test, y_pred, **_weighted)
f1 = f1_score(y_test, y_pred, **_weighted)

# Print results as a single report, one metric per line.
print("\n--- Model Performance Metrics ---")
print("\n".join([
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
]))

Loading dataset...
Dataset loaded successfully.
Pre-processing text data...
Building LSTM with Attention model...
Training the model...
Model training complete.
Evaluating model performance on the test set...
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 213ms/step

--- Model Performance Metrics ---
Accuracy: 0.3333
Precision: 0.1111
Recall: 0.3333
F1-Score: 0.1667
