In [3]:

print("üîç Analyzing your dataset...")

data['label'] = data['Email Type'].map({'Safe Email': 0, 'Phishing Email': 1})

print("üìä Dataset info:")
print(f"Total emails: {len(data)}")
print(f"Safe Emails: {len(data[data['label'] == 0])}")
print(f"Phishing Emails: {len(data[data['label'] == 1])}")
print(f"Label distribution:\n{data['Email Type'].value_counts()}")

X_text = data['Email Text'].fillna('').astype(str)
y = data['label']

print(f"\n‚úÖ Prepared {len(X_text)} emails for training!")

üîç Analyzing your dataset...
üìä Dataset info:
Total emails: 18650
Safe Emails: 11322
Phishing Emails: 7328
Label distribution:
Email Type
Safe Email        11322
Phishing Email     7328
Name: count, dtype: int64

‚úÖ Prepared 18650 emails for training!


In [9]:
print("üõ†Ô∏è Building the ML model...")

vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')
X = vectorizer.fit_transform(X_text)

print(f"üìä Feature matrix shape: {X.shape}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"üìö Training set: {X_train.shape[0]} emails")
print(f"üß™ Testing set: {X_test.shape[0]} emails")

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("‚úÖ Model trained successfully!")

predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f" Model Accuracy: {accuracy:.2%}")
print("\n Detailed Performance:")
print(classification_report(y_test, predictions))

joblib.dump(model, 'phishing_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

print("üíæ Model saved for the next step!")

üõ†Ô∏è Building the ML model...
üìä Feature matrix shape: (18650, 1500)
üìö Training set: 14920 emails
üß™ Testing set: 3730 emails
‚úÖ Model trained successfully!
 Model Accuracy: 95.82%

 Detailed Performance:
              precision    recall  f1-score   support

           0       0.98      0.95      0.97      2273
           1       0.93      0.96      0.95      1457

    accuracy                           0.96      3730
   macro avg       0.95      0.96      0.96      3730
weighted avg       0.96      0.96      0.96      3730

üíæ Model saved for the next step!


In [10]:

print(" Testing your model with sample emails...")
test_emails = [
    "Your account has been compromised. Click here to verify your identity: http://fake-bank-security.com",
    "Hi team, the meeting is moved to 3 PM tomorrow in conference room B.",
    "URGENT: Your PayPal account will be suspended. Verify now: http://paypal-security-update.com",
    "Thanks for your order #12345. Your package will arrive tomorrow."
]

print("Testing sample emails:")
for i, email in enumerate(test_emails, 1):
    features = vectorizer.transform([email])
    prediction = model.predict(features)[0]
    probability = model.predict_proba(features)[0]

    result = "PHISHING" if prediction == 1 else "SAFE"
    confidence = probability[1] if prediction == 1 else probability[0]

    print(f"Email {i}: {result} ({confidence:.2%})")
    print(f"   Text: {email[:80]}...")
    print()

 Testing your model with sample emails...
Testing sample emails:
Email 1: PHISHING (95.00%)
   Text: Your account has been compromised. Click here to verify your identity: http://fa...

Email 2: SAFE (95.00%)
   Text: Hi team, the meeting is moved to 3 PM tomorrow in conference room B....

Email 3: PHISHING (81.00%)
   Text: URGENT: Your PayPal account will be suspended. Verify now: http://paypal-securit...

Email 4: SAFE (51.00%)
   Text: Thanks for your order #12345. Your package will arrive tomorrow....



In [11]:
# FINAL SHOWCASE CELL
print("üéØ AI SECURITY ENGINEER PORTFOLIO PROJECT")
print("=" * 50)
print("Phishing Email Detection System")
print("‚Ä¢ 18,650 emails processed")
print("‚Ä¢ 95.82% accuracy achieved")
print("‚Ä¢ Random Forest + TF-IDF")
print("‚Ä¢ Production-ready model")
print("=" * 50)
print("Next: Malware Detection System ‚Üí")

üéØ AI SECURITY ENGINEER PORTFOLIO PROJECT
Phishing Email Detection System
‚Ä¢ 18,650 emails processed
‚Ä¢ 95.82% accuracy achieved
‚Ä¢ Random Forest + TF-IDF
‚Ä¢ Production-ready model
Next: Malware Detection System ‚Üí
