<a href="https://colab.research.google.com/github/ramatsemela8/Project2/blob/main/Phishing_and_Emails.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Download the NLTK resources used below (already-present packages are skipped).
# 'punkt_tab' is fetched as well: recent NLTK releases load word_tokenize's
# data from it rather than from the pickled 'punkt' package.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 2: Load the dataset
df = pd.read_csv('/content/Phishing_Email.csv')  # Replace with your dataset's file path

# Step 3: Handling missing values
# Check for missing values in the dataset
print(df.isnull().sum())

# Fill missing values in 'Email Text' with empty strings.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that form is chained assignment, which pandas warns about
# and which leaves `df` unchanged when Copy-on-Write is enabled.
df['Email Text'] = df['Email Text'].fillna('')

# If there are any missing values in 'Email Type', drop those rows
df.dropna(subset=['Email Type'], inplace=True)

# Step 4: Text preprocessing
# Shared helpers for the cleaning function below: a Porter stemmer and the
# English stopword list held as a set for membership tests.
stemmer = PorterStemmer()
stop_words = {word for word in stopwords.words('english')}

# Function to preprocess text: removing stopwords and applying stemming
def preprocess_text(text):
    """Turn one email body into a space-joined string of stemmed tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphabetic or that appear in ``stop_words`` are
    dropped, and the remaining tokens are stemmed with ``stemmer``.

    Args:
        text: The email text. Non-string values (e.g. ``None`` or a float
            NaN from a missing cell) are treated as an empty document.

    Returns:
        The processed tokens joined by single spaces; ``''`` when no token
        survives filtering or when ``text`` is not a string.
    """
    # A missing cell would arrive here as None/NaN, which has no .lower();
    # return an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''
    # Tokenize words
    words = nltk.word_tokenize(text.lower())
    # Remove stopwords and apply stemming
    processed_words = [stemmer.stem(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(processed_words)

# Run every email through preprocess_text and store the cleaned text in a new column
email_text = df['Email Text']
df['Processed Email Text'] = email_text.apply(preprocess_text)

# Step 5: Feature extraction using TF-IDF
# Features (X) are the cleaned email texts; labels (y) are the email types
X, y = df['Processed Email Text'], df['Email Type']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorizer (default settings) that converts text into numeric features
tfidf = TfidfVectorizer()

# Step 6: Model building (Logistic Regression)
# Chain TF-IDF vectorization and a liblinear logistic regression so both
# stages are fitted and applied together
pipeline_steps = [
    ('tfidf', tfidf),
    ('clf', LogisticRegression(solver='liblinear')),
]
model = Pipeline(steps=pipeline_steps)

# Fit the vectorizer and the classifier on the training split
model.fit(X_train, y_train)

# Step 7: Model Evaluation
# Label predictions for the held-out test emails
y_pred = model.predict(X_test)

# Overall accuracy, shown to four decimal places
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision, recall and F1
report = classification_report(y_test, y_pred)
print(report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64
Accuracy: 0.9670
                precision    recall  f1-score   support

Phishing Email       0.95      0.96      0.96      1457
    Safe Email       0.98      0.97      0.97      2273

      accuracy                           0.97      3730
     macro avg       0.96      0.97      0.97      3730
  weighted avg       0.97      0.97      0.97      3730



In [2]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [3]:
# Download the NLTK stopwords and tokenizer data if not already downloaded
# 'punkt_tab' is fetched as well: recent NLTK releases load word_tokenize's
# data from it rather than from the pickled 'punkt' package.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 2: Load the dataset
df = pd.read_csv('/content/Phishing_Email.csv')  # Replace with your dataset's file path


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Step 3: Handling missing values
# Check for missing values in the dataset
print(df.isnull().sum())

# Fill missing values in 'Email Text' with empty strings.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that form is chained assignment, which pandas warns about
# and which leaves `df` unchanged when Copy-on-Write is enabled.
df['Email Text'] = df['Email Text'].fillna('')

# If there are any missing values in 'Email Type', drop those rows
df.dropna(subset=['Email Type'], inplace=True)


Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64


In [6]:
# Step 4: Text preprocessing
# Shared helpers for the cleaning function below: a Porter stemmer and the
# English stopword list held as a set for membership tests.
stemmer = PorterStemmer()
stop_words = {word for word in stopwords.words('english')}

# Function to preprocess text: removing stopwords and applying stemming
def preprocess_text(text):
    """Turn one email body into a space-joined string of stemmed tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphabetic or that appear in ``stop_words`` are
    dropped, and the remaining tokens are stemmed with ``stemmer``.

    Args:
        text: The email text. Non-string values (e.g. ``None`` or a float
            NaN from a missing cell) are treated as an empty document.

    Returns:
        The processed tokens joined by single spaces; ``''`` when no token
        survives filtering or when ``text`` is not a string.
    """
    # A missing cell would arrive here as None/NaN, which has no .lower();
    # return an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''
    # Tokenize words
    words = nltk.word_tokenize(text.lower())
    # Remove stopwords and apply stemming
    processed_words = [stemmer.stem(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(processed_words)

# Run every email through preprocess_text and store the cleaned text in a new column
email_text = df['Email Text']
df['Processed Email Text'] = email_text.apply(preprocess_text)

In [7]:
# Step 5: Feature extraction using TF-IDF
# Features (X) are the cleaned email texts; labels (y) are the email types
X, y = df['Processed Email Text'], df['Email Type']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorizer (default settings) that converts text into numeric features
tfidf = TfidfVectorizer()

In [8]:
# Step 6: Model building (Logistic Regression)
# Chain TF-IDF vectorization and a liblinear logistic regression so both
# stages are fitted and applied together
pipeline_steps = [
    ('tfidf', tfidf),
    ('clf', LogisticRegression(solver='liblinear')),
]
model = Pipeline(steps=pipeline_steps)

# Fit the vectorizer and the classifier on the training split
model.fit(X_train, y_train)

In [9]:
# Step 7: Model Evaluation
# Label predictions for the held-out test emails
y_pred = model.predict(X_test)

# Overall accuracy, shown to four decimal places
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision, recall and F1
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.9670
                precision    recall  f1-score   support

Phishing Email       0.95      0.96      0.96      1457
    Safe Email       0.98      0.97      0.97      2273

      accuracy                           0.97      3730
     macro avg       0.96      0.97      0.97      3730
  weighted avg       0.97      0.97      0.97      3730

