In [36]:
pip install bert-extractive-summarizer

Note: you may need to restart the kernel to use updated packages.


### Load from a CSV file

### Import Necessary Libraries

In [37]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score
import torch


In [38]:
import pandas as pd
# Load the cleaned dataset from CSV
cleaned_dataset_path = 'cleaned_spam.csv'
df = pd.read_csv(cleaned_dataset_path)
print("Cleaned dataset loaded")


Cleaned dataset loaded


In [39]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,text_length,cleaned_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,327,subject enron methanol meter follow note gave ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,97,subject hpl nom january see attached file hpln...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,2524,subject neon retreat ho ho ho around wonderful...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,414,subject photoshop windows office cheap main tr...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,336,subject indian springs deal book teco pvr reve...


In [40]:
from summarizer import Summarizer

# Initialize the BERT summarizer
bert_summarizer = Summarizer()

# Function to summarize text while handling potential errors
def summarize_text(text):
    try:
        # Check if text is string and not empty
        if isinstance(text, str) and text.strip():
            return bert_summarizer(text, num_sentences=2)  # Adjust number of sentences as needed
        return text
    except Exception:
        return text

# Apply summarization to clean_text column
df['summarized_text'] = df['cleaned_text'].apply(summarize_text)


In [41]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,text_length,cleaned_text,summarized_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,327,subject enron methanol meter follow note gave ...,subject enron methanol meter follow note gave ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,97,subject hpl nom january see attached file hpln...,subject hpl nom january see attached file hpln...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,2524,subject neon retreat ho ho ho around wonderful...,
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,414,subject photoshop windows office cheap main tr...,subject photoshop windows office cheap main tr...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,336,subject indian springs deal book teco pvr reve...,subject indian springs deal book teco pvr reve...


In [42]:
print(df.loc[5, 'summarized_text'])

subject ehronline web address change message intended ehronline users due recent change ehronline url aka web address accessing ehronline needs changed computer change involves adding letter http reference url url accessing ehronline https ehronline enron com change made added url favorite browser


In [64]:
# Check for None or missing values in 'label' or 'predicted_label'
print("Null values in 'label':", df['label'].isnull().sum())
print("Null values in 'predicted_label':", df['predicted_label'].isnull().sum())

# Check rows with None or invalid predicted_label
invalid_rows = df[df['predicted_label'].isnull() | (df['predicted_label'] == "")]
print("Invalid predicted_label rows:\n", invalid_rows)


Null values in 'label': 0
Null values in 'predicted_label': 1699
Invalid predicted_label rows:
       Unnamed: 0 label                                               text  \
2           3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
6           2793   ham  Subject: spring savings certificate - take 30 ...   
10          4922  spam  Subject: vocable % rnd - word asceticism\r\nvc...   
15          4791  spam  Subject: underpriced issue with high return on...   
16          2643   ham  Subject: re : first delivery - wheeler operati...   
...          ...   ...                                                ...   
5145        3348   ham  Subject: re : epgt\r\ngloria , the difference ...   
5154        2203   ham  Subject: re : hpl meter # 980074 bammel hpl d ...   
5161        4979  spam  Subject: penny stocks are about timing\r\nnoma...   
5163        1428   ham  Subject: re : meter # : 1266 ; august 2000 / a...   
5170        4807  spam  Subject: important online banking

In [65]:
# Replace None or missing values in 'predicted_label' with "ham"
df['predicted_label'] = df['predicted_label'].fillna("ham")


In [66]:
# Remove rows with missing or invalid 'predicted_label'
df = df[df['predicted_label'].notnull() & (df['predicted_label'] != "")]


In [67]:
# Confirm no more invalid values
print("Unique values in 'label':", df['label'].unique())
print("Unique values in 'predicted_label':", df['predicted_label'].unique())


Unique values in 'label': ['ham' 'spam']
Unique values in 'predicted_label': ['ham' 'spam']


### Initiliaze a zeroshot pipeline

In [43]:
from transformers import pipeline

# Load Zero-Shot Classification pipeline
zsl_pipeline = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define candidate labels
candidate_labels = ["spam", "ham"]


Device set to use cuda:0


### Apply Zero-SHot Learning to the dataset

In [68]:
def classify_text_with_threshold(text):
    try:
        if isinstance(text, str) and text.strip():
            # Perform zero-shot classification
            result = zsl_pipeline(text, candidate_labels=["spam", "ham"])
            # Check confidence scores
            scores = result['scores']
            labels = result['labels']
            # Ensure the highest confidence score is above a threshold (e.g., 0.5)
            if scores[0] > 0.5:
                return labels[0]
            else:
                return "ham"  # Default to 'ham' if below threshold
        return "ham"  # Default to 'ham' for invalid or empty text
    except Exception as e:
        print(f"Error processing text: {e}")
        return "ham"  # Default fallback for errors


### Evaluate the Model

In [69]:
# Apply the classification to the summarized_text column
df['predicted_label'] = df['summarized_text'].apply(classify_text_with_threshold)

In [70]:
# Inspect unique values in the columns
print("Unique values in 'label':", df['label'].unique())
print("Unique values in 'predicted_label':", df['predicted_label'].unique())

Unique values in 'label': ['ham' 'spam']
Unique values in 'predicted_label': ['ham' 'spam']


In [71]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate accuracy
accuracy = accuracy_score(df['label'], df['predicted_label'])
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report
report = classification_report(df['label'], df['predicted_label'], target_names=["ham", "spam"])
print("Classification Report:\n", report)


Accuracy: 0.72
Classification Report:
               precision    recall  f1-score   support

         ham       0.73      0.96      0.83      3672
        spam       0.56      0.13      0.21      1499

    accuracy                           0.72      5171
   macro avg       0.65      0.54      0.52      5171
weighted avg       0.68      0.72      0.65      5171

