# **Baseline: dummy classifier**
A description of the execution of the simplest baseline: a model that always predicts the majority class.

In [None]:
import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score
from sklearn.model_selection import train_test_split

# Majority-class baseline for the phishing text dataset.
#
# Pipeline: load the data, make an 80/20 train/test split, fit a
# DummyClassifier that always predicts the most frequent training label,
# report accuracy / F1 / recall on the test split, and persist the model.

# Load the phishing dataset.
# NOTE(security): trust_remote_code=True lets the dataset repository's loading
# script run on this machine -- only keep it for sources you trust.
dataset = load_dataset("ealvaradob/phishing-dataset", "texts", trust_remote_code=True)

# Check the structure of the dataset
print("Dataset Head:")
print(dataset['train'][:5])  # Print the first 5 entries of the dataset

# Extract text and labels. Reading whole columns avoids building one dict per
# row, which is what iterating over the split example by example does.
train_split = dataset['train']
texts = list(train_split['text'])
labels = list(train_split['label'])

# Convert to DataFrame for better visualization and sanity check
df = pd.DataFrame({'text': texts, 'label': labels})
print("\nDataFrame Head:")
print(df.head())

# Split the data into training and testing sets.
# The split is not stratified, so the class ratio of the two parts may differ
# slightly from the full dataset.
new_random_state = 21  # Change this value to create a new split
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=new_random_state
)

# Print heads of split datasets to ensure proper division
print("\nTraining Set Head:")
print(pd.DataFrame({'text': X_train[:5], 'label': y_train[:5]}))

print("\nTesting Set Head:")
print(pd.DataFrame({'text': X_test[:5], 'label': y_test[:5]}))

# Train a Dummy Classifier with the 'most_frequent' strategy: it ignores the
# input texts and always predicts the most common label seen in y_train.
classifier = DummyClassifier(strategy="most_frequent")
classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = classifier.predict(X_test)

# Here the 'label' column holds the predicted labels, not the true ones.
print("\nPredict Set Head:")
print(pd.DataFrame({'text': X_test[:5], 'label': y_pred[:5]}))

# Evaluate the classifier.
# The baseline never predicts the minority class, so that class's precision is
# 0/0. zero_division=0 scores it as 0 (the value scikit-learn already falls
# back to) without emitting UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
# 'weighted' averages the per-class scores by class support.
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# Support-weighted recall is numerically equal to accuracy for single-label
# classification; it is reported separately to keep the output format.
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics with Dummy Classifier:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall Score: {recall:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Save the model for future use
joblib.dump(classifier, "dummy_classifier_no_bow.pkl")

```
Training Set Head:
                                                text  label
0  draft strawman as requested . i ' ll work on a...      0
1  re : unify brent , as i have indicated to both...      0
2  On Sat, 27 Jul 2002, Adam L. Beberg wrote:> On...      0
3  From: Matt Kettler > Hmm, I think that Marc, b...      0
4  midamerica ling conf 98 the schedule for the m...      0

Testing Set Head:
                                                text  label
0  online drugs - save up to 80 % online pharmacy...      1
1   if anyone calling from a mobile Co. and asks ...      0
2  status of hpl transfers to aep sally , please ...      0
3  she ' s not happy if your not king size 2 m 6 ...      1
4  Shit that is really shocking and scary, cant i...      0

Predict Set Head:
                                                text  label
0  online drugs - save up to 80 % online pharmacy...      0
1   if anyone calling from a mobile Co. and asks ...      0
2  status of hpl transfers to aep sally , please ...      0
3  she ' s not happy if your not king size 2 m 6 ...      0
4  Shit that is really shocking and scary, cant i...      0

Evaluation Metrics with Dummy Classifier:
Accuracy: 0.6130
F1 Score: 0.4659
Recall Score: 0.6130

Classification Report:
              precision    recall  f1-score   support

           0       0.61      1.00      0.76      2469
           1       0.00      0.00      0.00      1559

    accuracy                           0.61      4028
   macro avg       0.31      0.50      0.38      4028
weighted avg       0.38      0.61      0.47      4028

```

