# **Logistic Regression: TF-IDF (Term Frequency-Inverse Document Frequency)**
This section trains and evaluates a logistic regression classifier (implemented in PyTorch) on TF-IDF features.

TF-IDF weighs words based on their frequency in a document and their rarity across all documents. This helps the model prioritize important, unique words, improving phishing detection accuracy.

In [None]:
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Train-Test Split
# Split the raw text *before* fitting TF-IDF so that the vocabulary and IDF
# weights are learned from the training documents only. Fitting the vectorizer
# on the full corpus would leak test-set statistics into the training features
# and make the reported test metrics optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Step 2: TF-IDF Vectorization
# NOTE(review): .toarray() densifies a matrix with up to 150000 columns; if
# memory becomes a problem, lower max_features or keep the features sparse.
vectorizer = TfidfVectorizer(max_features=150000)
X_train = vectorizer.fit_transform(text_train).toarray()  # learn vocabulary + IDF on train only
X_test = vectorizer.transform(text_test).toarray()        # reuse the fitted vocabulary + IDF

# Convert to PyTorch Tensors
# Labels get a trailing dimension via unsqueeze(1) so they line up with the
# model's single-output predictions when the loss is computed.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Step 3: Define Logistic Regression Model in PyTorch
class LogisticRegressionModel(nn.Module):
    """Logistic regression for binary classification.

    The model is a single linear layer with one output unit followed by a
    sigmoid, so ``forward`` returns values in (0, 1).

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # One output unit: a single score per sample for the binary decision.
        self.linear = nn.Linear(in_features=input_size, out_features=1)
        # Maps the linear score to a probability.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the sigmoid of the result."""
        logits = self.linear(x)
        probabilities = self.sigmoid(logits)
        return probabilities

# Build the classifier, sized to the number of feature columns in the training tensor
input_size = X_train_tensor.size(1)
model = LogisticRegressionModel(input_size)

# Loss and optimizer: binary cross-entropy on the model's sigmoid outputs,
# minimised with Adam at a learning rate of 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Step 4: Train the Model
# Mini-batch training: every epoch draws a fresh random permutation of the
# training rows and walks through it in slices of `batch_size`.
epochs = 10
batch_size = 64
num_train = X_train_tensor.size(0)
for epoch in range(epochs):
    model.train()
    permutation = torch.randperm(num_train)
    running_loss = 0.0  # sum of per-sample losses seen in this epoch

    for i in range(0, num_train, batch_size):
        optimizer.zero_grad()

        indices = permutation[i:i + batch_size]
        batch_x, batch_y = X_train_tensor[indices], y_train_tensor[indices]

        # Forward Pass
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)

        # Backward Pass and Optimization
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size;
        # the final batch of an epoch can be smaller than batch_size.
        running_loss += loss.item() * indices.size(0)

    # Report the mean loss over the whole epoch rather than the loss of
    # whichever mini-batch happened to come last (max() guards an empty set).
    epoch_loss = running_loss / max(num_train, 1)
    print(f"Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss:.4f}")

# Step 5: Evaluate the Model
# Switch to eval mode and disable gradient tracking for the test-set forward pass.
model.eval()
with torch.no_grad():
    y_pred_probs = model(X_test_tensor)
    # Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
    y_pred = y_pred_probs.gt(0.5).int()

# Print Metrics
accuracy = accuracy_score(y_test_tensor, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
report = classification_report(
    y_test_tensor,
    y_pred,
    target_names=["Legitimate", "Phishing"],
)
print(report)


```
DataFrame Head:
                                                text  label
0  re : 6 . 1100 , disc : uniformitarianism , re ...      0
1  the other side of * galicismos * * galicismo *...      0
2  re : equistar deal tickets are you still avail...      0
3  \nHello I am your hot lil horny toy.\n    I am...      1
4  software at incredibly low prices ( 86 % lower...      1

Accuracy: 0.9553

Classification Report:

              precision    recall  f1-score   support

  Legitimate       0.94      0.99      0.96      2493
    Phishing       0.98      0.90      0.94      1535

    accuracy                           0.96      4028
   macro avg       0.96      0.95      0.95      4028
weighted avg       0.96      0.96      0.95      4028

```

