In [None]:
# training/train_phishing_model.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import tensorflowjs as tfjs

# 1. Load the datasets
# Paths are relative to the current working directory; point them at your own files.
phishing_df = pd.read_csv('data/Phishing URLs.csv')
legitimate_df = pd.read_csv('data/Legitimate URLs.csv')

# Fail early with a clear message: the feature-extraction step reads the 'url'
# column, and a frame without it would otherwise turn into all-NaN URLs after
# the concat below.
for source_name, source_df in (('phishing', phishing_df), ('legitimate', legitimate_df)):
    if 'url' not in source_df.columns:
        raise KeyError(
            f"{source_name} dataset has no 'url' column; found columns: {list(source_df.columns)}"
        )

# 2. Combine the datasets
phishing_df['label'] = 'phishing'
legitimate_df['label'] = 'legitimate'
data = pd.concat([phishing_df, legitimate_df], ignore_index=True)

# Rows with a missing URL cannot be featurized (len() of NaN raises TypeError),
# so drop them and say how many were removed. The index is reset so that the
# frame keeps a gap-free 0..n-1 index for later column-wise concatenation.
missing_url = data['url'].isna()
if missing_url.any():
    print(f'Dropping {int(missing_url.sum())} rows with a missing URL')
    data = data[~missing_url].reset_index(drop=True)

# Guarantee string input for the string operations used during feature extraction.
data['url'] = data['url'].astype(str)

# 3. Feature Extraction

def _strip_scheme(url):
    """Return `url` without a leading 'scheme://' (or bare '//') prefix, if present."""
    # Anchored at the start so a '://' that appears later (e.g. inside a query
    # string or redirect path) is never mistaken for the URL's own scheme.
    prefix = re.match(r'(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//', url)
    return url[prefix.end():] if prefix else url


def extract_features(url):
    """Turn a URL string into a list of eight numeric features.

    Args:
        url: The URL as a string. It may or may not start with a scheme.

    Returns:
        A list of ints, in this order:
          0. total length of the URL
          1. number of '.' characters in the URL
          2. 1 if the substring 'https' occurs anywhere in the URL, else 0
          3. 1 if '//' occurs after the scheme prefix, else 0
          4. length of the domain (authority) part
          5. 1 if the domain starts with four dot-separated digit groups, else 0
          6. number of '.' separators in the domain
          7. 1 if any sensitive keyword occurs in the lower-cased URL, else 0

    NOTE(review): if another component recomputes these features at inference
    time, it must apply the same rules as this function.
    """
    after_scheme = _strip_scheme(url)

    # The authority ends at the first '/', '?' or '#'. Splitting on the *first*
    # delimiter (instead of the last '//') keeps redirect-style URLs such as
    # 'http://a.com//b.org/x' from reporting 'b.org' as the domain.
    domain = re.split(r'[/?#]', after_scheme, maxsplit=1)[0]

    sensitive_keywords = ('login', 'admin', 'account', 'update', 'service', 'payment', 'secure')
    lowered = url.lower()

    return [
        # 1. Length of URL
        len(url),
        # 2. Number of dots
        url.count('.'),
        # 3. Presence of HTTPS (case-sensitive substring test, so it is also 1
        #    when 'https' appears in the host or path rather than the scheme)
        int('https' in url),
        # 4. Presence of '//' beyond the scheme prefix, whatever the scheme's
        #    length and also when the URL has no scheme at all
        int('//' in after_scheme),
        # 5. Length of domain (still includes any user-info or ':port' part)
        len(domain),
        # 6. Presence of IP address (prefix match, so a trailing ':port' is
        #    accepted; digit groups are not range-checked)
        int(bool(re.match(r'\d+\.\d+\.\d+\.\d+', domain))),
        # 7. Number of subdomains, approximated by the dot count in the domain
        len(domain.split('.')) - 1,
        # 8. Presence of sensitive keywords
        int(any(keyword in lowered for keyword in sensitive_keywords)),
    ]

import re

# Run the extractor over every URL; each row ends up holding one feature list.
data['features'] = data['url'].apply(extract_features)

# Expand the per-row lists into a DataFrame with one named column per feature.
feature_columns = [
    'url_length',
    'num_dots',
    'has_https',
    'has_double_slash',
    'domain_length',
    'is_ip',
    'num_subdomains',
    'has_sensitive_keywords',
]
features_df = pd.DataFrame(data['features'].tolist(), columns=feature_columns)

# Attach the text label next to the numeric features (aligned on the row index).
final_data = pd.concat([features_df, data['label']], axis=1)

# 4. Encode Labels
# Map the text labels to integer codes for the binary classifier.
le = LabelEncoder()
final_data['label_encoded'] = le.fit_transform(final_data['label'])

# 5. Prepare Features and Labels
# Everything except the two label columns is model input.
X = final_data.drop(columns=['label', 'label_encoded']).values
y = final_data['label_encoded'].values

# 6. Split the dataset
# 80/20 split, stratified on the label, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 7. Feature Scaling
# Statistics are learned from the training split only, then applied to both.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 8. Build the Model
# Two hidden blocks (Dense -> BatchNorm -> Dropout) and a sigmoid output unit.
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# 9. Compile the Model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# 10. Train the Model
# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=1024,
    validation_split=0.2,
    callbacks=[early_stop],
)

# 11. Evaluate the Model
# Score on the held-out test split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# 12. Save the Model
model.save('training/model.h5')

# 13. Convert the Model to TensorFlow.js Format
tfjs.converters.save_keras_model(model, 'training/model')

# 14. Save the Scaler
# Persist the fitted scaler alongside the model files.
import joblib
joblib.dump(scaler, 'training/scaler.save')