In [1]:
import json
import re
from urllib.parse import urlparse
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pickle
import matplotlib.pyplot as plt
import joblib

### All of the code to process the dataset can be found in `1-pup.ipynb`:

Step 1: Lexical features extraction: split URLs into components, apply a sliding window to the domain, and use a bag-of-words model to describe each component.

In [2]:
def lexicalFE(url): #lexical feature extraction fn - takes in a URL
    if not urlparse(url).scheme:
        url = 'http://' + url  # prepend with default scheme
    
    try:
        parsedURL = urlparse(url)
        domain = parsedURL.netloc
        path = parsedURL.path
        query = parsedURL.query
        domainNoPrefix = domain.replace('www.', '')
        features = {
            'domain': domain,
            'domainNoPrefix': domainNoPrefix,
            'domainLength': len(domain),
            'pathLength': len(path),
            'queryLength': len(query),
            'numPathComponents': len(path.split('/')) - 1,  # Subtracting 1 because the leading '/' results in an empty string at the start
            'numQueryComponents': len(query.split('&')) if query else 0,  # Only count if there's a query
            'hasDigitsInDomain': any(char.isdigit() for char in domain),
            'hasDigitsInPath': any(char.isdigit() for char in path),
            'hasDigitsInQuery': any(char.isdigit() for char in query)
        }
        return features
    except ValueError as e: #handle errors TODO: more here
        print(f"Error processing URL {url}: {e}")
        return {}

Step 2: descriptive features extraction - this function will further split the path component, remove common prefixes and TLDs, and calc stats

In [3]:
def descriptiveFE(url): #descriptive feature extraction fn - takes in a URL
    parsedURL = urlparse(url)
    domain = parsedURL.netloc.replace('www.', '')  # Remove common prefix
    path = parsedURL.path
    query = parsedURL.query
    path_components = path.split('/') # further split the path
    filename = path_components[-1] if '.' in path_components[-1] else None
    fileBool = 1 if filename else 0
    file_extension = filename.split('.')[-1] if filename else None
    
    # Calculate statistics
    features = {
        'domainLength': len(domain),
        'pathLength': len(path),
        'queryLength': len(query),
        'numPathComponents': len(path_components),
        'filename': filename,
        'fileNamePresent': fileBool,
        'fileExtension': file_extension,
        'isIpAdress': bool(re.match(r'^\d{1,3}(\.\d{1,3}){3}$', domain)),
        'fileExecutable': file_extension in ['exe', 'bin', 'bat']
    }
    return features

In [4]:
df = pd.read_csv('./datasets/conglom-labeled.csv', names=['URL', 'Classification'])

Apply lexical and descriptive feature extraction

In [5]:
df['Lexical_Features'] = df['URL'].apply(lambda x: lexicalFE(x))
df['Descriptive_Features'] = df['URL'].apply(lambda x: descriptiveFE(x))
testIndex = 12 # 12 this time
print(df.iloc[testIndex])

Error processing URL http://RybjUxÙãl5»7ÆE%ÝÔk+h|U+ýk©ìÉ½Æq]âF·õÁ¢w)ëA·ç°{t*m!¦2: Invalid IPv6 URL
Error processing URL http://ÆeF§÷%¶¿Õ½9¿b@Ö¸ÚZE¤ÒC¢ÄÅª2åç-]W³fU¤Jgkz.ø¿nJçåæuøD%@ðûÇùM¹uË: Invalid IPv6 URL
Error processing URL http://Ó6¸RTÃu~æÙg0>÷mÖiÓ=;XZ\%êýÜÉfn&\°%7õÉ"ieÖ1ÄÁêFÐò<$cï6t[0ò2"/Æa^2âpù/ýãÇ$E¬R«È²ú[Ì¶p¥qÒ°i°^ò[»³»]±9êdÓS¿Ë]ùþ5j¿·ªocÂplà7ÊÏJ§¢#3ðDCDõ²çÇGÝ.Vò=¿QB§Ä'`ÊáZÉê ÔîÆm®ÍÝQÓ(z;¹Áê¬âytÖÙ®ëNP²ÜEQ: Invalid IPv6 URL
Error processing URL http://µÔA¨!ÝÛ=]º£¦Pôwr72-ÕY5Äòè7¬-³]×)&¡e¸¢À6RD­NvY¨Ð«Ñ3Â¸%Qñ+ÛÈ¸$¶gz{þ: Invalid IPv6 URL
Error processing URL http://¨RÊÃûaCóÞit×ßÂe-DÖØ+9YèÌçÏ¯·"0£ÙÕ.0ößF«7¹NRÙ{ccÉÄãéçx[Ä6a5Ñ³LÖíÜÉÀ£Òma¥yRX*0ÅÝ7×ÊÁÌo«Õs¶0kdèÑ&Ä"Ï¨mZ'àDM×ñXÚÒK"päî±h¬cAÊeK@4r"^'ÓFþ1*ËË PÞô;õ$úàÑ@þ=êWÑ"Ãhñ®ç^«Ýó^çRúUJ.<6CyÜFØrÿV2ôæýZãiiIb;¨Ëµu^ÍVy)­è»âýº+SÖáÃì?å6åÔ/: Invalid IPv6 URL
Error processing URL ht

In [6]:
row_index = testIndex 
lexical_features_str = json.dumps(df.at[row_index, 'Lexical_Features'], indent=4)
descriptive_features_str = json.dumps(df.at[row_index, 'Descriptive_Features'], indent=4)
print(f"Lexical Features for row {row_index}:\n{lexical_features_str}\n")
print(f"Descriptive Features for row {row_index}:\n{descriptive_features_str}\n")

Lexical Features for row 12:
{
    "domain": "0265331.com",
    "domainNoPrefix": "0265331.com",
    "domainLength": 11,
    "pathLength": 0,
    "queryLength": 0,
    "numPathComponents": 0,
    "numQueryComponents": 0,
    "hasDigitsInDomain": true,
    "hasDigitsInPath": false,
    "hasDigitsInQuery": false
}

Descriptive Features for row 12:
{
    "domainLength": 0,
    "pathLength": 11,
    "queryLength": 0,
    "numPathComponents": 1,
    "filename": "0265331.com",
    "fileNamePresent": 1,
    "fileExtension": "com",
    "isIpAdress": false,
    "fileExecutable": false
}



Now, we need to normalize the features and concatenate them with the original dataframe

In [7]:
chunk_size = 5000 

In [8]:
# instantiate empty dataframe
df_final = pd.DataFrame()

# process in chunk size defined in previous cell
for start in range(0, df.shape[0], chunk_size):
    end = min(start + chunk_size, df.shape[0])
    df_chunk = df.iloc[start:end].copy()
    df_chunk.reset_index(drop=True, inplace=True)
    # normalize lexical and descriptive features
    lexFeatsDF = pd.json_normalize(df_chunk['Lexical_Features'])
    lexFeatsDF.columns = ['Lexical_' + str(col) for col in lexFeatsDF.columns]
    descFeatsDF = pd.json_normalize(df_chunk['Descriptive_Features'])
    descFeatsDF.columns = ['Descriptive_' + str(col) for col in descFeatsDF.columns]
    df_chunk = pd.concat([df_chunk, lexFeatsDF, descFeatsDF], axis=1) #concat normalized feats with chunk
    df_final = pd.concat([df_final, df_chunk], axis=0, ignore_index=True) #direct append to final df

# drop unnecessary column names
df_final.drop(['Lexical_Features', 'Descriptive_Features'], axis=1, inplace=True)

In [9]:
df_final = df_final.drop(1) # drop row 1
df_final = df_final.drop(['URL'], axis=1)
df_final_columns = df_final.columns.tolist()
with open('model_columns.txt', 'w') as f:
    f.write('\n'.join(df_final_columns))
df_final = df_final.dropna() #drop rows with missing values
catCols = df_final.select_dtypes(include=['object', 'category']).columns
# convert categoricals
for col in catCols:
    # skip the target column 'Classification'
    if col == 'Classification':
        continue
    le = LabelEncoder()
    df_final[col] = le.fit_transform(df_final[col])
    # save the encoder
    joblib.dump(le, './models-checkpoints/categorical_feature_encoder.joblib')

# split dataframe into features and target
X = df_final.drop('Classification', axis=1)
y = df_final['Classification']

# convert 'Classification' to numerical vals
if y.dtype == 'object' or y.dtype.name == 'category':
    le = LabelEncoder()
    y = le.fit_transform(y)

# split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train_CNN = X_train # capturing this pre-scaling
X_test_CNN = X_test
y_train_CNN = y_train
y_test_CNN = y_test

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = X_train_scaled
X_test = X_test_scaled

## 5. BERT

In [16]:
df.head()

Unnamed: 0,URL,Classification,Lexical_Features,Descriptive_Features
0,URL,type,"{'domain': 'URL', 'domainNoPrefix': 'URL', 'do...","{'domainLength': 0, 'pathLength': 3, 'queryLen..."
1,0008d6ba2e.com,ads,"{'domain': '0008d6ba2e.com', 'domainNoPrefix':...","{'domainLength': 0, 'pathLength': 14, 'queryLe..."
2,0024ad98dd.com,ads,"{'domain': '0024ad98dd.com', 'domainNoPrefix':...","{'domainLength': 0, 'pathLength': 14, 'queryLe..."
3,0083334e84.com,ads,"{'domain': '0083334e84.com', 'domainNoPrefix':...","{'domainLength': 0, 'pathLength': 14, 'queryLe..."
4,00d3ed994e.com,ads,"{'domain': '00d3ed994e.com', 'domainNoPrefix':...","{'domainLength': 0, 'pathLength': 14, 'queryLe..."


In [17]:
from sklearn.preprocessing import LabelEncoder

# Creating a new DataFrame
df_new = df.copy()

# Initializing the Label Encoder
label_encoder = LabelEncoder()

# Transforming the 'Classification' column to numerical
df_new['Classification'] = label_encoder.fit_transform(df_new['Classification'])

# Displaying the first few rows of the new DataFrame
df_new.head()


Unnamed: 0,URL,Classification,Lexical_Features,Descriptive_Features
0,URL,6,"{'domain': 'URL', 'domainNoPrefix': 'URL', 'do...","{'domainLength': 0, 'pathLength': 3, 'queryLen..."
1,0008d6ba2e.com,0,"{'domain': '0008d6ba2e.com', 'domainNoPrefix':...","{'domainLength': 0, 'pathLength': 14, 'queryLe..."
2,0024ad98dd.com,0,"{'domain': '0024ad98dd.com', 'domainNoPrefix':...","{'domainLength': 0, 'pathLength': 14, 'queryLe..."
3,0083334e84.com,0,"{'domain': '0083334e84.com', 'domainNoPrefix':...","{'domainLength': 0, 'pathLength': 14, 'queryLe..."
4,00d3ed994e.com,0,"{'domain': '00d3ed994e.com', 'domainNoPrefix':...","{'domainLength': 0, 'pathLength': 14, 'queryLe..."


In [18]:
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# urls = list(df['URL'])
# encoding = tokenizer(urls, padding=True, truncation=True, return_tensors="pt")

In [19]:
# # Assuming `encoding` is your BatchEncoding object from the tokenizer
# with open('./models-checkpoints/bert-url-encodings.pkl', 'wb') as f:
#     pickle.dump(encoding, f)

In [20]:
with open('./models-checkpoints/bert-url-encodings.pkl', 'rb') as f:
    encoding = pickle.load(f)

In [21]:
labels_list = df_new['Classification'].tolist()
print(labels_list[0:5])

[6, 0, 0, 0, 0]


In [22]:
labels_tensor = torch.tensor(labels_list)

In [23]:
dataset = TensorDataset(encoding['input_ids'], encoding['attention_mask'], labels_tensor)

In [24]:
# Split the dataset into 90% training and 10% validation
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

NameError: name 'random_split' is not defined

In [None]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [None]:
from torch.utils.data import DataLoader, random_split

# Assuming `dataset` is your original dataset
total_size = len(dataset)
small_size = int(0.01 * total_size)  # 1% - For example, 0.1 for 10% of the dataset

# Split your dataset into a smaller dataset and a remainder (which we won't use here)
small_dataset, _ = random_split(dataset, [small_size, total_size - small_size])

# Create a DataLoader for the smaller dataset
small_train_loader = DataLoader(small_dataset, batch_size=8, shuffle=True)

In [None]:
from transformers import BertForSequenceClassification

num_labels = 6

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=5e-5)

epochs = 4  # Number of training epochs. BERT authors recommend 2, 3, or 4.
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=total_steps)



# TEST RUN w/ 1 epoch, 10% of dataset via small_train_loader

In [None]:
import torch
import time

if torch.cuda.is_available():
    print("Using CUDA.")
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    print("Using MPS.")
    device = torch.device("mps")
else:
    print("Using CPU.")
    device = torch.device("cpu")

# device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
model.to("mps")

# Use a smaller number of epochs for the test run
epochs = 1

start_time = time.time()

for epoch_i in range(epochs):
    model.train()
    for step, batch in enumerate(small_train_loader):  # Use the smaller loader
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Test run took {elapsed_time:.2f} seconds.")

Using MPS.


KeyboardInterrupt: 

In [None]:
import torch
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch_i in range(0, epochs): # TRAINING
    start_time = time.time()

    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_loader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Run took {elapsed_time:.2f} seconds.")

    avg_train_loss = total_train_loss / len(train_loader)            
    print(f"Average training loss: {avg_train_loss}")

    # EVALUATION
    
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in val_loader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask,
                            labels=b_labels)
            
        loss = outputs.loss
        total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(val_loader)
    print(f"Validation loss: {avg_val_loss}")

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), './models-checkpoints/bert/bert-url-classifier-state.pth')
torch.save(model, './models-checkpoints/bert/bert-url-classifier-full-model.pth')

In [None]:
model = torch.load('./models-checkpoints/bert/bert-url-classifier-full-model.pth')
model.eval()

In [1]:
# TODO: more here