<a href="https://colab.research.google.com/github/seabisilas-alt/Github-Work/blob/main/rq1_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Uninstall numpy, pandas and fasttext, then reinstall them together with tensorflow.
# Only pandas is version-pinned (2.2.2); the other packages resolve to whatever
# pip selects at install time.
# NOTE(review): a runtime restart is presumably needed after numpy is replaced — confirm on Colab.
!pip uninstall numpy pandas fasttext -y
!pip install numpy pandas==2.2.2 fasttext tensorflow

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModel
import fasttext
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow_hub as hub
import tensorflow as tf
import re
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, LSTM, RepeatVector, Dense, Lambda, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.losses import sparse_categorical_crossentropy
from collections import Counter

In [None]:
Domains = pd.read_csv('/content/drive/MyDrive/MSC Work/RQ1/Dataset/All Domains.csv')

In [None]:
Domains

In [None]:
Legitimate_Domains_df = pd.read_csv('/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/Benign Domains.csv', header=None, names=['Legitimate Domain'])

In [None]:
Legitimate_Domains_df

In [None]:
# Lowercase
Legitimate_Domains_df['Legitimate Domain'] = Legitimate_Domains_df['Legitimate Domain'].str.lower()

# Split
train_df, test_df = train_test_split(Legitimate_Domains_df, test_size=0.2, random_state=42)



In [None]:
Illegitimate_Domains_df = pd.read_csv('/content/drive/MyDrive/MSC Work/RQ1/Dataset/Labelled Domains.csv')

In [None]:
Illegitimate_Domains_df

In [None]:
Combined = pd.concat([Illegitimate_Domains_df, Legitimate_Domains_df])

In [None]:
# Rows that came from Legitimate_Domains_df have NaN in 'Domains' (their names
# live in 'Legitimate Domain'). fillna with a Series aligns on index labels, and
# the concat above kept each frame's original index, so every NaN row is filled
# from the legitimate row that carries the same index label.
Combined['Domains'] = Combined['Domains'].fillna(Legitimate_Domains_df['Legitimate Domain'])

In [None]:
Combined['Label'] = Combined['Label'].replace({'Malicious':'__label__Malicious'})

In [None]:
Combined['Label'] = Combined['Label'].fillna('__label__Legit')

In [None]:
Combined

In [None]:
Combined['Domains'] = Combined['Domains'].str.lower()


In [None]:
Training_df, Testing_df = train_test_split(Combined[['Domains', 'Label']], test_size=0.2, random_state=42, stratify=Combined['Label'])

In [None]:
Training_df['Label'].value_counts()

## FastText Embeddings

In [None]:
Training_df['ft'] = Training_df['Label'] + ' ' + Training_df['Domains']

# Save to text files
Training_df['ft'].to_csv("/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/fasttext_train.txt", index=False, header=False)

In [None]:
model = fasttext.train_supervised(
    input="/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/fasttext_train.txt",
    dim=100,
    minn=2, maxn=5,      # use character n-grams (good for domains)
    lr=0.1,
    epoch=30,
    label="__label__",
    verbose=2
)

In [None]:
model.save_model("/content/drive/MyDrive/MSC Work/RQ1/FastText_Model/domain_classifier[100].bin")

In [None]:
FastText_model = fasttext.load_model("/content/drive/MyDrive/MSC Work/RQ1/FastText_Model/domain_classifier[100].bin")

In [None]:
# Extract domains + labels from test set
test_domains = Testing_df['Domains'].tolist()
test_labels = Testing_df['Label'].tolist()

In [None]:
def get_FastText_emebddings(domains, model):
    """Look up one vector per domain and stack the results into an array.

    Args:
        domains: iterable of domain strings.
        model: object exposing ``get_word_vector(str)`` (e.g. a loaded
            FastText model).

    Returns:
        ``np.ndarray`` holding one row per input domain, in input order.
    """
    return np.array([model.get_word_vector(name) for name in domains])

In [None]:
#Get embeddings (vector representations) from FastText
vectors_embeddings = get_FastText_emebddings(test_domains, FastText_model)

In [None]:
vectors_embeddings

In [None]:
# Reduce to 2D
pca = PCA(n_components=2)
PCA_Embeddings = pca.fit_transform(vectors_embeddings)


In [None]:
clean_labels = [l.replace("__label__", "") for l in test_labels]

In [None]:
# Inspect the 2-D PCA projection computed above. (`reduced` is only created by
# the t-SNE cell further down, so referencing it here fails on a fresh kernel.)
PCA_Embeddings

In [None]:
# Plot
plt.figure(figsize=(10,7))

for label, color in [('Legit', 'green'), ('Malicious', 'red')]:
    idx = [i for i, l in enumerate(clean_labels) if l == label]
    plt.scatter(PCA_Embeddings[idx,0], PCA_Embeddings[idx,1], label=label, alpha=0.6, s=15, c=color)

plt.title("FastText Domain Embeddings (PCA 2D)")
plt.legend()
plt.show()

In [None]:
# Run t-SNE
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
reduced = tsne.fit_transform(vectors_embeddings)  # reduced.shape -> (n_samples, 2)

# Plot
plt.figure(figsize=(8,6))
for label, color in [('Legit', 'green'), ('Malicious', 'red')]:
    idx = [i for i, l in enumerate(clean_labels) if l == label]
    plt.scatter(reduced[idx,0], reduced[idx,1], label=label, alpha=0.6, s=20, c=color)

plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.title("t-SNE of Domain Embeddings")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
n_clusters = 2

kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(vectors_embeddings)

In [None]:
cluster_labels

In [None]:
Testing_df

In [None]:
# Convert the string labels to integers in place (Malicious -> 1, Legit -> 0).
# The confusion-matrix and classification-report cells below read
# Testing_df['Label'] in this numeric form.
Testing_df['Label'] = Testing_df['Label'].replace({'__label__Malicious' : 1, '__label__Legit' : 0})

In [None]:
Testing_df['Cluster'] = cluster_labels

In [None]:
# Compute confusion matrix (rows: true labels, columns: KMeans cluster ids)
cm = confusion_matrix(Testing_df['Label'], Testing_df['Cluster'])

# Create heatmap
plt.figure(figsize=(6,5))
# confusion_matrix orders its rows by sorted label value (0 = Legit,
# 1 = Malicious). The row names are therefore fixed here instead of being taken
# from .unique(), which returns values in order of appearance and can swap them.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=["Legit", "Malicious"],
            yticklabels=["Legit", "Malicious"])
# NOTE(review): KMeans cluster ids are arbitrary, so column 0 only means "Legit"
# if cluster 0 happens to capture the legitimate domains — confirm before
# interpreting the columns.

plt.xlabel("Predicted Cluster")
plt.ylabel("True Label")
plt.title("Confusion Matrix: KMeans Clustering vs True Labels")
plt.show()



In [None]:
print(classification_report(Testing_df['Label'], Testing_df['Cluster']))

In [None]:
# Plot clusters on the PCA projection. `reduced` holds the t-SNE coordinates at
# this point, which would contradict the "(PCA 2D)" title, so the PCA
# coordinates are used instead.
plt.figure(figsize=(10,7))
plt.scatter(PCA_Embeddings[:,0], PCA_Embeddings[:,1], c=cluster_labels, cmap="coolwarm", s=15, alpha=0.6)

# Optional: annotate with domain names (careful if you have too many)
# for i, domain in enumerate(test_domains[:100]):  # annotate first 100 only
#     plt.annotate(domain, (PCA_Embeddings[i,0], PCA_Embeddings[i,1]), fontsize=7, alpha=0.7)

plt.title("KMeans Clusters on FastText Domain Embeddings (PCA 2D)")
plt.colorbar(label="Cluster")
plt.show()

KMeans is perfectly precise for Legit domains → if it says a domain is Legit, you can trust it.

But it misses some Legit domains (lower recall).

For Malicious domains, it finds them all (recall 1.0), but sometimes wrongly pulls in Legit ones (precision < 1).

✅ Strong guarantee: If it predicts “Legit,” it’s really legit.

⚠️ But if it predicts “Malicious,” there’s a ~12% chance it’s wrong.

## ELMo Embeddings

In [None]:
Training_df['Domains'] = Training_df['Domains'].apply(lambda x: re.sub(r'^www\.', '', str(x).lower()))

In [None]:
X_train = Training_df["Domains"].astype(str).tolist()
y_train = Training_df["Label"].replace({"__label__Legit": 0, "__label__Malicious": 1}).values

In [None]:

def tokenize_domain(domain):
    """Return the alphanumeric pieces of ``domain`` joined by single spaces.

    Every character outside ``[a-zA-Z0-9]`` acts as a separator. The input is
    coerced with ``str()`` first, so non-string values are accepted.
    """
    pieces = filter(None, re.split(r'[^a-zA-Z0-9]', str(domain)))
    return " ".join(pieces)


In [None]:
# Apply tokenization
Testing_df['Domain_tokens'] = Testing_df['Domains'].apply(tokenize_domain)

Test_domains  = Testing_df['Domain_tokens'].tolist()

In [None]:
elmo = hub.load("https://tfhub.dev/google/elmo/3")

In [None]:
def get_elmo_embeddings(sentences, batch_size=100):
    """Embed ``sentences`` with the module-level ``elmo`` model, one vector each.

    The list is processed in slices of ``batch_size``. For each slice the
    ``'elmo'`` output of the model's ``'default'`` signature is averaged over
    axis 1, and the per-slice results are stacked row-wise.

    NOTE(review): the mean runs over the whole of axis 1 — presumably that
    includes padded token positions for the shorter inputs in a slice; confirm.

    Args:
        sentences: list of strings.
        batch_size: number of strings sent to the model per call.

    Returns:
        ``np.ndarray`` with one row per input string.
    """
    slice_means = []
    for start in range(0, len(sentences), batch_size):
        chunk = sentences[start:start + batch_size]
        token_vectors = elmo.signatures['default'](tf.constant(chunk))['elmo']
        slice_means.append(tf.reduce_mean(token_vectors, axis=1).numpy())
    return np.vstack(slice_means)



In [None]:
X_test = get_elmo_embeddings(Test_domains, batch_size=512) # Process in batches of 512

In [None]:
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_test)


In [None]:
cm = confusion_matrix(Testing_df['Label'], clusters)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Legit','Malicious'],
            yticklabels=['Legit','Malicious'])
plt.xlabel("Predicted Cluster")
plt.ylabel("True Label")
plt.title("KMeans on Tokenized ELMo Domain Embeddings")
plt.show()

In [None]:
pca = PCA(n_components=2)
X_test_2d = pca.fit_transform(X_test)


In [None]:
# Plot the PCA projection of the ELMo embeddings, coloured by true label
plt.figure(figsize=(10,7))

for label, color in [('Legit', 'green'), ('Malicious', 'red')]:
    idx = [i for i, l in enumerate(clean_labels) if l == label]
    plt.scatter(X_test_2d[idx,0], X_test_2d[idx,1], label=label, alpha=0.6, s=15, c=color)

# X_test_2d is derived from the ELMo embeddings, so the title names ELMo
# (the previous title was copied from the FastText section).
plt.title("ELMo Domain Embeddings (PCA 2D)")
plt.legend()
plt.show()

In [None]:

plt.figure(figsize=(10,7))
plt.scatter(X_test_2d[:,0], X_test_2d[:,1], c=clusters, cmap='summer', alpha=0.6, s=15)
plt.title("KMeans Clusters on Tokenized ELMo Domain Embeddings (PCA 2D)")
plt.colorbar(label="Cluster")
plt.show()


In [None]:
print(classification_report(Testing_df['Label'], clusters))

## Transformer (DistilBERT) Embeddings

In [None]:
Train_df, Validation_df = train_test_split(Training_df[['Domains', 'Label']], test_size=0.2, random_state=42, stratify=Training_df['Label'])

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
class DomainDataset(Dataset):
    """Torch dataset pairing tokenized domain strings with their labels.

    All texts are tokenized once, at construction time, with the module-level
    ``tokenizer`` (truncated and padded, at most 32 tokens). Individual samples
    are converted to tensors on access.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per tokenizer output field, plus the label under "labels".
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# test_dataset = DomainDataset(test_domains, y_test)


In [None]:
def clean_domain(domain):
    """Normalize a raw domain or URL-like string to a bare lowercase host name.

    Steps, in order: lowercase and trim; drop any ``http://``/``https://``
    scheme; drop a leading ``www``/``wwwN`` label; remove every character other
    than ``a-z``, ``0-9``, ``.`` and ``-``; collapse runs of dots.

    The scheme is removed *before* the ``www`` prefix because the prefix
    pattern is anchored at the start of the string: with the scheme still in
    place it cannot match, and ``http://www.example.com`` would keep its
    ``www.``.

    Args:
        domain: value to clean; coerced with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    domain = str(domain).lower().strip()              # lowercase & remove spaces
    domain = re.sub(r"https?://", "", domain)         # remove http/https
    domain = re.sub(r"^www\d*\.", "", domain)         # remove leading www / www2 etc.
    domain = re.sub(r"[^a-z0-9\.-]", "", domain)      # keep only alphanumeric, dot, dash
    domain = re.sub(r"\.{2,}", ".", domain)           # replace multiple dots with one
    return domain


In [None]:
Train_df["Domains"] = Train_df["Domains"].apply(clean_domain)
Train_df

In [None]:
Validation_df["Domains"] = Validation_df["Domains"].apply(clean_domain)
Validation_df

In [None]:
def tokenize_domain(domain):
    """Return the alphanumeric pieces of ``domain`` joined by single spaces.

    This cell redefines ``tokenize_domain``, so it is kept behaviourally
    identical to the definition in the ELMo section: the input is coerced with
    ``str()`` and both upper- and lowercase letters count as token characters.
    For already-cleaned lowercase strings the result is unchanged.

    Args:
        domain: value to tokenize.

    Returns:
        Space-separated tokens; empty string when there are none.
    """
    tokens = re.split(r'[^a-zA-Z0-9]', str(domain))   # split on non-alphanumeric
    tokens = [t for t in tokens if t]                 # remove empty tokens
    return " ".join(tokens)

In [None]:
Train_df["Domains"] = Train_df["Domains"].apply(tokenize_domain)
Train_df

In [None]:
Validation_df["Domains"] = Validation_df["Domains"].apply(tokenize_domain)
Validation_df

In [None]:
label_map = {"__label__Legit": 0, "__label__Malicious":1}
Train_df["Label"] = Train_df["Label"].map(label_map)
Train_df

In [None]:
Validation_df["Label"] = Validation_df["Label"].map(label_map)
Validation_df

In [None]:
train_dataset = DomainDataset(Train_df["Domains"].tolist(), Train_df["Label"].tolist())

In [None]:
validation_dataset  = DomainDataset(Validation_df["Domains"].tolist(), Validation_df["Label"].tolist())

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and binary precision/recall/F1 for one evaluation pass.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted class
    is the argmax over the last axis of ``predictions``.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_hat, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
# Replace path with your saved checkpoint
# NOTE(review): Trainer names its checkpoints "checkpoint-<step>" under
# output_dir, and the Trainer above was not handed the tokenizer, so this
# directory presumably has to be created/filled manually with both model and
# tokenizer files — confirm the path before running.
MODEL_PATH = "./results/checkpoint-best"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
base_model = AutoModel.from_pretrained(MODEL_PATH)
# Switch the encoder that is actually used for embedding extraction to
# inference mode (this previously called eval() on the classification `model`).
base_model.eval()

In [None]:
def get_Bert_embeddings(texts, model, batch_size=256):
    """Embed ``texts`` with ``model`` using attention-masked mean pooling.

    Texts are tokenized with the module-level ``tokenizer`` (truncated and
    padded, at most 32 tokens) and run through ``model`` without gradients.
    Each embedding is the mean of ``last_hidden_state`` over the text's real
    tokens only: padded positions are excluded through the attention mask, so
    a text's vector does not depend on how much padding its batch required.

    Args:
        texts: non-empty list of strings.
        model: callable that accepts the tokenizer outputs as keyword
            arguments and returns an object with a ``last_hidden_state``
            tensor (assumed shape: batch x tokens x hidden).
        batch_size: number of texts encoded per forward pass; bounds peak
            memory on large inputs.

    Returns:
        ``np.ndarray`` with one row per input text, in input order.

    Raises:
        ValueError: if ``texts`` is empty.
    """
    if len(texts) == 0:
        raise ValueError("texts must contain at least one string")

    pooled = []
    for start in range(0, len(texts), batch_size):
        encodings = tokenizer(texts[start:start + batch_size], truncation=True,
                              padding=True, max_length=32, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**encodings)
        hidden = outputs.last_hidden_state
        # 1.0 for real tokens, 0.0 for padding; broadcast over the hidden axis.
        mask = encodings["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against a division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled.append(((hidden * mask).sum(dim=1) / token_counts).cpu().numpy())
    return np.vstack(pooled)


In [None]:
# Apply the same cleaning + tokenization that was applied to the fine-tuning
# data, so the encoder sees the test domains in the format it was trained on.
bert_test_texts = Testing_df['Domains'].apply(clean_domain).apply(tokenize_domain).tolist()
X_test = get_Bert_embeddings(bert_test_texts, base_model)
X_test

In [None]:
# Testing_df['Label'] was already converted to 0/1 in the FastText section, so a
# plain .map(label_map) would turn every value into NaN. Translate string labels
# when present and pass already-numeric labels through unchanged.
y_test = Testing_df['Label'].map(lambda value: label_map.get(value, value)).astype(int).values
y_test

In [None]:
# KMeans clustering
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_test)


In [None]:
# PCA 2D
pca = PCA(n_components=2)
PCA_Embeddings = pca.fit_transform(X_test)

In [None]:
# Plot the PCA projection of the transformer embeddings, coloured by true label
plt.figure(figsize=(10,7))

for label, color in [('Legit', 'green'), ('Malicious', 'red')]:
    idx = [i for i, l in enumerate(clean_labels) if l == label]
    plt.scatter(PCA_Embeddings[idx,0], PCA_Embeddings[idx,1], label=label, alpha=0.6, s=15, c=color)

# PCA_Embeddings was refit on the transformer embeddings just above, so the
# title names the transformer model (the previous title was copied from the
# FastText section).
plt.title("DistilBERT Domain Embeddings (PCA 2D)")
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.scatter(PCA_Embeddings[:,0], PCA_Embeddings[:,1], c=clusters, cmap="coolwarm", alpha=0.6, s=15)
plt.title("Transformer Embeddings of Domains (PCA 2D)")
plt.show()

In [None]:
cm = confusion_matrix(y_test, clusters)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Legit","Malicious"], yticklabels=["Legit","Malicious"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

print(classification_report(y_test, clusters))


## Variational AutoEncoder

What makes VAEs special is their ability to model data probabilistically: they don’t just learn compressed versions of the data, they also learn a latent distribution from which new data points can be generated. [Latent Space Representation](https://medium.com/@whyamit101/latent-space-representations-in-variational-autoencoders-vaes-e74076eda77b)

In [None]:
domains = Training_df["Domains"].astype(str).tolist()

In [None]:
def generate_ngrams(domain, n_sizes=(2, 3, 4)):
    """Return the character n-grams of ``domain`` for every size in ``n_sizes``.

    The domain is lowercased and its dots are removed first. N-grams are then
    listed size by size (in ``n_sizes`` order) and left to right within each
    size. A size longer than the cleaned string contributes nothing.

    Args:
        domain: domain string.
        n_sizes: iterable of n-gram lengths. The default is an immutable tuple
            so it cannot be altered between calls.

    Returns:
        list of n-gram strings.
    """
    domain = domain.lower().replace('.', '')
    return [
        domain[i:i + n]
        for n in n_sizes
        for i in range(len(domain) - n + 1)
    ]

In [None]:
# Count every character n-gram seen in the training domains. The counter is
# updated per domain instead of first materialising one list of all n-grams.
ngram_counts = Counter()
for domain in domains:
    ngram_counts.update(generate_ngrams(domain, n_sizes=[2,3,4]))

# `vocab_size` is only defined in the hyperparameter cell further down, so it
# cannot be used to cap the vocabulary here. None keeps every observed n-gram;
# set an integer to keep only the most frequent ones (this bounds the width of
# the decoder's softmax).
max_ngram_vocab = None
most_common_ngrams = [ngram for ngram, _ in ngram_counts.most_common(max_ngram_vocab)]
# Indices start at 1; index 0 is what encode_domain_ngrams uses for padding and
# for n-grams missing from this mapping.
ngram2idx = {ng: i+1 for i, ng in enumerate(most_common_ngrams)}

In [None]:
def encode_domain_ngrams(domain, ngram2idx, n_sizes=(2, 3, 4), max_len=20):
    """Encode ``domain`` as a list of exactly ``max_len`` n-gram indices.

    The domain's n-grams (from ``generate_ngrams``) are mapped through
    ``ngram2idx``; n-grams missing from the mapping become 0. The sequence is
    then truncated to ``max_len`` or right-padded with 0, so 0 stands for both
    "unknown n-gram" and "padding".

    Args:
        domain: domain string.
        ngram2idx: mapping from n-gram to integer index.
        n_sizes: iterable of n-gram lengths. The default is an immutable tuple
            so it cannot be altered between calls.
        max_len: length of the returned sequence.

    Returns:
        list of ``max_len`` integers.
    """
    ngrams = generate_ngrams(domain, n_sizes)
    seq = [ngram2idx.get(ng, 0) for ng in ngrams][:max_len]
    seq += [0] * (max_len - len(seq))
    return seq

In [None]:
# Model hyperparameters. They are defined before the encoding step because
# `max_len` is needed to build `encoded_domains` (the two cells are merged so a
# top-to-bottom run works).
max_len = 20                    # n-gram indices kept per domain (pad/truncate length)
vocab_size = len(ngram2idx)     # number of indexed n-grams; index 0 is extra (padding/unknown)
embed_dim = 32
latent_dim = 64

encoded_domains = np.array([encode_domain_ngrams(d, ngram2idx, max_len=max_len) for d in domains])

In [None]:
encoded_domains

### -------------------------------
### Sampling layer and KL Divergence layer
### -------------------------------

In [None]:

class Sampling(Layer):
    """Reparameterization step: returns ``z_mean + exp(0.5 * z_log_var) * eps``.

    ``eps`` is standard-normal noise drawn with the same shape as ``z_mean``.
    ``inputs`` is the pair ``(z_mean, z_log_var)``.
    """

    def call(self, inputs):
        mean, log_variance = inputs
        noise = tf.random.normal(shape=tf.shape(mean))
        std_dev = tf.exp(0.5 * log_variance)
        return mean + std_dev * noise


class KLLossLayer(Layer):
    """Pass-through layer that registers a KL-divergence term via ``add_loss``.

    ``inputs`` is the pair ``(z_mean, z_log_var)``; it is returned unchanged.
    """

    def call(self, inputs):
        mean, log_variance = inputs
        # Per-dimension term, summed over axis 1 and averaged over the batch.
        per_dim = 1 + log_variance - tf.square(mean) - tf.exp(log_variance)
        per_sample = tf.reduce_sum(per_dim, axis=1)
        self.add_loss(-0.5 * tf.reduce_mean(per_sample))
        return inputs


### VAE Model

In [None]:
from tensorflow.keras import layers, Model, Input

In [None]:


# Encoder: n-gram indices -> embeddings -> two conv/pool stages -> LSTM summary
inputs = Input(shape=(max_len,), name="encoder_input")
# `input_length` is omitted: it is deprecated in Keras 3 and the sequence
# length is already fixed by the Input layer above.
x = Embedding(input_dim=vocab_size+1, output_dim=embed_dim)(inputs)
x = Conv1D(64, 3, activation="relu", padding="same")(x)
x = layers.BatchNormalization()(x)
x = MaxPooling1D(2, padding="same")(x)

x = Conv1D(32, 3, activation="relu", padding="same")(x)
x = MaxPooling1D(2, padding="same")(x)
x = layers.BatchNormalization()(x)

x = LSTM(64)(x)
x = layers.LayerNormalization()(x)

# Parameters of the latent distribution
z_mean = Dense(latent_dim, name="z_mean")(x)
z_log_var = Dense(latent_dim, name="z_log_var")(x)

# KL Loss Layer (registers the KL term and passes both tensors through)
z_mean, z_log_var = KLLossLayer()([z_mean, z_log_var])

# Latent space sampling
z = Sampling()([z_mean, z_log_var])

# Decoder: repeat the latent vector once per output position, then decode
x = RepeatVector(max_len)(z)
x = LSTM(32, return_sequences=True)(x)
x = layers.LayerNormalization()(x)

# One softmax over the n-gram vocabulary (plus index 0) per sequence position
decoded = Conv1D(vocab_size+1, 3, activation="softmax", padding="same", name="decoder_output")(x)

vae = Model(inputs, decoded, name="vae")


In [None]:
def reconstruction_loss(y_true, y_pred):
    """Sparse categorical cross-entropy summed over axis 1, averaged over the batch.

    ``y_true`` holds integer class indices and ``y_pred`` the predicted class
    probabilities for each position.
    """
    per_position = sparse_categorical_crossentropy(y_true, y_pred)  # (batch, max_len)
    per_sequence = tf.reduce_sum(per_position, axis=1)
    return tf.reduce_mean(per_sequence)

vae.compile(optimizer="adam", loss=reconstruction_loss)
vae.summary()


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Define checkpoint callback
checkpoint_cb = ModelCheckpoint(
    filepath="/content/drive/MyDrive/MSC Work/RQ1/FastText_Model/vae_best_model.keras",   # Save best model
    monitor="loss",                    # Monitor training loss (or 'val_loss' if using validation data)
    save_best_only=True,               # Only save when model improves
    save_weights_only=False,           # Save full model (architecture + weights + optimizer)
    verbose=1
)


In [None]:
# Train with checkpoint
history = vae.fit(
    encoded_domains, encoded_domains,
    epochs=20,
    batch_size=16,
    callbacks=[checkpoint_cb]
)

In [None]:
# Plot training loss
plt.plot(history.history['loss'], label='Training Loss')
if 'val_loss' in history.history:   # if you added validation data
    plt.plot(history.history['val_loss'], label='Validation Loss')

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('VAE Training Loss')
plt.legend()
plt.show()


In [None]:
import string  # not among the notebook's top-level imports; needed for the constants below

# Character vocabulary: lowercase letters, digits and the dot
chars = list(string.ascii_lowercase + string.digits + ".")


In [None]:
chars

In [None]:
chars = ['<PAD>', '<UNK>'] + chars  # include padding and unknown
print(chars)
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = {i: c for c, i in char2idx.items()}

In [None]:
char2idx

In [None]:
idx2char

## Unsupervised

In [None]:
# Save as plain text (one domain per line)
train_df['Legitimate Domain'].to_csv("/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/domains_train.txt", index=False, header=False)
test_df['Legitimate Domain'].to_csv("/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/domains_test.txt", index=False, header=False)

In [None]:
model = fasttext.train_unsupervised(
    input="/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/domains_train.txt",
    model="skipgram",    # CBOW is also possible
    dim=100,             # embedding size
    minn=2, maxn=5       # use character n-grams (important for short text)
)

model.save_model("/content/drive/MyDrive/MSC Work/RQ1/FastText_Model/domain_embeddings.bin")

In [None]:
# Load test domains
with open("/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/domains_test.txt") as f:
    test_domains = [line.strip() for line in f]

# Get embeddings
embeddings = np.array([model.get_word_vector(d) for d in test_domains])

In [None]:
# Lowercase
Illegitimate_Domains_df['Domains'] = Illegitimate_Domains_df['Domains'].str.lower()

# Split
train2_df, test2_df = train_test_split(Illegitimate_Domains_df, test_size=0.2, random_state=42)



In [None]:
# Save as plain text (one domain per line)
train2_df['Domains'].to_csv("/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/domains2_train.txt", index=False, header=False)
test2_df['Domains'].to_csv("/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/domains2_test.txt", index=False, header=False)

In [None]:
model = fasttext.train_unsupervised(
    input="/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/domains2_train.txt",
    model="skipgram",    # CBOW is also possible
    dim=100,             # embedding size
    minn=2, maxn=5       # use character n-grams (important for short text)
)

model.save_model("/content/drive/MyDrive/MSC Work/RQ1/FastText_Model/domain2_embeddings.bin")

In [None]:
# Load test domains
with open("/content/drive/MyDrive/MSC Work/RQ1/Dataset/Domains/domains2_test.txt") as f:
    test2_domains = [line.strip() for line in f]

# Get embeddings
embeddings2 = np.array([model.get_word_vector(d) for d in test2_domains])

In [None]:
# Reduce to 2D
pca = PCA(n_components=2)
reduced = pca.fit_transform(embeddings2)

plt.figure(figsize=(10,7))
plt.scatter(reduced[:,0], reduced[:,1])

# `reduced` has one row per entry of test2_domains (embeddings2 was built from
# that list), so the annotations must come from the same list; test_domains
# belongs to the other dataset.
for i, domain in enumerate(test2_domains):
    plt.annotate(domain, (reduced[i,0], reduced[i,1]), fontsize=8, alpha=0.7)

plt.title("FastText Domain Embeddings (PCA 2D)")
plt.show()

In [None]:


# Reduce to 2D
pca = PCA(n_components=2)
reduced = pca.fit_transform(embeddings)

plt.figure(figsize=(10,7))
plt.scatter(reduced[:,0], reduced[:,1])

for i, domain in enumerate(test_domains):
    plt.annotate(domain, (reduced[i,0], reduced[i,1]), fontsize=8, alpha=0.7)

plt.title("FastText Domain Embeddings (PCA 2D)")
plt.show()
