In [1]:
# Upgrade TensorFlow in the Colab runtime. The recorded output below shows this
# run resolved to tensorflow 2.0.0.
# NOTE(review): the version is not pinned, so a fresh run may install a
# different release than the one this notebook was executed with.
!pip install tensorflow --upgrade

Collecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/46/0f/7bd55361168bb32796b360ad15a25de6966c9c1beb58a8e30c01c8279862/tensorflow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (86.3MB)
[K     |████████████████████████████████| 86.3MB 135kB/s 
Collecting tensorflow-estimator<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/fc/08/8b927337b7019c374719145d1dceba21a8bb909b93b1ad6f8fb7d22c1ca1/tensorflow_estimator-2.0.1-py2.py3-none-any.whl (449kB)
[K     |████████████████████████████████| 450kB 59.8MB/s 
Collecting tensorboard<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/d3/9e/a48cd34dd7b672ffc227b566f7d16d63c62c58b542d54efa45848c395dd4/tensorboard-2.0.1-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 44.7MB/s 
Collecting google-auth<2,>=1.6.3
[?25l  Downloading https://files.pythonhosted.org/packages/2f/81/d1e7d9974ba7c886f6d133a8baae18cb8d92b2d09bcc4f46328306825de0/google_auth-1.7.

In [19]:
from google.colab import drive
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input, LSTM, Dropout, Embedding, Dense, Conv1D, MaxPooling1D
from tensorflow.keras.models import Model

import matplotlib.pyplot as plt
%matplotlib inline

# Mount Google Drive at /content/drive; the CSV read and the model save later
# in this notebook both use paths under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def load_tf_dataset(domains, char2idx=None, maxlen=None):
  """Encode domain names as padded integer sequences paired with their labels.

  Args:
    domains: DataFrame with a 'domain' column (strings) and a 'pred' column
      (integer class labels).
    char2idx: Optional existing character -> index mapping. Pass the mapping
      returned for the training split when encoding any other split, so the
      same character always maps to the same index. When None (the default),
      a new mapping is built from the characters present in `domains`.
    maxlen: Optional fixed sequence length. Pass the padded length of the
      training split when encoding any other split so every split has the
      same width. When None (the default), sequences are padded to the
      longest domain in `domains`.

  Returns:
    A tuple `(dataset, (char2idx, idx2char))` where `dataset` is an unbatched
    `tf.data.Dataset` yielding `(encoded_domain, label)` pairs, `char2idx` is
    the dict used for encoding and `idx2char` is a NumPy array of characters
    ordered by their index.

  Raises:
    KeyError: if a supplied `char2idx` lacks a character found in `domains`.
  """
  names = domains['domain'].to_list()

  if char2idx is None:
    vocab = sorted(set(''.join(names)))
    char2idx = {u: i for i, u in enumerate(vocab)}
  else:
    # Recover the characters ordered by index; assumes the supplied mapping
    # uses contiguous indices starting at 0, as the mapping built above does.
    vocab = sorted(char2idx, key=char2idx.get)
  idx2char = np.array(vocab)

  lines = [[char2idx[c] for c in name] for name in names]

  # NOTE(review): pad_sequences pads with 0 by default, which is also the index
  # of the first vocabulary character, so padding and that character are
  # indistinguishable to the model.
  # truncating='post' only matters when `maxlen` is given and a domain is
  # longer than it; it then keeps the start of the domain, matching the
  # 'post' padding side.
  tensor = tf.keras.preprocessing.sequence.pad_sequences(
      lines, maxlen=maxlen, padding='post', truncating='post')
  targets = np.array(domains['pred'], dtype=np.int32)

  data = tf.data.Dataset.from_tensor_slices(tensor)
  pred = tf.data.Dataset.from_tensor_slices(targets)
  dataset = tf.data.Dataset.zip((data, pred))

  return dataset, (char2idx, idx2char)

In [0]:
# Load the full domain table from the mounted Drive. Later cells slice it by
# row position into a training range and a test range.
domains = pd.read_csv('/content/drive/My Drive/domain_data.csv')

In [5]:
# Preview the first 1,500,000 rows, the same slice that is later passed to
# load_tf_dataset as the training data.
domains[:1500000]

Unnamed: 0,domain,pred
0,rewbook.com,0
1,xbdqepictom.com,1
2,sostavproduktov.ru,0
3,wepuestnessbiophysicalohax.com,1
4,v-dslr,0
...,...,...
1499995,directoalpaladar.com.mx,0
1499996,sarafimartina,0
1499997,tvjxrsensinaix.com,1
1499998,masterofdestinies.com,0


We will use the first 1.5 million domains for training and the next 100,000 as the test set; the remaining ones are left for inference, etc. The domains were already shuffled when the dataset was created, so we only need to batch them.

In [0]:
# Encode the first 1,500,000 rows as the training set and the following
# 100,000 rows as the test set.
# NOTE(review): load_tf_dataset builds its vocabulary and padded length from
# whatever frame it is given, and the mapping returned for the test split is
# discarded here. The test encoding is therefore only guaranteed to match the
# training encoding if both splits contain the same character set; the padded
# width can also differ from the training width. Confirm before relying on the
# evaluation numbers.
dataset, mappings = load_tf_dataset(domains[:1500000])
test_dataset, _ = load_tf_dataset(domains[1500000:1600000])
char2idx, idx2char = mappings

In [0]:
# Group training examples into batches of 1500, dropping any final partial batch.
# NOTE: this rebinds `dataset`, so re-running the cell batches the already
# batched dataset again; re-run the dataset creation cell first.
dataset = dataset.batch(1500, drop_remainder=True)

In [0]:
# Group test examples into batches of 1500, dropping any final partial batch.
# NOTE: this rebinds `test_dataset`, so re-running the cell batches the already
# batched dataset again; re-run the dataset creation cell first.
test_dataset = test_dataset.batch(1500, drop_remainder=True)

### Building the Model

Here, we build the model using TensorFlow's Keras API. It uses an architecture adapted from B. Yu *et al.*, 2018, which first embeds each character as a 128-dimensional vector, passes it through a 1D convolutional layer followed by max pooling, runs it through an LSTM, applies dropout, and classifies it with a single sigmoid dense layer.

The 82 in the input shape is the maximum domain length in the training data (82 characters), and the 39 is the embedding's input dimension, one more than the 38 possible characters in a domain name.

We train the model using Adam. The training cell below sets `EPOCHS = 6` (fewer than the 20 epochs originally stated here).

In [0]:
def create_model(max_len=82, vocab_size=39, embed_dim=128):
  """Build the character-level domain classifier (uncompiled).

  The network embeds each character index, applies a 1D convolution and max
  pooling, summarises the sequence with an LSTM, applies dropout, and emits a
  single sigmoid probability.

  Args:
    max_len: Length of the padded input sequences. Defaults to 82.
    vocab_size: Number of distinct input indices the embedding accepts
      (its `input_dim`). Defaults to 39.
    embed_dim: Size of each character embedding vector. Defaults to 128.

  Returns:
    A Keras `Model` mapping an int32 tensor of shape `(batch, max_len)` to a
    tensor of shape `(batch, 1)`.
  """
  domain_input = Input(shape=(max_len,), dtype='int32', name='domain_input')
  # The sequence length comes from `domain_input` and the batch size from the
  # data, so the embedding takes no `input_length` or `batch_input_shape`.
  # The hard-coded batch of 1500 previously passed here conflicted with the
  # Input layer and had no effect on the built model.
  embedding = Embedding(input_dim=vocab_size, output_dim=embed_dim)(domain_input)
  conv = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu', strides=1)(embedding)
  pool = MaxPooling1D(pool_size=2, padding='same')(conv)
  # return_sequences=False: only the final LSTM state feeds the classifier.
  lstm = LSTM(64, return_sequences=False)(pool)
  drop = Dropout(0.5)(lstm)
  output = Dense(1, activation='sigmoid')(drop)
  return Model(inputs=domain_input, outputs=output)

In [0]:
# Instantiate the classifier and configure it for binary classification:
# Adam optimizer, binary cross-entropy loss, accuracy reported as a metric.
model = create_model()
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [33]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
domain_input (InputLayer)    [(None, 82)]              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 82, 128)           4992      
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 82, 128)           49280     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 41, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65  

In [34]:
EPOCHS = 6
accuracy = []
losses = []

# fit() is called once per epoch so the metrics of each finished epoch are
# recorded before the next one starts.
for _ in range(EPOCHS):
  history = model.fit(dataset)

  # history.history[...] is a list with one value per epoch run by this fit()
  # call; extend keeps `accuracy` and `losses` as flat per-epoch lists rather
  # than lists of one-element lists.
  accuracy.extend(history.history['accuracy'])
  losses.extend(history.history['loss'])

# Training was originally run for 8 epochs, but was stopped at only 6 epochs, due to time constraints.

      9/Unknown - 12s 1s/step - loss: 0.0329 - accuracy: 0.9900

KeyboardInterrupt: ignored

In [42]:
# Evaluate on the batched test set; with the compile settings used in this
# notebook the result is [loss, accuracy].
model.evaluate(test_dataset)



[0.0321862433162151, 0.98933333]

In [0]:
# Save the trained model to the mounted Drive as an .h5 file.
model.save('/content/drive/My Drive/domain_classifier_model.h5')