In [19]:
import json
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

In [20]:
nl_f = open('../data/nl.json')
en_f = open('../data/en.json')
nl_data = json.load(nl_f)
en_data = json.load(en_f)

In [21]:
urls_tokenized = []
excluded_keywords = ['www', 'index', 'html', 'htm', 'html', 'http', 'https']
nl_tlds = ['.nl/', '.be/', '.su/', '.aw/', '.sx/', '.cw/']
other_tlds = ['.com/', '.net/', '.org/']

for element in nl_data[0:10000]:
    url_element = element['siteUrl'] + ('' if element['siteUrl'].endswith('/') else '/')
    split = [word for word in re.split('[^a-zA-Z]', url_element) 
        if len(word) >= 2 and word not in excluded_keywords]
    split_sentence = ' '.join(split)
    custom_feat =[
        1 if any(tld in url_element for tld in nl_tlds) else 0,
        1 if any(tld in url_element for tld in other_tlds) else 0
    ]
    urls_tokenized.append([split_sentence, custom_feat, url_element, 1])

for element in en_data[0:10000]:
    url_element = element['siteUrl'] + ('' if element['siteUrl'].endswith('/') else '/')
    split = [word for word in re.split('[^a-zA-Z]', url_element) 
        if len(word) >= 2 and word not in excluded_keywords]
    split_sentence = ' '.join(split)
    custom_feat =[
        1 if any(tld in url_element for tld in nl_tlds) else 0,
        1 if any(tld in url_element for tld in other_tlds) else 0
    ]
    urls_tokenized.append([split_sentence, custom_feat, url_element, 0])

In [22]:
print(urls_tokenized[0])

['tevansleen nl', [1, 0], 'https://www.tevansleen.nl/', 1]


In [23]:
# NOTE: NOT NECESSARY SINCE TRAIN_TEST_SPLIT ALREADY SHUFFLES VALUES

#Train Test split - ONLY RUN ONCE AFTER INITIALIZATION
# random.shuffle(urls_tokenized)

In [34]:
X_tokens, X_url, y = [], [], []
for url in urls_tokenized:
    X_tokens.append(url[0])
    X_url.append(url[2])
    y.append(url[3])

print(X_tokens[0], y[0])

vectorizer = CountVectorizer()
X_tokens = vectorizer.fit_transform(X_url)

print(X_tokens)

tevansleen nl 1
  (0, 1179)	1
  (0, 2909)	1
  (0, 2476)	1
  (0, 1722)	1
  (1, 2909)	1
  (1, 1722)	1
  (1, 1178)	1
  (1, 595)	1
  (2, 1179)	1
  (2, 1722)	1
  (2, 1189)	1
  (3, 1179)	1
  (3, 2909)	1
  (3, 1722)	1
  (3, 2870)	1
  (3, 2299)	1
  (4, 1179)	1
  (4, 2909)	1
  (4, 1722)	1
  (4, 2426)	1
  (5, 1179)	1
  (5, 2909)	1
  (5, 1722)	1
  (5, 2547)	1
  (6, 1179)	1
  :	:
  (1998, 1179)	1
  (1998, 2909)	1
  (1998, 594)	1
  (1998, 1176)	1
  (1998, 2127)	1
  (1998, 1652)	1
  (1998, 1799)	1
  (1998, 27)	1
  (1998, 911)	1
  (1998, 1821)	1
  (1998, 38)	1
  (1998, 11)	1
  (1998, 2274)	1
  (1998, 978)	1
  (1998, 227)	1
  (1998, 2624)	1
  (1998, 1250)	1
  (1998, 2688)	1
  (1998, 1075)	1
  (1999, 1179)	1
  (1999, 2909)	1
  (1999, 594)	1
  (1999, 912)	1
  (1999, 674)	1
  (1999, 116)	1


In [35]:
print(vectorizer.get_feature_names_out()[100])

3000054961


In [36]:
X_token_train, X_token_test, y_token_train, y_token_test = train_test_split(X_tokens, y, test_size=0.2)

In [37]:
# ccTLD - no training necessary because it just looks at TLD:
ccTLD_y = []
for url in X_url:
    ccTLD_y.append(1 if any(tld in url for tld in nl_tlds) else 0)

print(len(ccTLD_y))

# ccTLD+ Would give the same outcome as TLD in our case because it counts generic TLD's (.com/.net/.org) as non-dutch anyway.

2000


In [38]:
print(X_token_train[0])
print(y_token_train[0])

  (0, 1179)	1
  (0, 2909)	1
  (0, 366)	1
  (0, 1353)	1
1


In [44]:
# Token features
gnb = GaussianNB()
y_pred = gnb.fit(X_token_train.toarray(), y_token_train).predict(X_token_test.toarray())

In [45]:
# Custom features

In [46]:
confusion_matrix(y, ccTLD_y)

array([[1000,    0],
       [ 209,  791]])

In [47]:
print(y_pred)

[1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 1 0 1 1 0 1 0 0 1 0 1 1 1
 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0
 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0
 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 1 0 1
 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 0 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1
 1 1 0 0 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 1
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1
 0 1 1 1 1 0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1 0 1 0]


In [48]:
confusion_matrix(y_token_test, y_pred)

array([[ 88, 108],
       [  2, 202]])

Without numeric characters: 
array([[ 94, 106],
       [  3, 197]])

With numeric characters:
array([[ 88, 108],
       [  2, 202]])