In [49]:
import json
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

In [50]:
# Load the nl/en JSON dumps (each entry is later read via its 'siteUrl' key).
# The context managers close both file handles as soon as parsing finishes;
# previously the handles stayed open for the lifetime of the kernel.
with open('../data/nl.json') as nl_f:
    nl_data = json.load(nl_f)
with open('../data/en.json') as en_f:
    en_data = json.load(en_f)

In [73]:
# Tokens that carry no language signal and are dropped from every URL.
excluded_keywords = ['www', 'index', 'html', 'htm', 'http', 'https']
# NOTE(review): '.su/' is in this list but '.sr/' is not - confirm '.su' is intended.
nl_tlds = ['.nl/', '.be/', '.su/', '.aw/', '.sx/', '.cw/']
other_tlds = ['.com/', '.net/', '.org/']

# Number of entries taken from the start of each language's dataset.
SAMPLES_PER_LANGUAGE = 5000


def _tokenize_url(site_url, label):
    """Turn one site URL into a ``[sentence, custom_feat, url, label]`` record.

    Args:
        site_url: Raw URL string taken from an entry's ``'siteUrl'`` field.
        label: Class label stored unchanged in the record (1 for entries
            from ``nl_data``, 0 for entries from ``en_data``).

    Returns:
        A list ``[split_sentence, custom_feat, url_element, label]`` where
        ``url_element`` is the URL with a guaranteed trailing slash,
        ``split_sentence`` is the space-joined alphabetic tokens of at least
        two characters that are not in ``excluded_keywords``, and
        ``custom_feat`` is ``[has_nl_tld, has_other_tld]`` as 0/1 flags.
    """
    # A trailing slash guarantees that a bare host such as "https://x.nl"
    # still contains the ".nl/" pattern the TLD lists look for.
    url_element = site_url if site_url.endswith('/') else site_url + '/'

    # Split on every non-letter character, so digits and punctuation act as
    # separators and never end up inside a token.
    words = [
        word for word in re.split('[^a-zA-Z]', url_element)
        if len(word) >= 2 and word not in excluded_keywords
    ]

    # NOTE(review): these are substring tests on the whole URL, so a path
    # segment ending in e.g. ".nl/" also sets the flag - confirm intended.
    custom_feat = [
        int(any(tld in url_element for tld in nl_tlds)),
        int(any(tld in url_element for tld in other_tlds)),
    ]
    return [' '.join(words), custom_feat, url_element, label]


# One shared code path for both languages (previously two copy-pasted loops):
# nl_data entries come first with label 1, en_data entries follow with label 0.
urls_tokenized = [
    _tokenize_url(element['siteUrl'], label)
    for records, label in (
        (nl_data[0:SAMPLES_PER_LANGUAGE], 1),
        (en_data[0:SAMPLES_PER_LANGUAGE], 0),
    )
    for element in records
]

In [74]:
# Sanity check: show the first record, laid out as
# [token sentence, custom features, URL with trailing slash, label].
print(urls_tokenized[0])

['tevansleen nl', [1, 0], 'https://www.tevansleen.nl/', 1]


In [75]:
# NOTE: Manual shuffling is not necessary because train_test_split already shuffles the samples by default.

# Train/test split - only run once after initialization.
# random.shuffle(urls_tokenized)

In [76]:
# Pull the columns of interest out of each record: the token sentence
# (index 0), the URL (index 2) and the label (index 3).
X_tokens = [record[0] for record in urls_tokenized]
X_url = [record[2] for record in urls_tokenized]
y = [record[3] for record in urls_tokenized]

print(X_tokens[0], y[0])

# Bag-of-words encoding of the token sentences; from here on X_tokens is the
# sparse count matrix returned by the vectorizer, not the list of sentences.
vectorizer = CountVectorizer()
X_tokens = vectorizer.fit_transform(X_tokens)

print(X_tokens)

tevansleen nl 1
  (0, 10558)	1
  (0, 7549)	1
  (1, 7549)	1
  (1, 2194)	1
  (2, 7549)	1
  (2, 5052)	1
  (3, 7549)	1
  (3, 12101)	1
  (3, 9710)	1
  (4, 7549)	1
  (4, 10274)	1
  (5, 7549)	1
  (5, 10851)	1
  (6, 7549)	1
  (6, 10551)	1
  (7, 894)	1
  (7, 2193)	1
  (8, 7549)	1
  (8, 1420)	1
  (9, 2193)	1
  (9, 9813)	1
  (10, 7549)	1
  (10, 9998)	1
  (10, 5152)	1
  (10, 4762)	1
  :	:
  (9992, 963)	1
  (9993, 2193)	1
  (9993, 8362)	1
  (9993, 11885)	1
  (9994, 2193)	1
  (9994, 433)	1
  (9994, 7781)	1
  (9994, 2603)	1
  (9995, 2193)	1
  (9995, 1314)	1
  (9995, 784)	1
  (9996, 2193)	1
  (9996, 10912)	1
  (9996, 5642)	1
  (9996, 5641)	1
  (9997, 2193)	2
  (9997, 3033)	1
  (9997, 5013)	1
  (9997, 8589)	1
  (9997, 1875)	1
  (9997, 9281)	1
  (9998, 3961)	1
  (9998, 8426)	1
  (9999, 2193)	1
  (9999, 12273)	1


In [77]:
# Spot-check a single entry (index 100) of the vocabulary learned by the vectorizer.
print(vectorizer.get_feature_names_out()[100])

actcult


In [78]:
# Hold out 20% of the samples for evaluation. The fixed random_state makes the
# shuffled split - and therefore every metric computed from it - reproducible
# across kernel restarts; without it each run yields a different split.
X_token_train, X_token_test, y_token_train, y_token_test = train_test_split(
    X_tokens, y, test_size=0.2, random_state=42
)

In [79]:
# ccTLD - no training necessary because it just looks at TLD:
ccTLD_y = []
for url in X_url:
    ccTLD_y.append(1 if any(tld in url for tld in nl_tlds) else 0)

print(len(ccTLD_y))

# ccTLD+ Would give the same outcome as TLD in our case because it counts generic TLD's (.com/.net/.org) as non-dutch anyway.

10000


In [80]:
# Inspect the first training sample: its sparse token-count row, then its label.
print(X_token_train[0])
print(y_token_train[0])

  (0, 2612)	1
  (0, 3963)	1
0


In [81]:
# Token features
gnb = GaussianNB()
y_pred = gnb.fit(X_token_train.toarray(), y_token_train).predict(X_token_test.toarray())

In [82]:
# Custom features

In [70]:
# Confusion matrix of the ccTLD baseline over all samples
# (rows: true label from y, columns: label predicted in ccTLD_y).
# NOTE(review): this cell's execution count (70) is lower than that of the
# cells above it - re-run it after the ccTLD cell to refresh the output.
confusion_matrix(y, ccTLD_y)

array([[4924,   76],
       [ 840, 4160]])

In [83]:
# Preview the labels the token-based model predicted for the test split.
print(y_pred)

[1 0 1 ... 1 1 1]


In [84]:
# Confusion matrix of the token-based GaussianNB model on the held-out test
# split (rows: true label, columns: predicted label).
confusion_matrix(y_token_test, y_pred)

array([[532, 485],
       [ 26, 957]])

Without numeric characters: 
array([[ 94, 106],
       [  3, 197]])

With numeric characters:
array([[ 88, 108],
       [  2, 202]])