In [116]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.corpus import words as dictionary
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

from joblib import dump, load

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [117]:
SEED = 42

In [142]:
# combine positives and negatives into dataframe

POS_PATH = 'data/pos_clean.txt'
NEG_PATH = 'data/neg_clean.txt'

with open(POS_PATH, 'r') as file:
    pos = file.readlines()

with open(NEG_PATH, 'r') as file:
    neg = file.readlines()

df_pos = pd.DataFrame({'text': pos})
df_neg = pd.DataFrame({'text': neg})

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None  # If no match is found

lemmatizer = WordNetLemmatizer()

# lemmatize
def lemmatize(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    lemmatized = []
    for word, tag in tags:
        tag = get_wordnet_pos(tag)
        if tag is None:
            continue
        word = lemmatizer.lemmatize(word, tag)
        lemmatized.append(word)
    text = " ".join(lemmatized)
    return text

# df_pos['text'] = df_pos['text'].apply(lemmatize)
# df_neg['text'] = df_neg['text'].apply(lemmatize)
df_pos['text'] = df_pos['text'].apply(lambda x: x.lower())
df_neg['text'] = df_neg['text'].apply(lambda x: x.lower())


df_pos['label'] = 1 # casino
df_neg['label'] = 0 # normal

df_pos, df_neg

(                                                  text  label
 0    luckyland slots play 100 slots games online fo...      1
 1    vegas wild casino 300 50 free spins welcome bo...      1
 2    toggle navigationsportsbooklive bettingcasinoh...      1
 3    highroller.com roll with us!join nowlog incasi...      1
 4    live sports betting, live streaming, casino yo...      1
 ..                                                 ...    ...
 746  mypowerplay11homeabout ushow to playfantasy po...      1
 747  online horse betting go horse betting - since ...      1
 748  online casino slots, betting, bingo more! pafl...      1
 749  online casino queenspins - winners play herepr...      1
 750  best betting sites online betting sites betpac...      1
 
 [751 rows x 2 columns],
                                                   text  label
 0    strava running cycling hiking app - train trac...      0
 1    world edition - the atlanticskip to contentsit...      0
 2    envato - top digital a

In [143]:
# make dataset balanced

print("num datapoints", len(df_pos)+len(df_neg))
print("num positive datapoints", len(df_pos))
print("num negative datapoints", len(df_neg))

min_samples = min(len(df_pos), len(df_neg))

# undersample
df_pos = df_pos.sample(n=min_samples, random_state=SEED)
df_neg = df_neg.sample(n=min_samples, random_state=SEED)

print("-----------------------------------------------------------")
print("num datapoints", len(df_pos)+len(df_neg))
print("num positive datapoints", len(df_pos))
print("num negative datapoints", len(df_neg))

df_pos, df_neg

num datapoints 1273
num positive datapoints 751
num negative datapoints 522
-----------------------------------------------------------
num datapoints 1044
num positive datapoints 522
num negative datapoints 522


(                                                  text  label
 473  is ninja casino a scam?skip to contentmenuis c...      1
 357  parimatch casino best offers pokies onlinecasi...      1
 133  казино игри ᐉ онлайн казино безплатни хазартни...      1
 250  lottohelden.de: online lotto spielen beim test...      1
 299  access restricted access restricted voodoodrea...      1
 ..                                                 ...    ...
 151  online casino enjoy up to 400 and 400 free spi...      1
 403  situs joker123 slot terbaik resmi di indonesia...      1
 706  zakłady bukmacherskie - najlepszy bukmacher on...      1
 207  online sports betting - bet on sport matches g...      1
 633  revmasters - online sportsbook and casino affi...      1
 
 [522 rows x 2 columns],
                                                   text  label
 434  xbox official site: consoles games and communi...      0
 440  the hollywood reporter movie news tv news awar...      0
 6    noticias de última hor

In [144]:
# train/test split

X = pd.concat([df_pos["text"], df_neg["text"]])
y = pd.concat([df_pos["label"], df_neg["label"]])

X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X, y, test_size=0.3, random_state=SEED)

train_df = pd.concat([X_train_df, y_train_df], axis=1)
test_df = pd.concat([X_test_df, y_test_df], axis=1)

train_df, test_df

(                                                  text  label
 48   sports betting online, your home for sportsboo...      1
 16   martha stewart recipes diy home decor crafts s...      0
 726  bets on great odds gamebookersgamebookers.com ...      1
 409  home u.s. department of labor skip to main con...      0
 333  actualités en direct et info en continu - le p...      0
 ..                                                 ...    ...
 445  spielen sie die besten online slots prime slot...      1
 278  jeux en ligne - loto-québecvoulez-vous vraimen...      1
 50   index - playojoeinloggenjetzt anmeldenpasswort...      1
 79   home livebingooffer terms:available to players...      1
 283  the keyword google product and technology news...      0
 
 [730 rows x 2 columns],
                                                   text  label
 427  huffpost - breaking news u.s. and world news h...      0
 15   play online bingo up to 200 bonus 30 spins cos...      1
 476  university of hawaii s

In [145]:
# remove stopwords (only english and german ones)

stop_words = stopwords.words('english') + stopwords.words('german')

def remove_stopwords_and_split(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

words_series = train_df['text'].apply(remove_stopwords_and_split)
print(words_series)
all_words = [word for words_list in words_series for word in words_list]

48     [sports, betting, online,, home, sportsbook, o...
16     [martha, stewart, recipes, diy, home, decor, c...
726    [bets, great, odds, gamebookersgamebookers.com...
409    [home, u.s., department, labor, skip, main, co...
333    [actualités, en, direct, et, info, en, continu...
                             ...                        
445    [spielen, besten, online, slots, prime, slotss...
278    [jeux, en, ligne, -, loto-québecvoulez-vous, v...
50     [index, -, playojoeinloggenjetzt, anmeldenpass...
79     [home, livebingooffer, terms:available, player...
283    [keyword, google, product, technology, news, s...
Name: text, Length: 730, dtype: object


In [146]:
word_freq_df = pd.DataFrame({'word': all_words})
word_freq = word_freq_df['word'].value_counts().reset_index()
word_freq.columns = ['word', 'freq']

word_freq

Unnamed: 0,word,freq
0,-,5317
1,online,3379
2,de,3301
3,casino,3190
4,new,2086
...,...,...
153276,snowboards,1
153277,ripple:,1
153278,cyrusher,1
153279,"odds,top",1


In [147]:
# Remove non-valid words
print("num words before:", len(word_freq))
new_word_freq = word_freq[word_freq['word'].isin(dictionary.words())]
print("num words after:", len(new_word_freq))

num words before: 153281
num words after: 11842


In [148]:
# Remove words with frequency less than 10
print("num words before:", len(new_word_freq))
new_word_freq = new_word_freq[new_word_freq['freq'] >= 10]
print("num words after:", len(new_word_freq))

num words before: 11842
num words after: 3413


In [149]:
bag_of_words = new_word_freq['word'].tolist()

with open('bag_of_words.txt', 'w') as file:
    for word in bag_of_words:
        file.write(word + '\n')

In [150]:
def create_features(df, enabled=True):
    features = []
    for _, row in tqdm(df.iterrows(), disable=not enabled, total=len(df), desc="Processing rows"):
        text = row['text']
        website_words = text.split()
        website_words = list(dict.fromkeys(website_words))
        feature = [1 if w in website_words else 0 for w in bag_of_words]
        features.append(feature)
    df['features'] = features
    return df
train_df = create_features(train_df)
test_df = create_features(test_df)
train_df

Processing rows: 100%|██████████| 730/730 [00:20<00:00, 35.74it/s]
Processing rows: 100%|██████████| 314/314 [00:09<00:00, 32.67it/s]


Unnamed: 0,text,label,features
48,"sports betting online, your home for sportsboo...",1,"[0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, ..."
16,martha stewart recipes diy home decor crafts s...,0,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, ..."
726,bets on great odds gamebookersgamebookers.com ...,1,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
409,home u.s. department of labor skip to main con...,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, ..."
333,actualités en direct et info en continu - le p...,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
...,...,...,...
445,spielen sie die besten online slots prime slot...,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
278,jeux en ligne - loto-québecvoulez-vous vraimen...,1,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
50,index - playojoeinloggenjetzt anmeldenpasswort...,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
79,home livebingooffer terms:available to players...,1,"[0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [152]:
X_train = np.array(train_df['features'].tolist())
y_train = np.array(train_df['label'].tolist())

X_test = np.array(test_df['features'].tolist())
y_test = np.array(test_df['label'].tolist())


# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

C = 1.0
model_svc = SVC(kernel='linear', C=C, probability=True).fit(X_train, y_train)

model_log = LogisticRegression()
model_log.fit(X_train, y_train)

model_for = RandomForestClassifier(n_estimators=100, random_state=SEED)
model_for.fit(X_train, y_train)

# Evaluate model

accuracy_svc = model_svc.score(X_test, y_test)
accuracy_log = model_log.score(X_test, y_test)
accuracy_for= model_for.score(X_test, y_test)
print(f"FOREST: {accuracy_for * 100:.2f}%")
print(f"LOGISTIC: {accuracy_log * 100:.2f}%")
print(f"SVM: {accuracy_svc * 100:.2f}%")

FOREST: 97.13%
LOGISTIC: 97.13%
SVM: 94.27%


In [154]:
# save model weights
dump(model_log, 'weights_random_forest.joblib')

['weights_random_forest.joblib']

### Predict

In [155]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive'
}

urls = ["https://casino.draftkings.com/?page=1", "https://en.wikipedia.org/wiki/Elon_Musk", "https://slottyvegas.com/?page=1", "https://en.wikipedia.org/wiki/Casino_Royale_(2006_film)", "https://en.wikipedia.org/wiki/Poker", "https://www.lotterien.at/de", "https://en.wikipedia.org/wiki/Misinformation", "https://en.wikipedia.org/wiki/Slot_machine", "https://www.gambling.net/history/", "https://en.wikipedia.org/wiki/Las_Vegas", "https://en.wikipedia.org/wiki/Pioneer_Club_Las_Vegas", "https://localhistories.org/history-of-gambling-how-people-started-gambling/", "https://eur.pokerstars.com/", "https://www.apa.org/monitor/2023/07/how-gambling-affects-the-brain"]

contents = []
for url in urls:
    response = requests.get(url, headers=headers)
    txt = ""
    if response.ok:
        soup = BeautifulSoup(response.content, 'html.parser')
        txt = soup.get_text().replace("\n", " ")
        contents.append(txt)

In [156]:
models = [model_for, model_log, model_svc]
model_names = ["FOREST", "LOG", "SVM"]
for m, model in enumerate(models):
    print(f"Model: {model_names[m]}")
    for i, txt in enumerate(contents):
        txt_clean = txt.lower()
        txt_clean = txt_clean.split()
        txt_clean = list(set(txt_clean))
        txt_clean = ' '.join(txt_clean)
        # txt_clean = lemmatize(txt_clean)

        df_predict = pd.DataFrame([txt_clean], columns=['text'])

        bag_of_words = []
        with open('bag_of_words.txt', 'r') as file:
            bag_of_words = file.readlines()
        bag_of_words = [word.strip() for word in bag_of_words]
        df_predict = create_features(df_predict, False)

        feature_vec = np.array(df_predict['features'].tolist())

        label = model.predict(feature_vec)
        prob = model.predict_proba(feature_vec)
        print(f"RESULT: {urls[i]}", label[0], prob[0][label[0]])
    print("--------------------------------------------------")

Model: FOREST
RESULT: https://casino.draftkings.com/?page=1 1 0.9376666666666666
RESULT: https://en.wikipedia.org/wiki/Elon_Musk 0 0.85
RESULT: https://slottyvegas.com/?page=1 1 0.7793333333333332
RESULT: https://en.wikipedia.org/wiki/Casino_Royale_(2006_film) 0 0.66
RESULT: https://en.wikipedia.org/wiki/Poker 1 0.55
RESULT: https://www.lotterien.at/de 1 0.872404761904762
RESULT: https://en.wikipedia.org/wiki/Misinformation 0 0.92
RESULT: https://en.wikipedia.org/wiki/Slot_machine 1 0.58
RESULT: https://www.gambling.net/history/ 1 0.71
RESULT: https://en.wikipedia.org/wiki/Las_Vegas 0 0.63
RESULT: https://en.wikipedia.org/wiki/Pioneer_Club_Las_Vegas 0 0.61
RESULT: https://localhistories.org/history-of-gambling-how-people-started-gambling/ 1 0.51
RESULT: https://eur.pokerstars.com/ 1 0.67
RESULT: https://www.apa.org/monitor/2023/07/how-gambling-affects-the-brain 0 0.59
--------------------------------------------------
Model: LOG
RESULT: https://casino.draftkings.com/?page=1 0 0.6517049