In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.corpus import words as dictionary
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# Fetch every NLTK resource this notebook relies on (same resources, same order).
for _nltk_resource in ('wordnet', 'stopwords', 'words', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

from joblib import dump, load

In [None]:
# Shared random_state value for the stochastic steps below
# (undersampling, train/test split, random-forest fitting).
SEED = 42

In [None]:
# Read the positive and negative corpora into one-column dataframes
# (one line of the file == one document; trailing newlines are kept).

POS_PATH = 'data/pos_clean.txt'
NEG_PATH = 'data/neg_clean.txt'


def _read_corpus_lines(path):
    """Return all lines of the text file at ``path`` as a list of strings."""
    with open(path, 'r') as handle:
        return handle.readlines()


pos = _read_corpus_lines(POS_PATH)
neg = _read_corpus_lines(NEG_PATH)

df_pos = pd.DataFrame({'text': pos})
df_neg = pd.DataFrame({'text': neg})

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J/V/N/R map to adjective/verb/noun/adverb respectively;
    any other tag yields ``None``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    return None  # no matching WordNet category

lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Lower-case ``text``, POS-tag its tokens and return the lemmas joined by spaces.

    Tokens whose tag has no WordNet counterpart (see ``get_wordnet_pos``)
    are dropped from the result.
    """
    tagged_tokens = pos_tag(word_tokenize(text.lower()))
    lemmas = []
    for token, treebank_tag in tagged_tokens:
        wn_pos = get_wordnet_pos(treebank_tag)
        if wn_pos is not None:
            lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

# Lemmatization is currently switched off; only lower-casing is applied.
# df_pos['text'] = df_pos['text'].apply(lemmatize)
# df_neg['text'] = df_neg['text'].apply(lemmatize)

# label 1 = casino, label 0 = normal
for _frame, _label in ((df_pos, 1), (df_neg, 0)):
    _frame['text'] = _frame['text'].map(str.lower)
    _frame['label'] = _label

df_pos, df_neg

In [None]:
# Balance the two classes by undersampling the larger one.

def _print_class_counts(pos_frame, neg_frame):
    """Print total, positive and negative row counts."""
    print("num datapoints", len(pos_frame)+len(neg_frame))
    print("num positive datapoints", len(pos_frame))
    print("num negative datapoints", len(neg_frame))


_print_class_counts(df_pos, df_neg)

min_samples = min(len(df_pos), len(df_neg))

# Draw the same number of rows from both classes, with the shared seed.
df_pos, df_neg = [
    frame.sample(n=min_samples, random_state=SEED)
    for frame in (df_pos, df_neg)
]

print("-----------------------------------------------------------")
_print_class_counts(df_pos, df_neg)

df_pos, df_neg

In [None]:
# train/test split

# df_pos and df_neg were built independently, so their row labels overlap.
# ignore_index=True gives every document a unique label, which keeps the
# index-aligned pd.concat(axis=1) below (and any later label-based lookup)
# unambiguous. train_test_split selects rows by position, so the same rows
# end up in each partition as before.
X = pd.concat([df_pos["text"], df_neg["text"]], ignore_index=True)
y = pd.concat([df_pos["label"], df_neg["label"]], ignore_index=True)

X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Re-attach the labels to the texts of each partition.
train_df = pd.concat([X_train_df, y_train_df], axis=1)
test_df = pd.concat([X_test_df, y_test_df], axis=1)

train_df, test_df

In [None]:
# remove stopwords (only english and german ones)

stop_words = stopwords.words('english') + stopwords.words('german')
# Testing membership against the list scans every stop word for every token;
# a frozenset built once makes each lookup constant time on average.
_stop_word_set = frozenset(stop_words)

def remove_stopwords_and_split(text):
    """Split ``text`` on whitespace and drop English/German stop words.

    Parameters
    ----------
    text : str
        Document text (already lower-cased by the loading cell).

    Returns
    -------
    list of str
        The remaining tokens in their original order (duplicates kept).
    """
    return [word for word in text.split() if word not in _stop_word_set]

words_series = train_df['text'].apply(remove_stopwords_and_split)
print(words_series)
# Flatten the per-document token lists into one list for frequency counting.
all_words = [word for words_list in words_series for word in words_list]

In [None]:
# Count how often each remaining token occurs in the training texts.
word_freq_df = pd.DataFrame({'word': all_words})
word_freq = (
    word_freq_df['word']
    .value_counts()
    .rename_axis('word')
    .reset_index(name='freq')
)

word_freq

In [None]:
# Keep only tokens that are listed in the NLTK word corpus
print("num words before:", len(word_freq))
is_dictionary_word = word_freq['word'].isin(dictionary.words())
new_word_freq = word_freq.loc[is_dictionary_word]
print("num words after:", len(new_word_freq))

In [None]:
# Drop words that occur fewer than 10 times
print("num words before:", len(new_word_freq))
is_frequent_enough = new_word_freq['freq'].ge(10)
new_word_freq = new_word_freq.loc[is_frequent_enough]
print("num words after:", len(new_word_freq))

In [None]:
# Freeze the vocabulary and persist it, one word per line.
bag_of_words = new_word_freq['word'].to_list()

with open('bag_of_words.txt', 'w') as file:
    file.writelines(word + '\n' for word in bag_of_words)

In [None]:
def create_features(df, enabled=True, vocabulary=None):
    """Attach a binary bag-of-words vector to every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column holding whitespace-separable strings.
    enabled : bool, default True
        Show the tqdm progress bar when True; silence it when False.
    vocabulary : sequence of str, optional
        Words that define the feature positions. When omitted, the
        notebook-level ``bag_of_words`` list is used (original behaviour).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (modified in place) with a ``features``
        column: one list per row containing 1 where the vocabulary word
        occurs in the row's text and 0 otherwise.
    """
    vocab = bag_of_words if vocabulary is None else vocabulary
    features = []
    for text in tqdm(df['text'], disable=not enabled, total=len(df), desc="Processing rows"):
        # A set gives constant-time membership tests; the previous list made
        # every row cost O(len(vocab) * len(tokens)).
        present = set(text.split())
        features.append([1 if word in present else 0 for word in vocab])
    df['features'] = features
    return df
train_df = create_features(train_df)
test_df = create_features(test_df)
train_df

In [None]:
# Turn the per-row feature lists into dense matrices / label vectors.
X_train = np.array(train_df['features'].tolist())
y_train = np.array(train_df['label'].tolist())

X_test = np.array(test_df['features'].tolist())
y_test = np.array(test_df['label'].tolist())


# Scale features: fit on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

C = 1.0
# probability=True makes SVC run an internal, randomised cross-validation to
# calibrate predict_proba; seeding it keeps the reported probabilities
# reproducible, consistent with the seeded random forest below.
model_svc = SVC(kernel='linear', C=C, probability=True, random_state=SEED).fit(X_train, y_train)

model_log = LogisticRegression()
model_log.fit(X_train, y_train)

model_for = RandomForestClassifier(n_estimators=100, random_state=SEED)
model_for.fit(X_train, y_train)

# Evaluate model (mean accuracy on the held-out test split)

accuracy_svc = model_svc.score(X_test, y_test)
accuracy_log = model_log.score(X_test, y_test)
accuracy_for = model_for.score(X_test, y_test)
print(f"FOREST: {accuracy_for * 100:.2f}%")
print(f"LOGISTIC: {accuracy_log * 100:.2f}%")
print(f"SVM: {accuracy_svc * 100:.2f}%")

In [None]:
# save model weights
dump(model_for, 'weights_random_forest.joblib')
# The forest was fitted on StandardScaler output, so the fitted scaler has to
# travel with it: a reloaded model needs identically scaled inputs.
dump(scaler, 'feature_scaler.joblib')

### Predict

In [None]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept-Language': 'de-DE,de;q=0.9',
    'Connection': 'keep-alive'
}
# 'Accept-Language': 'en-US,de;q=0.9',

# Pages we try to download for the prediction demo.
candidate_urls = [
    "https://casino.draftkings.com/?page=1",
    "https://en.wikipedia.org/wiki/Elon_Musk",
    "https://slottyvegas.com/?page=1",
    "https://en.wikipedia.org/wiki/Casino_Royale_(2006_film)",
    "https://en.wikipedia.org/wiki/Poker",
    "https://www.lotterien.at/de",
    "https://en.wikipedia.org/wiki/Fake_News",
    "https://de.wikipedia.org/wiki/Fake_News",
    "https://en.wikipedia.org/wiki/Slot_machine",
    "https://www.gambling.net/history/",
    "https://en.wikipedia.org/wiki/Las_Vegas",
    "https://en.wikipedia.org/wiki/Pioneer_Club_Las_Vegas",
    "https://localhistories.org/history-of-gambling-how-people-started-gambling/",
    "https://eur.pokerstars.com/",
    "https://www.apa.org/monitor/2023/07/how-gambling-affects-the-brain",
    "https://www.ruetz.at/",
    "https://www.mpreis.at/vielfalt/unsere-marken/baguette",
    "https://corporate.target.com/careers",
]

# requests never times out on its own; cap each request so a single
# unresponsive host cannot hang the notebook.
REQUEST_TIMEOUT_SECONDS = 10

# `urls` and `contents` are parallel lists holding only the pages that were
# fetched successfully. Previously failed pages were skipped in `contents`
# but not in `urls`, so urls[i] no longer matched contents[i] afterwards.
urls = []
contents = []
for url in candidate_urls:
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # One unreachable site should not abort the whole batch.
        print(f"WARNING: could not fetch {url}: {exc}")
        continue
    if not response.ok:
        print(f"WARNING: skipping {url} (HTTP {response.status_code})")
        continue
    soup = BeautifulSoup(response.content, 'html.parser')
    txt = soup.get_text().replace("\n", " ")
    urls.append(url)
    contents.append(txt)

In [None]:
# Inspect the scraped page texts (one string per page whose request succeeded).
contents

In [None]:
# cont = """
# Baguette | MPREIS Order more, save more! Save up to 15 euros in the online store now Pick up a coupon Product or topic Search for products or content Search Login Favorite products PromotionsFoodDrinksDrugstoreInspirationMPREIS All products in promotion Flyer Online promotions MPREIS Specials Topics & novelties NoveltiesFill up stockUse instead of wasteEverything for Törggelen Show all Conscious nutrition Vegan and vegetarian foodGluten-free nutritionLactose-free nutritionFull Tyrol Show all Fruit ApplesBananasBerries Show all Vegetables PotatoesTomatoesCorn, Peppers & Chili Show all Milk & Eggs MilkPlant-based milk substitutesEggs Show all Meat & Sausage MeatSausageSausages Show all Frozen Ice creamPrepared mealsPizza & Baguette Show all Sweet & Salty ChocolateNibblesBiscuits & Biscuits Show all Canned food, Ready Meals & Soups Canned FoodPrepared MealsSoups Show all Bread & Pastries BreadPastriesCakes & Confectionery Show all Basic Food Cereals & MuesliPastaRice Show all To-Go SaladsSandwiches & WrapsDumplings Show all Fish Themes & Novelties Wine AdviceBeer in actionOrder more online Show all Water, Lemonades & energy drinks Mineral waterCola & lemonadesEnergy & sports drinks Show all Juices & syrups SyrupsVegetable juicesFruit juices & smoothies Show all Coffee, Tea & Cocoa CoffeeTeaCocoa Show all Beer Non-alcoholic BeerRadler Show all Wine & Sparkling Wine Sparkling Wine & Champagne Spirits WhiskyWodkaRum Show all Themes & Novelties Drugstore in actionEverything for your four-legged friendsEverything for your personal hygiene Show all Washing & cleaning agents Washing agentsCleaning agents Care Dental & oral hygiene & Oral CareHygieneFacial Care Show all Household Paper productsWaste bagsCandles Show all Pets Cat foodDog foodPet food Show all Baby DiapersBaby careBottles & Pacifiers Show all Flowers HouseplantsFlower seedsPlant accessories Show all Recipes Almond stars Railwayman's leek and cheese soup More about Recipes Stories 8. 
December open 1+1 day ski pass Axamer Lizum Advent market Bäckerei Therese Mölk More to Stories Regional Regional Tyrol - Currently in the range at MPREIS 20 years of BIO vom BERG Organic variety from South Tyrol More to Regional Recommendations for you Competition Order 4x - win a 500 EURO coupon 500 EURO coupon Recipe 1 dough 5 types of cookies About MPREIS Company & values Jobs & careers Locations & opening hours All about MPREIS Our brands Alpine butcher's shop Bakery Therese Mölk Baguette All our brands Sustainability & values Green mobility Energy Biodiversity More about Sustainability & values Recommendations for you Christmas YOU are the celebration. We take care of the enjoyment.  Brand: Bäckerei Therese Mölk THE WHOLE DAY FOR EVERY TASTE Baguette More than a bistro since 1989. Whether for coffee, breakfast, lunch, a quick snack between meals, cake or simply to buy bread - Baguette not only offers you all kinds of varied offers in over 170 branches in Tyrol, Salzburg, Vorarlberg, Carinthia and Upper Austria, but also the opportunity to take a relaxing break in our feel-good corners. \xa0 \xa0 Breakfast assortmentLunch dishesSustainabilityOpening hoursJobs Breakfast assortment From savory to sweet. Start the day with our delicious breakfast options and enjoy the second cup for free! Available daily until 11:00 am.                              
Das Kleine 1 oven-fresh pastry* 2 spreads of your choice 1 hot drink (Fairtrade coffee, organic tea or cocoa) Second cup free * except Danish pastries \xa0 Das\xa0Süße 1 oven-fresh pastry, cake or strudel\xa0 1 hot drink (Fairtrade coffee, Organic coffee, organic tea or cocoa) Second cup free of charge \xa0 The vegetarian Also available vegan 1 vegetarian / vegan snack 1 hot drink (Fairtrade coffee, organic tea or cocoa) Second cup free of charge \xa0 The savory 1\xa0piquant filled snack (meat and cheese roll from 09: 00 am) 1 hot drink (Fairtrade coffee, organic tea or cocoa) Second cup free of charge \xa0 \xa0 Das\xa0Große 2\xa0ofenfrische Kleingebäcke* 3 spreads of your choice 1 hot drink (Fairtrade coffee, organic tea or cocoa) Second cup free of charge * except Danish pastries \xa0 F
# """
# contents = [cont]
# urls = ['tst.com']



# Load the persisted vocabulary once; it is the same for every model and page.
# It is bound to the notebook-level name because create_features reads it.
with open('bag_of_words.txt', 'r') as file:
    bag_of_words = [word.strip() for word in file]

# Build one feature vector per page up front: the features do not depend on
# the model, so there is no need to recompute them inside the model loop.
page_features = []
for txt in contents:
    # Only word presence matters, so collapse the page to its unique tokens.
    txt_clean = ' '.join(set(txt.lower().split()))
    # txt_clean = lemmatize(txt_clean)

    df_predict = pd.DataFrame([txt_clean], columns=['text'])
    df_predict = create_features(df_predict, False)
    feature_vec = np.array(df_predict['features'].tolist())

    # The models were fitted on StandardScaler output (see the training
    # cell); feeding raw 0/1 vectors would put the inputs on a different
    # scale than the one the decision boundaries were learned on.
    page_features.append(scaler.transform(feature_vec))

models = [model_for, model_log, model_svc]
model_names = ["FOREST", "LOG", "SVM"]
for model_name, model in zip(model_names, models):
    print(f"Model: {model_name}")
    for url, feature_vec in zip(urls, page_features):
        # model = load('weights_random_forest.joblib')
        label = model.predict(feature_vec)
        prob = model.predict_proba(feature_vec)
        # Labels are 0/1, so the label doubles as the column index of the
        # predicted class in predict_proba's output.
        print(f"RESULT: {url}", label[0], prob[0][label[0]])
    print("--------------------------------------------------")