In [None]:
# Automatically reload edited modules before each cell runs, so changes to local libraries are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import sklearn.model_selection  # import train_test_split
import sklearn.metrics  # import accuracy_score
import sklearn.feature_extraction.text  # import CountVectorizer
import xgboost  # import XGBClassifier
import pandas as pd
import os
import numpy as np

import string
import re
import nltk

# Download the NLTK data packages needed by the tokenizers, the English stop-word
# list and the WordNet lemmatizer referenced in clean_text / get_sentence_lengths.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

import catboost  # import CatBoostClassifier, Pool

import nltk.tokenize  # import word_tokenize
import nltk.corpus  # import stopwords
import nltk.stem  # import WordNetLemmatizer
import nltk.corpus  # import wordnet
import nltk.tokenize  # import sent_tokenize
import statistics  # import mean
import sentence_transformers  # import SentenceTransformer

import sklearn.metrics  # import roc_auc_score

import torch
import tqdm  # import tqdm
import matplotlib  # import style

# Environment flags set before any model is loaded.
# NOTE(review): judging by the names these target TensorFlow GPU memory growth and
# Hugging Face tokenizer parallelism — confirm both are still needed here.
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import sys

# Make the project-local helper modules importable.
# NOTE(review): the first entry is an absolute, user-specific cluster path; the
# relative entry below is presumably its portable equivalent — confirm and drop one.
sys.path.append("/p/home/jusers/ehlert1/juwels/notebooks/bootcamp_testing/scripts")
sys.path.append("../../scripts")
import normalize_text_bootcamp
import utils_bootcamp
import dataset_bootcamp
import plotting

In [None]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [None]:
# Names of the CUDA devices visible to PyTorch (an empty list when there is none).
# `torch.cuda.device(i)` only builds a context-manager object whose repr is an
# opaque memory address, so query the device name instead.
[torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]

In [None]:
# Show which GPUs are exposed to this kernel. Reading the variable from Python keeps
# the cell shell-independent and makes "not set" distinguishable from an empty value.
os.environ.get("CUDA_VISIBLE_DEVICES", "<unset>")

In [None]:
# Path (relative to this notebook) of the tweets file to load; despite the name this
# points to a file, not a folder.
# On the cluster the same file is located under /p/project/training2223/a2/data/tweets/.
FOLDER_TO_TWEETS = "../../../data/tweets/tweets_2017_normalized_filtered.nc"

In [None]:
# Load the tweets dataset through the project helper.
# NOTE(review): presumably an xarray.Dataset (later cells call `.sel` and `.to_pandas`) — confirm.
ds_tweets = dataset_bootcamp.load_tweets_dataset(FOLDER_TO_TWEETS)

In [None]:
# Keep only the first 10_001 values (index slice 0..10_000) to iterate more quickly,
# then reset the index coordinate with the project helper.
ds_sel = dataset_bootcamp.reset_index_coordinate(ds_tweets.sel(index=slice(0, 10_000)))

In [None]:
# Histogram of the `raining` label to check the class balance of the selected subset.
# Author's observation from the rendered plot: the dataset seems balanced.
ds_sel.raining.plot.hist()

In [None]:
# Model input frame: only the normalized tweet text, re-indexed from 0 with the old
# index dropped. Chained so the cell stays idempotent (no in-place mutation).
df = ds_sel[["text_normalized"]].to_pandas().reset_index(drop=True)
df

In [None]:
# Number of stratified cross-validation folds (passed as `n_folds` to get_oof_classifier).
FOLDS = 5

In [None]:
# Sentence-transformer encoders available to get_embeddings.
# Maps model identifier -> (short name, number of embedding dimensions). The short name
# is used for the embedding column and the cache folder; the dimension count is used to
# name the per-dimension columns when embeddings are expanded.
STRANSFORMERS = {
    "sentence-transformers/paraphrase-mpnet-base-v2": ("mpnet", 768),
    "sentence-transformers/bert-base-wikipedia-sections-mean-tokens": (
        "wikipedia",
        768,
    ),
}

In [None]:
def get_encode_test(df):
    """Encode ``df["text_normalized"]`` with the paraphrase-mpnet-base-v2 model.

    Quick single-encoder variant of ``get_encode`` with the model name and the cache
    folder (``./hf_mpnet/``) hard-coded. Prints the device that is used.

    Args:
        df: DataFrame with a ``text_normalized`` column of strings.

    Returns:
        numpy array holding the output of ``model.encode`` for the column.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")
    model = sentence_transformers.SentenceTransformer(
        "sentence-transformers/paraphrase-mpnet-base-v2", cache_folder="./hf_mpnet/"
    )
    model.to(device)
    model.eval()
    # Pass a plain array as get_encode does: a pandas Series is indexed by label, so
    # positional lookups on it break for any index other than 0..n-1.
    return np.array(model.encode(df["text_normalized"].to_numpy()))


# get_encode(df)

In [None]:
def get_encode(df, encoder, name):
    """Encode ``df["text_normalized"]`` with the given sentence-transformer.

    Args:
        df: DataFrame with a ``text_normalized`` column of strings.
        encoder: Model identifier handed to ``SentenceTransformer``.
        name: Short name; the model is cached under ``./hf_<name>/``.

    Returns:
        numpy array holding the output of ``encode`` for the column.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
    print(f"{device} is used")

    transformer = sentence_transformers.SentenceTransformer(
        encoder,
        cache_folder=f"./hf_{name}/",
    )
    transformer.to(device)
    transformer.eval()

    sentences = df["text_normalized"].to_numpy()
    embeddings = transformer.encode(sentences)
    return np.array(embeddings)


def get_embeddings(df, emb=None, tolist=True):
    """Build a frame of sentence embeddings for every selected encoder.

    Args:
        df: DataFrame with a ``text_normalized`` column; its index is reused.
        emb: Optional collection of encoder short names (see STRANSFORMERS) to keep;
            a falsy value selects every encoder.
        tolist: When true, additionally expand each embedding into one column per
            dimension named ``<short_name>_<i>``.

    Returns:
        DataFrame indexed like ``df`` with one column per selected encoder holding the
        embedding vectors, followed by the expanded columns when ``tolist`` is set.
    """
    result = pd.DataFrame(index=df.index)

    for model_id, (short_name, n_dims) in STRANSFORMERS.items():
        wanted = (not emb) or (short_name in emb)
        if not wanted:
            continue

        # One object cell per row, each holding the full embedding vector.
        result[short_name] = list(get_encode(df, model_id, short_name))
        if not tolist:
            continue

        dim_columns = [f"{short_name}_{x}" for x in range(n_dims)]
        expanded = pd.DataFrame(
            result[short_name].tolist(),
            columns=dim_columns,
            index=result.index,
        )
        result = pd.concat([result, expanded], axis=1, copy=False, sort=False)

    return result


def clean_text(text):
    """Normalize a text: lowercase, strip punctuation and stop words, lemmatize.

    Args:
        text: Input string.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    # A {char: None} translation table deletes every ASCII punctuation character.
    table = str.maketrans(dict.fromkeys(string.punctuation))
    # Build the stop-word set and the lemmatizer once instead of once per token.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Use fully qualified names: the notebook only imports the nltk submodules, so the
    # bare names word_tokenize / stopwords / WordNetLemmatizer are not defined.
    words = nltk.tokenize.word_tokenize(text.lower().strip().translate(table))
    return " ".join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)


def get_sentence_lengths(text):
    """Return (max, min, mean) sentence length of ``text``, counted in words.

    Sentences come from NLTK's sentence tokenizer; the length of a sentence is the
    number of pieces obtained by splitting it on single spaces. The mean is rounded
    to three decimals.

    Args:
        text: Input string; empty, whitespace-only or ``None`` input is accepted.

    Returns:
        Tuple ``(max_len, min_len, mean_len)``; ``(0, 0, 0.0)`` when the text contains
        no sentence.
    """
    # Nothing to tokenize: avoid max()/min() failing on an empty sequence.
    if not text or not text.strip():
        return (0, 0, 0.0)

    # Fully qualified names: only the `nltk.tokenize` and `statistics` modules are
    # imported in this notebook, not the bare `sent_tokenize` / `mean` functions.
    lengths = [len(sentence.split(" ")) for sentence in nltk.tokenize.sent_tokenize(text)]
    if not lengths:
        return (0, 0, 0.0)

    return (max(lengths), min(lengths), round(statistics.mean(lengths), 3))


def create_features(df, text_col="excerpt", clean_col="clean_excerpt"):
    """Derive simple length-based features from a raw and a cleaned text column.

    Args:
        df: DataFrame containing the two text columns.
        text_col: Name of the raw text column (default ``"excerpt"``).
        clean_col: Name of the cleaned text column (default ``"clean_excerpt"``).

    Returns:
        DataFrame indexed like ``df`` with the columns ``text_len``, ``text_clean_len``,
        ``text_len_div`` (clean length / raw length), ``text_word_count`` (space-split
        pieces of the cleaned text) and ``max_len_sent`` / ``min_len_sent`` /
        ``avg_len_sent`` as returned by ``get_sentence_lengths`` for the raw text.
    """
    df_f = pd.DataFrame(index=df.index)
    df_f["text_len"] = df[text_col].apply(len)
    df_f["text_clean_len"] = df[clean_col].apply(len)
    df_f["text_len_div"] = df_f["text_clean_len"] / df_f["text_len"]
    df_f["text_word_count"] = df[clean_col].apply(lambda x: len(x.split(" ")))

    # result_type="expand" spreads the returned 3-tuple over the three target columns.
    df_f[["max_len_sent", "min_len_sent", "avg_len_sent"]] = df.apply(
        lambda row: get_sentence_lengths(row[text_col]), axis=1, result_type="expand"
    )

    return df_f


def getWordsFromURL(url):
    """Split a URL on runs of the separator characters ``: / ? = - & .``.

    Returns the list of pieces between separators; a leading or trailing separator
    yields an empty string at that end.
    """
    separators = r"[\:/?=\-&.]+"
    return re.split(separators, url, flags=re.UNICODE)

In [None]:
# Text-processing options handed to the CatBoost classifier as `text_processing`
# (via the `tpo` argument of get_oof_classifier).
# - one tokenizer "Sense" with lowercasing enabled, keeping Word and Number tokens
# - Word / Bigram / Trigram dictionaries, each with occurrence_lower_bound "2"
# - feature_processing entry "0": bag-of-words over the Word and the Bigram dictionary
#   NOTE(review): "0" is presumably the index of the text feature — confirm.
# The Trigram dictionary is declared but not referenced by any feature calcer here.
tpo = {
    "tokenizers": [
        {
            "tokenizer_id": "Sense",
            "separator_type": "BySense",
            "lowercasing": "True",
            "token_types": ["Word", "Number"],
        }
    ],
    "dictionaries": [
        {
            "dictionary_id": "Word",
            "token_level_type": "Word",
            "occurrence_lower_bound": "2",
        },
        {
            "dictionary_id": "Bigram",
            "token_level_type": "Word",
            "gram_order": "2",
            "occurrence_lower_bound": "2",
        },
        {
            "dictionary_id": "Trigram",
            "token_level_type": "Word",
            "gram_order": "3",
            "occurrence_lower_bound": "2",
        },
    ],
    "feature_processing": {
        "0": [
            {
                "tokenizers_names": ["Sense"],
                "dictionaries_names": ["Word"],
                "feature_calcers": ["BoW"],
            },
            {
                "tokenizers_names": ["Sense"],
                "dictionaries_names": ["Bigram"],
                "feature_calcers": ["BoW"],
            },
        ]
    },
}

# Alternative text-processing options with the same tokenizer and dictionaries as `tpo`.
# - feature_processing entry "0": BoW + BM25 over Word, BoW over Bigram and Trigram
# - feature_processing entry "1": BoW over Word and BoW over Bigram
#   NOTE(review): "0" / "1" are presumably text-feature indices — confirm.
# Not referenced by any of the visible cells.
tpo_2 = {
    "tokenizers": [
        {
            "tokenizer_id": "Sense",
            "separator_type": "BySense",
            "lowercasing": "True",
            "token_types": ["Word", "Number"],
        }
    ],
    "dictionaries": [
        {
            "dictionary_id": "Word",
            "token_level_type": "Word",
            "occurrence_lower_bound": "2",
        },
        {
            "dictionary_id": "Bigram",
            "token_level_type": "Word",
            "gram_order": "2",
            "occurrence_lower_bound": "2",
        },
        {
            "dictionary_id": "Trigram",
            "token_level_type": "Word",
            "gram_order": "3",
            "occurrence_lower_bound": "2",
        },
    ],
    "feature_processing": {
        "0": [
            {
                "tokenizers_names": ["Sense"],
                "dictionaries_names": ["Word"],
                "feature_calcers": ["BoW", "BM25"],
            },
            {
                "tokenizers_names": ["Sense"],
                "dictionaries_names": ["Bigram", "Trigram"],
                "feature_calcers": ["BoW"],
            },
        ],
        "1": [
            {
                "tokenizers_names": ["Sense"],
                "dictionaries_names": ["Word"],
                "feature_calcers": ["BoW"],
            },
            {
                "tokenizers_names": ["Sense"],
                "dictionaries_names": ["Bigram"],
                "feature_calcers": ["BoW"],
            },
        ],
    },
}

In [None]:
def fit_model_classifier(train_pool, test_pool, **kwargs):
    """Fit a CatBoostClassifier on ``train_pool``, evaluated on ``test_pool``.

    Args:
        train_pool: catboost.Pool used for training.
        test_pool: catboost.Pool used as evaluation set (drives overfitting detection
            and best-model selection).
        **kwargs: Extra ``CatBoostClassifier`` parameters. They take precedence over
            the presets below, so a preset can be overridden without raising a
            "multiple values for keyword argument" error.

    Returns:
        The value returned by ``model.fit`` (the fitted model).
    """
    params = {
        "iterations": 5000,
        "eval_metric": "AUC",
        # Stop when the eval metric has not improved for 500 iterations.
        "od_type": "Iter",
        "od_wait": 500,
        "l2_leaf_reg": 10,
        "bootstrap_type": "Bernoulli",
        "subsample": 0.7,
        "thread_count": 20,
    }
    params.update(kwargs)
    model = catboost.CatBoostClassifier(**params)

    return model.fit(train_pool, eval_set=test_pool, verbose=100, plot=False, use_best_model=True)


def get_oof_classifier(
    n_folds,
    x_train,
    y,
    embedding_features,
    cat_features,
    text_features,
    tpo,
    seeds,
    num_bins,
    emb=None,
    tolist=True,
    gpu=True,
):
    """Train CatBoost with stratified K-fold CV for several seeds and collect
    out-of-fold (OOF) class probabilities.

    Args:
        n_folds: Number of stratified folds per seed.
        x_train: DataFrame of inputs; the ``text_normalized`` column feeds the
            sentence encoders and is excluded from the model features.
        y: Label array, indexable with integer index arrays.
        embedding_features: Embedding column names for ``catboost.Pool``; when falsy,
            the raw embedding columns named in ``emb`` are dropped from the features.
        cat_features: Categorical feature spec for ``catboost.Pool``.
        text_features: Text feature spec for ``catboost.Pool``.
        tpo: Text-processing options passed to the classifier as ``text_processing``.
        seeds: Sized collection of seeds; each one drives a fold split and a model seed.
        num_bins: Number of probability columns returned by ``predict_proba``.
        emb: Optional encoder short names (see STRANSFORMERS) to embed the text with.
        tolist: Forwarded to ``get_embeddings`` (expand embeddings into columns).
        gpu: Train with ``task_type="GPU"`` when true, else ``"CPU"``.

    Returns:
        Tuple ``(oof_train, models)``: ``oof_train`` has shape ``(n_rows, num_bins)``
        and is the mean over seeds of the OOF probabilities; ``models`` maps
        ``(seed, fold_index)`` to the fitted model.
    """
    ntrain = x_train.shape[0]
    task_type = "GPU" if gpu else "CPU"

    # The encoders are pre-trained and applied row by row, so the embeddings do not
    # depend on the fold split. Compute them once for all rows instead of re-loading
    # the models and re-encoding for every seed, fold and train/validation part.
    if emb:
        features = pd.concat([x_train, get_embeddings(x_train, emb, tolist)], axis=1)
        columns = [x for x in features if x != "text_normalized"]
        if not embedding_features:
            # Raw vector columns are only usable as declared embedding features.
            for c in emb:
                columns.remove(c)
    else:
        features = x_train
        columns = [x for x in features if x != "text_normalized"]
    features = features[columns]

    oof_train = np.zeros((len(seeds), ntrain, num_bins))
    models = {}
    for iseed, seed in enumerate(seeds):
        kf = sklearn.model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        for i, (tr_i, t_i) in enumerate(kf.split(x_train, y)):
            print(f"tr_i:{tr_i}, t_i:{t_i}")
            train_pool = catboost.Pool(
                data=features.iloc[tr_i, :],
                label=y[tr_i],
                cat_features=cat_features,
                embedding_features=embedding_features,
                text_features=text_features,
            )
            valid_pool = catboost.Pool(
                data=features.iloc[t_i, :],
                label=y[t_i],
                cat_features=cat_features,
                embedding_features=embedding_features,
                text_features=text_features,
            )
            model = fit_model_classifier(
                train_pool,
                valid_pool,
                random_seed=seed,
                task_type=task_type,
                text_processing=tpo,
            )
            # Each row is predicted by the one model of this seed that did not train on it.
            oof_train[iseed, t_i, :] = model.predict_proba(valid_pool)
            models[(seed, i)] = model

    # Average the per-seed OOF probabilities.
    oof_train = oof_train.mean(axis=0)

    return oof_train, models

In [None]:
# Experiment configurations keyed by name; a "_gpu" suffix marks GPU runs.
params = {}

for gpu in [False]:
    postfix = "_gpu" if gpu else ""
    params["emb_basic_f_columns" + postfix] = dict(
        x_train=df,
        embedding_features=["mpnet", "wikipedia"],
        text_features=None,
        tpo=tpo,
        emb=["mpnet", "wikipedia"],
        tolist=False,
        gpu=gpu,
    )

# Out-of-fold training per configuration; each value is the (oof_predictions, models)
# tuple returned by get_oof_classifier.
results = {}

for k, v in params.items():
    results[k] = get_oof_classifier(
        n_folds=FOLDS,
        y=ds_sel["raining"].values,
        cat_features=None,
        seeds=[0, 42, 888],
        num_bins=2,
        **v,
    )

In [None]:
# Hard class predictions: argmax over the seed-averaged out-of-fold probabilities.
# Read from `results` explicitly — after the training cell the loop variable `v` holds
# a parameter dict rather than a result tuple, so `v[0]` fails on a fresh run.
results["emb_basic_f_columns"][0].argmax(-1)

In [None]:
# ROC AUC of the out-of-fold predictions per experiment, one column per device type.
y_true = ds_sel["raining"].values
res = {}
for k, v in results.items():
    oof_proba = v[0]
    # Score with the probability of the positive class rather than argmax'ed labels:
    # hard labels collapse the ROC curve to a single operating point.
    # NOTE(review): assumes column 1 of predict_proba is the larger label — confirm.
    auc = sklearn.metrics.roc_auc_score(y_true, oof_proba[:, 1])
    # Keys of GPU runs carry a "_gpu" suffix; strip it so CPU and GPU share a row.
    is_gpu = str(k).endswith("_gpu")
    name = k[: -len("_gpu")] if is_gpu else k
    res.setdefault(name, {})["AUC(GPU)" if is_gpu else "AUC(CPU)"] = auc
pd.DataFrame.from_dict(res, orient="index")