In [None]:
import re
import os
import torch
import string
import random
import warnings
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from simpletransformers.classification import ClassificationModel
warnings.filterwarnings('ignore')

In [None]:
def fix_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and
    configures cuDNN for deterministic kernel selection.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Exported for child processes; the hash seed of the interpreter that is
    # already running was fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can select different kernels from run to run, so it
    # must be switched off for the deterministic flag above to be effective.
    torch.backends.cudnn.benchmark = False

# Notebook-wide seed; applied immediately so every later cell starts from a
# fixed RNG state.
SEED = 2020
fix_seed(SEED)

In [None]:
# Load the training and test tables from the CSV files under ./input.
df_train = pd.read_csv("./input/train_data.csv")
df_test = pd.read_csv("./input/test_data.csv")

In [None]:
# Preview the first rows of the training table.
df_train.head()

In [None]:
# Preview the first rows of the test table.
df_test.head()

In [None]:
# Tokens dropped by remove_stopwords: NLTK's English stop words, the single
# punctuation characters and the extra word "Subject".
stop = set(stopwords.words("english"))
punctuation = list(string.punctuation)
org_stop = ["Subject"]
add_stop = punctuation + org_stop
# remove_stopwords looks tokens up in lower case, so every entry has to be
# stored lower-cased; a capitalised "Subject" would never match anything.
stop.update(word.lower() for word in add_stop)

In [None]:
def strip_html(text):
    """Parse ``text`` with the built-in HTML parser and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()


def remove_urls(text):
    """Delete every run of non-whitespace characters that begins with ``http``."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)


def remove_stopwords(text):
    """Keep only purely alphabetic tokens whose lower-cased form is not in ``stop``.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces, preserving their original casing and order.
    """
    kept_tokens = [
        token
        for token in text.split()
        if token.isalpha() and token.lower() not in stop
    ]
    return " ".join(kept_tokens)


def denoise_text(text):
    """Clean one raw document.

    Applies, in order: HTML stripping, URL removal, then stop-word /
    non-alphabetic token removal, and returns the resulting string.
    """
    without_markup = strip_html(text)
    without_links = remove_urls(without_markup)
    return remove_stopwords(without_links)


# Clean the text column of both tables; each frame's 'contents' column is
# replaced by its denoised version.
for frame in (df_train, df_test):
    frame['contents'] = frame['contents'].apply(denoise_text)

In [None]:
# Model inputs (the cleaned text column) and the target column of the training table.
X = df_train['contents']
y = df_train['y']

In [None]:
# Pseudo-labelling: for NUM rounds, train a K-fold ensemble of bag-of-words
# Naive Bayes models on the labelled data plus the test rows pseudo-labelled in
# the previous round, then pseudo-label (with 1) every test row whose
# fold-averaged probability for the second class exceeds the threshold.
N_FOLD = 10
NUM = 10  # number of pseudo-labelling rounds
PSEUDO_THRESHOLD = 0.995  # averaged probability a test row must exceed to be pseudo-labelled

test_length = df_test.shape[0]
X_pseudo = np.copy(X)
y_pseudo = np.copy(y)
vec_count = CountVectorizer(min_df=3)
# The vocabulary is fitted on the training and test text together.
vec_count.fit(pd.concat([X, df_test.contents]))
# The test matrix depends only on the fitted vocabulary, so build it once
# instead of once per fold.
test_vec = vec_count.transform(df_test['contents'])

for num in range(NUM):
    result_spm = np.zeros(test_length)
    skf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    # Only the training part of each split is used; the held-out part is ignored.
    for train_index, _ in skf.split(X_pseudo, y_pseudo):
        X_train = X_pseudo[train_index]
        y_train = y_pseudo[train_index]

        model = MultinomialNB()
        model.fit(vec_count.transform(X_train), y_train)

        # Accumulate the fold average of the second predict_proba column.
        result_spm += model.predict_proba(test_vec)[:, 1] / N_FOLD

    spm_indices = np.where(result_spm > PSEUDO_THRESHOLD)[0]
    pseudo_spm_contents = df_test.iloc[spm_indices].contents
    # Keep the pseudo labels as a Series so that concatenating them with `y`
    # yields a one-dimensional target; mixing a Series with a DataFrame makes
    # the result depend on pandas' column-alignment rules.
    pseudo_spm_labels = pd.Series(np.ones(len(spm_indices)))
    # Each round starts again from the original labelled data and adds only
    # the rows selected in this round.
    X_pseudo = pd.concat([X, pseudo_spm_contents]).reset_index(drop=True)
    y_pseudo = pd.concat([y, pseudo_spm_labels]).values

In [None]:
# Fine-tune RoBERTa on the pseudo-labelled training set: for every seed, train
# one model per stratified fold and average the second column of the raw model
# outputs on the test set over all N_FOLD * len(SEEDS) models.
# NOTE(review): raw_outputs[:, 1] is a raw model output, not a probability;
# confirm that ranking by it alone is what is intended.
N_FOLD = 7
result = np.zeros(test_length)
# Kept separate from the scalar SEED so that re-running an earlier cell which
# passes SEED as `random_state` still receives an integer.
SEEDS = [41, 42, 43]
# `predict` is documented as taking a list of strings; convert once up front.
test_texts = df_test['contents'].tolist()

for seed in SEEDS:
    skf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=seed)
    params = {
        "output_dir": "outputs/",
        "overwrite_output_dir": True,
        "max_seq_length": 128,
        "train_batch_size": 32,
        "num_train_epochs": 3,
        "manual_seed": seed,
    }
    # Only the training part of each split is used; the held-out part is ignored.
    for train_index, _ in skf.split(X_pseudo, y_pseudo):
        X_train = X_pseudo[train_index]
        y_train = y_pseudo[train_index]
        model = ClassificationModel('roberta', 'roberta-base', args=params)
        # Named columns in (text, labels) order; labels are flattened to 1-D
        # and cast to integer class ids.
        df_train_ = pd.DataFrame({
            "text": pd.Series(X_train).reset_index(drop=True),
            "labels": np.asarray(y_train).ravel().astype(int),
        })
        model.train_model(df_train_)
        predictions, raw_outputs = model.predict(test_texts)
        result += raw_outputs[:, 1] / (N_FOLD * len(SEEDS))

In [None]:
# Label the K highest-scoring test rows as 1 and every other row as 0.
K = 17000  # number of test rows labelled positive
# Stable descending sort of the scores. Slicing clamps itself when K exceeds
# the number of rows, so no bounds check is needed, and the training target
# `y` is no longer overwritten by an intermediate array.
max_k_indices = np.argsort(-result, kind="stable")[:K]
predictions = np.zeros(len(result))
# Vectorised assignment instead of a per-row membership test.
predictions[max_k_indices] = 1

In [None]:
# Sanity check: the sum of the 0/1 predictions is the number of rows labelled 1.
print(np.sum(predictions))

In [None]:
# Submission frame: the test ids next to the 0/1 predictions in column "y".
sub = df_test[["id"]].assign(y=predictions)

In [None]:
# Write the submission file without the index column, then preview it.
sub.to_csv("submission.csv", index=False)
sub.head()